In [1]:
from numba import jit
import pandas as pd
import numpy as np

In [2]:
# Source of the BNK48 member roster (public CSV hosted on GitHub).
MEMBERS_CSV_URL = 'https://raw.githubusercontent.com/prasertcbs/basic-dataset/master/bnk48_members.csv'

# Load the roster and print its schema (column names, dtypes, non-null counts).
df = pd.read_csv(MEMBERS_CSV_URL)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     64 non-null     object
 1   dob      64 non-null     object
 2   role     64 non-null     int64 
 3   img_url  64 non-null     object
dtypes: int64(1), object(3)
memory usage: 2.1+ KB


In [3]:
# Preview the first five rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,name,dob,role,img_url
0,BAMBOO,2002-09-03,51,https://www.bnk48.com/data/Members/51/s/202008...
1,CHERPRANG,1996-05-02,4,https://www.bnk48.com/data/Members/4/s/2020080...
2,Earn,1998-12-03,55,https://www.bnk48.com/data/Members/55/s/202008...
3,Earth,2002-04-22,56,https://www.bnk48.com/data/Members/56/s/202008...
4,Eve,2003-01-22,57,https://www.bnk48.com/data/Members/57/s/202008...


In [5]:
def cal_age(df: pd.DataFrame, as_of=None) -> pd.DataFrame:
    """Return a copy of ``df`` with an integer ``age`` column (completed years).

    The age is derived by calendar comparison: the difference in years, minus
    one when the birthday has not yet occurred in the reference year. This
    replaces the previous ``days / 365`` approximation, which ignores leap days
    and therefore reports the next age a few days before the actual birthday.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a ``dob`` column that ``pd.to_datetime`` can parse.
    as_of : timestamp-like, optional
        Reference date for the calculation. Defaults to ``pd.Timestamp.now()``
        so existing callers are unaffected; pass a fixed date to get
        reproducible results.

    Returns
    -------
    pd.DataFrame
        A new frame with the additional ``age`` column; ``df`` is not modified.
    """
    ref = pd.Timestamp.now() if as_of is None else pd.Timestamp(as_of)
    dob = pd.to_datetime(df["dob"])

    # True where the birthday falls on or before the reference month/day.
    had_birthday = (dob.dt.month < ref.month) | (
        (dob.dt.month == ref.month) & (dob.dt.day <= ref.day)
    )
    age = ref.year - dob.dt.year - (~had_birthday).astype(int)
    return df.assign(age=age.astype(int))


@jit(forceobj=True)
def cal_age_numba(df: pd.DataFrame)->pd.DataFrame:
    """Numba object-mode counterpart of ``cal_age``, kept for timing comparisons.

    pandas objects cannot be typed by Numba's nopython mode, so object mode is
    requested explicitly with ``forceobj=True``. A bare ``@jit`` relies on the
    deprecated automatic fallback on older Numba releases and fails outright on
    releases where nopython is the default.

    The work is delegated to ``cal_age`` so both variants always produce the
    same ``age`` column.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a ``dob`` column.

    Returns
    -------
    pd.DataFrame
        New frame with the additional ``age`` column.
    """
    return cal_age(df)

In [6]:
# Build the working frame with the derived `age` column; the result is bound to a
# new name so the loaded `df` stays available unchanged.
processed_df = cal_age(df)

In [17]:
def cal_corr(df):
    """Return the pairwise correlation matrix of the numeric columns of ``df``.

    Non-numeric columns (strings, dates stored as objects, ...) are dropped
    explicitly before calling ``DataFrame.corr``. Relying on ``corr`` to skip
    them silently only works on older pandas; from pandas 2.0 the call raises
    ``ValueError`` when object columns are present.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; may mix numeric and non-numeric columns.

    Returns
    -------
    pd.DataFrame
        Square correlation matrix indexed and labelled by the numeric columns.
    """
    numeric = df.select_dtypes(include=["number", "bool"])
    return numeric.corr()

@jit(forceobj=True)
def cal_corr_numba(df):
    """Object-mode Numba variant of the DataFrame correlation, for benchmarking.

    ``forceobj=True`` makes the object-mode compilation explicit: pandas objects
    cannot be typed in nopython mode, so a bare ``@jit`` either warns and falls
    back (older Numba) or raises (Numba releases where nopython is the default).

    Non-numeric columns are removed before ``corr`` so the call also works on
    pandas 2.x, where ``DataFrame.corr`` no longer skips them implicitly.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; may mix numeric and non-numeric columns.

    Returns
    -------
    pd.DataFrame
        Correlation matrix of the numeric columns.
    """
    return df.select_dtypes(include=["number", "bool"]).corr()

@jit(forceobj=True)
def cal_corr_numba2(df):
    """Correlate the ``role`` and ``age`` columns with NumPy inside a jitted wrapper.

    The argument is still a DataFrame, which nopython mode cannot type, so
    object mode is requested explicitly instead of relying on the automatic
    fallback of a bare ``@jit``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with numeric ``role`` and ``age`` columns.

    Returns
    -------
    np.ndarray
        2x2 correlation-coefficient matrix from ``np.corrcoef``.
    """
    role = df.role.to_numpy()
    age = df.age.to_numpy()
    return np.corrcoef(role, age)

@jit
def cal_corr_numba3(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Return ``np.corrcoef(x, y)`` for two arrays, wrapped in Numba's ``@jit``.

    Unlike the DataFrame-based variants, this function takes the column values
    as arrays, so the caller is responsible for the ``.to_numpy()`` conversion.

    Parameters
    ----------
    x, y : np.ndarray
        Input arrays passed straight to ``np.corrcoef``.

    Returns
    -------
    np.ndarray
        Correlation-coefficient matrix of ``x`` and ``y``.
    """
    return np.corrcoef(x, y)

In [68]:
%%timeit
# Baseline: plain pandas `DataFrame.corr` with no Numba involved.
cal_corr(processed_df)

174 µs ± 5.46 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [67]:
%%timeit
cal_corr_numba(processed_df)
# Runs in Numba object mode, which takes roughly twice as long as the plain pandas call.

300 µs ± 28.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [16]:
%%timeit
# Jitted wrapper that still receives the DataFrame and extracts the columns itself.
cal_corr_numba2(processed_df)

224 µs ± 29.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [25]:
# Display the correlation matrix of role and age from the array-based version.
cal_corr_numba3(processed_df.role.to_numpy(), processed_df.age.to_numpy())

array([[ 1.        , -0.55268846],
       [-0.55268846,  1.        ]])

In [29]:
%%timeit
# This timing includes the two `.to_numpy()` conversions performed on every call.
cal_corr_numba3(processed_df.role.to_numpy(), processed_df.age.to_numpy())

107 µs ± 8.66 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [27]:
# Convert the columns to arrays once, outside the timed cell, so the next
# benchmark measures only the correlation call itself.
role_col = processed_df.role.to_numpy()
age_col = processed_df.age.to_numpy()

In [30]:
%%timeit
# Array-based version on the pre-converted arrays (no pandas work inside the loop).
cal_corr_numba3(role_col, age_col)

81.4 µs ± 2.98 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
