In [157]:
# 数据：https://github.com/aceimnorstuvwxz/toutiao-text-classfication-dataset
!head toutiao_cat_data.txt

6551700932705387022_!_101_!_news_culture_!_京城最值得你来场文化之旅的博物馆_!_保利集团,马未都,中国科学技术馆,博物馆,新中国
6552368441838272771_!_101_!_news_culture_!_发酵床的垫料种类有哪些？哪种更好？_!_
6552407965343678723_!_101_!_news_culture_!_上联：黄山黄河黄皮肤黄土高原。怎么对下联？_!_
6552332417753940238_!_101_!_news_culture_!_林徽因什么理由拒绝了徐志摩而选择梁思成为终身伴侣？_!_
6552475601595269390_!_101_!_news_culture_!_黄杨木是什么树？_!_
6552387648126714125_!_101_!_news_culture_!_上联：草根登上星光道，怎么对下联？_!_
6552271725814350087_!_101_!_news_culture_!_什么是超写实绘画？_!_
6552452982015787268_!_101_!_news_culture_!_松涛听雨莺婉转，下联？_!_
6552400379030536455_!_101_!_news_culture_!_上联：老子骑牛读书，下联怎么对？_!_
6552339283632455939_!_101_!_news_culture_!_上联：山水醉人何须酒。如何对下联？_!_


In [158]:
import pandas as pd
import jieba
jieba.initialize()

In [139]:
def get_data(file: str = "./toutiao_cat_data.txt", n: int = 10000, random_state=None):
    """Load the Toutiao title dataset and return a random sample of rows.

    Parameters
    ----------
    file : str
        Path to the ``_!_``-separated text file.
    n : int
        Maximum number of rows to sample (default 10000). When fewer rows
        remain after filtering, all of them are returned in shuffled order
        instead of raising.
    random_state : optional
        Forwarded to ``DataFrame.sample`` so a sample can be reproduced.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``cate_code``, ``cate_name``, ``title``, ``keywords``
        plus ``length`` (character count of ``title``). Rows with missing
        keywords are dropped before sampling.
    """
    df = pd.read_csv(
        file,
        sep="_!_",
        names=["id", "cate_code", "cate_name", "title", "keywords"],
        engine="python",  # multi-character separators need the python engine
    )
    df = df[df["keywords"].notna()]
    # Cap the request: DataFrame.sample raises when asked for more rows than exist.
    sampled = df.sample(min(n, len(df)), random_state=random_state)
    # assign() returns a new frame, so the filtered frame is never written to.
    return sampled.assign(length=sampled["title"].map(len))

## Pandas

### 循环任务

In [140]:
def tokenize_forloop(df):
    """Tokenize each title by looking rows up positionally with ``iloc``.

    The per-row ``df.iloc[pos]["title"]`` access is kept on purpose: it is the
    access pattern this variant exists to time. The token lists are written to
    ``df`` as a ``tokens`` column and the same frame is returned.
    """
    n_rows = len(df)
    df["tokens"] = [jieba.lcut(df.iloc[pos]["title"]) for pos in range(n_rows)]
    return df

In [141]:
# Time the positional-iloc loop on a fresh sample from get_data().
df = get_data()
%time new = tokenize_forloop(df)

CPU times: user 3.06 s, sys: 220 ms, total: 3.28 s
Wall time: 3.48 s


In [142]:
def tokenize_iterrows(df):
    """Tokenize each title while walking the frame with ``DataFrame.iterrows``.

    The token lists are written to ``df`` as a ``tokens`` column and the same
    frame is returned.
    """
    # iterrows yields (index label, row Series); only the row is needed.
    df["tokens"] = [jieba.lcut(row.title) for _, row in df.iterrows()]
    return df

In [143]:
# Time the iterrows() variant on a fresh sample from get_data().
df = get_data()
%time new = tokenize_iterrows(df)

CPU times: user 2.76 s, sys: 45.3 ms, total: 2.8 s
Wall time: 3.09 s


In [144]:
def tokenize_itertuples(df):
    """Tokenize each title while walking the frame with ``DataFrame.itertuples``.

    The token lists are written to ``df`` as a ``tokens`` column and the same
    frame is returned.
    """
    # index=False: the index label is not included in the yielded tuples.
    df["tokens"] = [jieba.lcut(record.title) for record in df.itertuples(index=False)]
    return df

In [145]:
# Time the itertuples() variant on a fresh sample from get_data().
df = get_data()
%time new = tokenize_itertuples(df)

CPU times: user 1.26 s, sys: 13.8 ms, total: 1.28 s
Wall time: 1.29 s


In [146]:
# Time Series.apply calling jieba.lcut once per title; the result is stored
# as a new "tokens" column on df.
df = get_data()
%time df["tokens"] = df["title"].apply(lambda x: jieba.lcut(x))

CPU times: user 1.41 s, sys: 19.1 ms, total: 1.43 s
Wall time: 1.6 s


### 批量任务

In [100]:
def reset_len(df, series, min_len, max_len):
    """Clamp ``df["length"]`` in place to the range ``[min_len, max_len]``.

    ``series`` supplies the values the boolean masks are built from; it may be
    a pandas Series or a NumPy array. Returns ``None``.
    """
    # The second mask is built only after the first write, as in the original.
    below_min = series < min_len
    df.loc[below_min, "length"] = min_len
    above_max = series > max_len
    df.loc[above_max, "length"] = max_len

In [147]:
# Time reset_len with the masks built from the pandas Series df["length"].
# Note: reset_len writes to df, so df is modified by the timed statement.
df = get_data()
%timeit reset_len(df, df["length"], 5, 20)

982 µs ± 97.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [148]:
# Same call, but the masks are built from the underlying NumPy array (.values).
df = get_data()
%timeit reset_len(df, df["length"].values, 5, 20)

593 µs ± 38 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [153]:
def deal_length(x, min_len, max_len):
    """Clamp a single value: return ``min_len`` if ``x`` is below it,
    ``max_len`` if ``x`` is above it, otherwise ``x`` unchanged."""
    return min_len if x < min_len else (max_len if x > max_len else x)

In [155]:
# Time the element-wise alternative: Series.apply calling deal_length once per
# value, with the result assigned back to df["length"].
df = get_data()
%timeit df["length"] = df["length"].apply(lambda x: deal_length(x, 5, 20))

5.33 ms ± 533 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 加速数据任务

In [192]:
import numpy as np

### numba

In [167]:
import numba

In [206]:
def integrate(a, b, N):
    """Left Riemann sum of ``x * (x - 1)`` over ``[a, b]`` using ``N`` steps.

    Pure-Python baseline for the accelerated variants. Raises
    ``ZeroDivisionError`` when ``N`` is 0.
    """
    width = (b - a) / N
    total = 0
    for step in range(N):
        point = a + step * width
        total = total + point * (point - 1)
    return total * width

@numba.njit
def integrate_numba(a, b, N):
    """Left Riemann sum of ``x * (x - 1)`` over ``[a, b]`` using ``N`` steps,
    compiled by numba.

    ``numba.njit`` requests nopython mode explicitly. A bare ``numba.jit``
    relies on an implicit default that numba has deprecated, and on older
    releases it could silently fall back to slow object mode.
    """
    # Float accumulator: keeps a single type for `s` throughout the loop.
    s = 0.0
    dx = (b - a) / N
    for i in range(N):
        x = a + i * dx
        s += x * (x - 1)
    return s * dx

In [208]:
# With numba: the compiled integrate_numba is still called once per row through
# DataFrame.apply(axis=1) on 100,000 rows of random integers in [1, 100).
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit df.apply(lambda x: integrate_numba(*x), axis=1)

957 ms ± 86.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [207]:
# Without numba: the pure-Python integrate called once per row through
# DataFrame.apply(axis=1) on 100,000 rows of random integers in [1, 100).
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit df.apply(lambda x: integrate(*x), axis=1)

1.73 s ± 101 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [201]:
def double_every_value_nonumba(x):
    """Return ``x * 2``; plain-Python baseline for the doubling benchmarks."""
    doubled = x * 2
    return doubled

@numba.vectorize
def double_every_value_withnumba_vec(x):
    """Return ``x * 2``; wrapped by ``numba.vectorize`` with no explicit signatures."""
    doubled = x * 2
    return doubled

@numba.njit
def double_every_value_withnumba(x):
    """Return ``x * 2``, compiled by numba.

    ``numba.njit`` requests nopython mode explicitly. A bare ``numba.jit``
    relies on an implicit default that numba has deprecated.
    """
    return x * 2

In [202]:
# Series.apply with the plain-Python doubling function (one call per element).
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit df.a.apply(double_every_value_nonumba)

33.4 ms ± 2.37 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [204]:
# Series.apply with the numba-compiled function, still one call per element.
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit df.a.apply(double_every_value_withnumba)

32.8 ms ± 851 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [203]:
# Whole-column multiplication on the pandas Series.
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit df.a * 2

270 µs ± 15 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [209]:
# Whole-column multiplication on the underlying NumPy array (.values).
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit df.a.values * 2

196 µs ± 13.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [224]:
# numba.vectorize function called once with the pandas Series as argument.
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit double_every_value_withnumba_vec(df.a)

284 µs ± 53.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [227]:
# numba.jit function called once with the whole NumPy array as argument.
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit double_every_value_withnumba(df.a.values)

82.3 µs ± 3.59 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [228]:
# numba.vectorize function called once with the whole NumPy array as argument.
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit double_every_value_withnumba_vec(df.a.values)

91.7 µs ± 7.41 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


### jax

In [217]:
from jax import jit

In [221]:
@jit
def double_every_value_withjit(x):
    """Return ``x * 2``; wrapped by ``jax.jit``."""
    doubled = x * 2
    return doubled

In [223]:
# jax.jit function invoked once per element through Series.apply.
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit df.a.apply(double_every_value_withjit)

769 ms ± 63.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [222]:
# jax.jit function called once with the whole NumPy array as argument.
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit double_every_value_withjit(df.a.values)

361 µs ± 30.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Cython

In [229]:
%load_ext Cython

In [231]:
%%cython
def integrate_cython(a, b, N):
    """Left Riemann sum of ``x * (x - 1)`` over ``[a, b]`` using ``N`` steps.

    Same body as the pure-Python version with no type declarations; it sits in
    a ``%%cython`` cell, so it is compiled as-is.
    """
    step_size = (b - a) / N
    acc = 0
    for k in range(N):
        xk = a + k * step_size
        acc = acc + xk * (xk - 1)
    return acc * step_size

In [232]:
# Untyped Cython function called once per row through DataFrame.apply(axis=1).
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit df.apply(lambda x: integrate_cython(*x), axis=1)

1.48 s ± 83.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [233]:
%%cython
cpdef double integrate_cython_add_type(double a, double b, int N):
    """Left Riemann sum of ``x * (x - 1)`` over ``[a, b]`` using ``N`` steps.

    Arguments, locals and the return value are declared with C types.
    """
    cdef int step
    cdef double total, width, point
    total = 0
    width = (b - a) / N
    for step in range(N):
        point = a + step * width
        total = total + point * (point - 1)
    return total * width

In [234]:
# C-typed Cython function, still called once per row through DataFrame.apply(axis=1).
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit df.apply(lambda x: integrate_cython_add_type(*x), axis=1)

887 ms ± 76.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [238]:
%%cython
cimport numpy as np
import numpy as np
cpdef double integrate_cython_ndarray(double a, double b, int N):
    """Left Riemann sum of ``x * (x - 1)`` over ``[a, b]`` using ``N`` steps.

    C-typed kernel that ``apply_integrate_f`` calls once per row.
    """
    cdef int step
    cdef double total, width, point
    total = 0
    width = (b - a) / N
    for step in range(N):
        point = a + step * width
        total = total + point * (point - 1)
    return total * width
cpdef np.ndarray[double] apply_integrate_f(np.ndarray col_a, np.ndarray col_b, np.ndarray col_N):
    """Apply ``integrate_cython_ndarray`` row by row over three equal-length arrays.

    Parameters
    ----------
    col_a, col_b : np.ndarray
        Lower and upper integration bounds, one entry per row.
    col_N : np.ndarray
        Number of integration steps, one entry per row.

    Returns
    -------
    np.ndarray[double]
        One integral per row.

    Raises
    ------
    ValueError
        If the three arrays do not have the same length.
    """
    cdef Py_ssize_t i, n = len(col_N)
    cdef np.ndarray[double] res = np.empty(n)
    # The result buffer comes from np.empty, so every slot must be written.
    # With mismatched lengths the loop would either fail part-way or leave
    # uninitialised values in the result.
    if len(col_a) != n or len(col_b) != n:
        raise ValueError("col_a, col_b and col_N must have the same length")
    for i in range(n):
        res[i] = integrate_cython_ndarray(col_a[i], col_b[i], col_N[i])
    return res

In [245]:
# The row loop runs inside the Cython function; the three columns are passed as
# NumPy arrays and DataFrame.apply is not involved.
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit apply_integrate_f(df['a'].values, df['b'].values, df['N'].values)

21.7 ms ± 620 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### eval

In [250]:
# DataFrame.eval with an assignment expression; inplace=True writes column "c" to df.
df = pd.DataFrame(np.random.randint(1,100,size=(100000, 3)),columns=['a', 'b', 'N'])
%timeit df.eval('c = a * 2', inplace=True)

1.81 ms ± 94.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Multiple (multiprocessing / multithreading)

In [271]:
from pnlp import concurring

In [282]:
def tokenize(lst):
    """Tokenize every string in ``lst`` with ``jieba.lcut``.

    Returns a list with one token list per input string, in input order.
    """
    return [jieba.lcut(text) for text in lst]

@concurring(type="process_pool", max_workers=4)
def tokenize_mp(lst):
    """Tokenize every string in ``lst`` with ``jieba.lcut``.

    Decorated with pnlp's ``concurring`` using ``type="process_pool"`` and
    ``max_workers=4``. NOTE(review): presumably the decorator splits the input
    across worker processes; confirm against the pnlp documentation.
    """
    # jieba is imported and initialised inside the body, so this happens
    # wherever the function body actually executes.
    import jieba
    jieba.initialize()
    return [jieba.lcut(text) for text in lst]

@concurring(type="thread_pool", max_workers=4)
def tokenize_mt(lst):
    """Tokenize every string in ``lst`` with ``jieba.lcut``.

    Decorated with pnlp's ``concurring`` using ``type="thread_pool"`` and
    ``max_workers=4``. NOTE(review): presumably the decorator splits the input
    across worker threads; confirm against the pnlp documentation.
    """
    return [jieba.lcut(text) for text in lst]

In [274]:
# Baseline: one call of tokenize over the whole title column.
df = get_data()
%timeit res = tokenize(df["title"])

1.56 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [283]:
# process_pool variant; the return value is passed through list() before the
# timing ends. NOTE(review): presumably a lazy iterable, confirm against pnlp.
df = get_data()
%timeit res = list(tokenize_mp(df["title"]))

3.4 s ± 269 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [276]:
# thread_pool variant; the return value is passed through list() before the
# timing ends. NOTE(review): presumably a lazy iterable, confirm against pnlp.
df = get_data()
%timeit res = list(tokenize_mt(df["title"]))

1.52 s ± 96.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [284]:
from pandarallel import pandarallel
pandarallel.initialize()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [289]:
df = get_data()
%timeit res = df["title"].parallel_apply(lambda x: tokenize(x))

1.36 s ± 143 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [291]:
df = get_data()
%timeit res = df["title"].apply(lambda x: tokenize(x))

1.6 s ± 76.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
