# Pandas 速度検証

In [0]:
import pandas as pd
import numpy as np
import time

In [22]:
# Benchmark frame with 100000 rows:
#   A - running index 0..99999
#   B - the constant 1 (a scalar, broadcast to every row)
#   C - alternates 1, 0, 1, 0, ... so it equals B on every other row
row_ids = np.arange(100000)
df = pd.DataFrame({'A': row_ids, 'B': 1, 'C': (row_ids + 1) % 2})
df.head()

Unnamed: 0,A,B,C
0,0,1,1
1,1,1,0
2,2,1,1
3,3,1,0
4,4,1,1


## 列間の要素同士のデータ操作

### 内包表記の列比較

In [0]:
def count(a, b):
    """Count the positions at which ``a`` and ``b`` have equal string forms.

    The two iterables are walked in lockstep (stopping at the shorter one) and
    each pair is compared via ``str()``, so ``1`` and ``'1'`` match while
    ``1.0`` and ``1`` do not.

    Returns the total as produced by ``np.sum`` over the per-pair 0/1 flags.
    """
    flags = []
    for left, right in zip(a, b):
        if str(left) == str(right):
            flags.append(1)
        else:
            flags.append(0)
    return np.sum(flags)

In [26]:
# Time only the comparison itself: the result is printed after the clock has
# stopped so that console I/O does not inflate the measurement.
# perf_counter() is the clock intended for measuring short durations.
start = time.perf_counter()
match_count = count(df['B'],df['C'])
elapsed_time = time.perf_counter() - start
print(match_count)
print(elapsed_time)

50000
0.0778651237487793


### Pandasの列比較

In [36]:
# Time only the vectorized comparison: the result is printed after the clock
# has stopped so that console I/O does not dominate this very short
# measurement. perf_counter() is the clock intended for short durations.
start = time.perf_counter()
match_count = (df['B']==df['C']).astype(int).sum()
elapsed_time = time.perf_counter() - start
print(match_count)
print(elapsed_time)

50000
0.003384113311767578


## 要素への関数適用

### 内包表記

In [46]:
# Build column B with a list comprehension: keep even values, replace odd
# values with NaN. The frame is created before the clock starts so only the
# element-wise transformation is timed.
df = pd.DataFrame({ 'A' : np.arange(100000)})
# perf_counter() is monotonic and high-resolution, unlike the wall clock
# returned by time.time(), which can jump and may be coarse on some platforms.
start = time.perf_counter()
df['B'] = [i if i%2==0 else np.nan for i in df['A']]
elapsed_time = time.perf_counter() - start
print(elapsed_time)
df['B'].head()

0.04317903518676758


0    0.0
1    NaN
2    2.0
3    NaN
4    4.0
Name: B, dtype: float64

### lambda式

In [52]:
# Build column B with Series.map and a lambda: keep even values, replace odd
# values with NaN. The frame is created before the clock starts so only the
# element-wise transformation is timed.
df = pd.DataFrame({ 'A' : np.arange(100000)})
# perf_counter() is monotonic and high-resolution, unlike the wall clock
# returned by time.time(), which can jump and may be coarse on some platforms.
start = time.perf_counter()
df['B'] = df['A'].map(lambda x: x if x%2==0 else np.nan)
elapsed_time = time.perf_counter() - start
print(elapsed_time)
df['B'].head()

0.03572845458984375


0    0.0
1    NaN
2    2.0
3    NaN
4    4.0
Name: B, dtype: float64

## カウント

In [67]:
# Column B holds 100000 random integers drawn from [0, 100000).
# A locally seeded generator is used so the data, and therefore the number of
# distinct values in B, is identical on every run; the global NumPy random
# state is left untouched.
random_state = np.random.RandomState(0)
df = pd.DataFrame({ 'A' : np.arange(100000),
                      'B' : random_state.randint(0, 100000, (100000))
                   })
df

Unnamed: 0,A,B
0,0,50126
1,1,37705
2,2,58348
3,3,74887
4,4,71228
...,...,...
99995,99995,55753
99996,99996,28286
99997,99997,56685
99998,99998,99578


### len()+unique()

In [68]:
# Time only len(unique()): the result is printed after the clock has stopped
# so that console I/O is not part of the measurement.
# perf_counter() is the clock intended for measuring short durations.
start = time.perf_counter()
unique_count = len(df['B'].unique())
elapsed_time = time.perf_counter() - start
print(unique_count)
print(elapsed_time)

63331
0.010328054428100586


### nunique()

In [70]:
# Time only nunique(): the result is printed after the clock has stopped so
# that console I/O is not part of the measurement.
# perf_counter() is the clock intended for measuring short durations.
start = time.perf_counter()
unique_count = df['B'].nunique()
elapsed_time = time.perf_counter() - start
print(unique_count)
print(elapsed_time)

63331
0.007768869400024414


## その他カウント

In [3]:
# Every row of B carries the same sentence (a scalar broadcast over 100000
# rows), which makes the expected per-word totals easy to check by hand.
sentence = 'To be or not to be that is the question'
df = pd.DataFrame({'A': np.arange(100000), 'B': sentence})
df

Unnamed: 0,A,B
0,0,To be or not to be that is the question
1,1,To be or not to be that is the question
2,2,To be or not to be that is the question
3,3,To be or not to be that is the question
4,4,To be or not to be that is the question
...,...,...
99995,99995,To be or not to be that is the question
99996,99996,To be or not to be that is the question
99997,99997,To be or not to be that is the question
99998,99998,To be or not to be that is the question


In [0]:
import collections

In [26]:
# Count words by passing the flattened word list straight to the Counter
# constructor. Only the split-and-count work is timed; the Counter is printed
# after the clock has stopped so that formatting and console I/O are excluded.
# perf_counter() is the clock intended for measuring short durations.
start = time.perf_counter()
word_counts = collections.Counter([i for j in df['B'].values.tolist() for i in j.split(" ")])
elapsed_time = time.perf_counter() - start
print(word_counts)
print(elapsed_time)

Counter({'be': 200000, 'To': 100000, 'or': 100000, 'not': 100000, 'to': 100000, 'that': 100000, 'is': 100000, 'the': 100000, 'question': 100000})
0.22527837753295898


In [35]:
# Count words by creating an empty Counter and feeding it the flattened word
# list through update(). Only the split-and-count work is timed; the Counter
# is printed after the clock has stopped so that formatting and console I/O
# are excluded. perf_counter() is the clock intended for short durations.
start = time.perf_counter()
c = collections.Counter()
c.update([i for j in df['B'].values.tolist() for i in j.split(" ")])
elapsed_time = time.perf_counter() - start
print(c)
print(elapsed_time)

Counter({'be': 200000, 'To': 100000, 'or': 100000, 'not': 100000, 'to': 100000, 'that': 100000, 'is': 100000, 'the': 100000, 'question': 100000})
0.21963047981262207
