# Pandas 速度検証

In [0]:
import pandas as pd
import numpy as np
import time

In [2]:
df = pd.DataFrame({ 'A' : np.arange(100000),
                      'B' : 1,
                      'C' : (np.arange(100000)+1)%2
                   })
df.head()

Unnamed: 0,A,B,C
0,0,1,1
1,1,1,0
2,2,1,1
3,3,1,0
4,4,1,1


## 列間の要素同士のデータ操作

### 内包表記の列比較

In [0]:
def count(a,b):
  return (np.sum([1 if str(i)==str(j) else 0 for i,j in zip(a,b)]))

In [4]:
start = time.time()
print(count(df['B'],df['C']))
elapsed_time = time.time() - start
print(elapsed_time)

50000
0.06753230094909668


### Pandasの列比較

In [5]:
start = time.time()
print((df['B']==df['C']).astype(int).sum())
elapsed_time = time.time() - start
print(elapsed_time)

50000
0.004420042037963867


### 内包表記の列間の計算

In [6]:
start = time.time()
df['D'] = [i+j for i,j in zip(df['B'],df['C'])]
print(df['D'])
elapsed_time = time.time() - start
print(elapsed_time)

0        2
1        1
2        2
3        1
4        2
        ..
99995    1
99996    2
99997    1
99998    2
99999    1
Name: D, Length: 100000, dtype: int64
0.05168890953063965


### Pandas列間の計算

In [7]:
start = time.time()
df['E'] = df['B']+df['C']
print(df['E'])
elapsed_time = time.time() - start
print(elapsed_time)

0        2
1        1
2        2
3        1
4        2
        ..
99995    1
99996    2
99997    1
99998    2
99999    1
Name: E, Length: 100000, dtype: int64
0.014134883880615234


## 要素への関数適用

### 内包表記

In [8]:
df = pd.DataFrame({ 'A' : np.arange(100000)})
start = time.time()
df['B'] = [i if i%2==0 else np.nan for i in df['A']]
elapsed_time = time.time() - start
print(elapsed_time)
df['B'].head()

0.045440673828125


0    0.0
1    NaN
2    2.0
3    NaN
4    4.0
Name: B, dtype: float64

### lambda式

In [9]:
df = pd.DataFrame({ 'A' : np.arange(100000)})
start = time.time()
df['B'] = df['A'].map(lambda x: x if x%2==0 else np.nan)
elapsed_time = time.time() - start
print(elapsed_time)
df['B'].head()

0.039142608642578125


0    0.0
1    NaN
2    2.0
3    NaN
4    4.0
Name: B, dtype: float64

## カウント

In [10]:
df = pd.DataFrame({ 'A' : np.arange(100000),
                      'B' : np.random.randint(0, 100000, (100000))
                   })
df

Unnamed: 0,A,B
0,0,64098
1,1,72369
2,2,69515
3,3,29739
4,4,21142
...,...,...
99995,99995,7862
99996,99996,93390
99997,99997,18002
99998,99998,64115


### len()+unique()

In [11]:
start = time.time()
print(len(df['B'].unique()))
elapsed_time = time.time() - start
print(elapsed_time)

63242
0.006842374801635742


### nunique()

In [12]:
start = time.time()
print(df['B'].nunique())
elapsed_time = time.time() - start
print(elapsed_time)

63242
0.00565648078918457


## その他カウント

In [13]:
df = pd.DataFrame({ 'A' : np.arange(100000),
                      'B' : 'To be or not to be that is the question'
                   })
df

Unnamed: 0,A,B
0,0,To be or not to be that is the question
1,1,To be or not to be that is the question
2,2,To be or not to be that is the question
3,3,To be or not to be that is the question
4,4,To be or not to be that is the question
...,...,...
99995,99995,To be or not to be that is the question
99996,99996,To be or not to be that is the question
99997,99997,To be or not to be that is the question
99998,99998,To be or not to be that is the question


In [0]:
import collections

In [15]:
start = time.time()
print(collections.Counter([i for j in df['B'].values.tolist() for i in j.split(" ")]))
elapsed_time = time.time() - start
print(elapsed_time)

Counter({'be': 200000, 'To': 100000, 'or': 100000, 'not': 100000, 'to': 100000, 'that': 100000, 'is': 100000, 'the': 100000, 'question': 100000})
0.2266998291015625
