## 3.3 ループ処理への対応

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

def load_diamonds():
    """Load seaborn's 'diamonds' dataset with ordered categorical columns.

    The 'cut', 'color' and 'clarity' columns are converted to categoricals
    whose categories are set, in the order listed below, with
    ``ordered=True``.

    Returns:
        The resulting DataFrame.
    """
    # Category order for each column, from lowest to highest category.
    category_orders = {
        'cut': ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'],
        'color': ['J', 'I', 'H', 'G', 'F', 'E', 'D'],
        'clarity': ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'],
    }

    diamonds = sns.load_dataset('diamonds')
    diamonds = diamonds.astype({column: 'category' for column in category_orders})

    for column, categories in category_orders.items():
        diamonds[column] = diamonds[column].cat.set_categories(categories, ordered=True)

    return diamonds

# Build the DataFrame used by the iteration examples that follow.
df = load_diamonds()

# Bare expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [2]:
# iterrows() yields (index label, row as a Series) pairs.
for row_label, row in df.iterrows():
    print(f'index {row_label}:  price {row["price"]}')

    print(row[['carat', 'cut', 'color', 'clarity']], '\n')

    # Stop after the first row whose label exceeds 2 has been printed.
    if row_label > 2:
        break

index 0:  price 326
carat       0.23
cut        Ideal
color          E
clarity      SI2
Name: 0, dtype: object 

index 1:  price 326
carat         0.21
cut        Premium
color            E
clarity        SI1
Name: 1, dtype: object 

index 2:  price 327
carat      0.23
cut        Good
color         E
clarity     VS1
Name: 2, dtype: object 

index 3:  price 334
carat         0.29
cut        Premium
color            I
clarity        VS2
Name: 3, dtype: object 



In [3]:
# itertuples() yields one namedtuple per row; fields are read as attributes
# and the index label is exposed as .Index.
for row in df.itertuples():
    print(f'index {row.Index}:  price {row.price}')

    print(row.carat, row.cut, row.color, row.clarity, '\n')

    # Stop after the first row whose index label exceeds 2 has been printed.
    if row.Index > 2:
        break

index 0:  price 326
0.23 Ideal E SI2 

index 1:  price 326
0.21 Premium E SI1 

index 2:  price 327
0.23 Good E VS1 

index 3:  price 334
0.29 Premium I VS2 



In [4]:
%%timeit

# Benchmark: pure iteration cost of iterrows() with no per-row work.
for _, _ in df.iterrows():
    pass

1.44 s ± 27.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
%%timeit

# Benchmark: pure iteration cost of itertuples() with no per-row work.
for _ in df.itertuples():
    pass

39.5 ms ± 356 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [6]:
%%timeit

# Benchmark: fetch every row by position with iloc, discarding each result.
for i in range(len(df)):
    df.iloc[i]

5.04 s ± 27 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [7]:
# items() iterates column-wise, yielding (column label, column as a Series).
selected_columns = df[['price', 'carat', 'cut', 'color', 'clarity']]
for column_label, column in selected_columns.items():
    print(f'label: {column_label}')
    print(f'{column[0]}  {column[1]}  {column[2]}  {column[3]} ...\n')

# df[['price', 'carat', 'cut', 'color', 'clarity']].T.iterrows() gives the same result

label: price
326  326  327  334 ...

label: carat
0.23  0.21  0.23  0.29 ...

label: cut
Ideal  Premium  Good  Premium ...

label: color
E  E  E  I ...

label: clarity
SI2  SI1  VS1  VS2 ...



In [8]:
def calc_point(record):
    """Compute a score from a record's carat, with the formula chosen by its cut.

    Args:
        record: Object supporting ``record['cut']`` and ``record['carat']``
            (for example a row Series produced by ``iterrows()``).

    Returns:
        ``(carat + 1) ** 2 - 1`` when cut is 'Premium' or 'Ideal',
        ``2 * carat`` when cut is 'Good' or 'Very Good',
        and ``log(carat + 1)`` for any other cut.
    """
    cut_grade = record['cut']

    if cut_grade in ('Premium', 'Ideal'):
        return (record['carat'] + 1.) ** 2. - 1.

    if cut_grade in ('Good', 'Very Good'):
        return 2. * record['carat']

    # Remaining case: 'Fair'.
    return np.log(record['carat'] + 1.)

In [9]:
df = load_diamonds()

# Score every row; iterrows() yields (index, row Series) and only the row is used.
points = [calc_point(record) for _, record in df.iterrows()]
df['point'] = points

df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,point
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,0.5129
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,0.4641
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,0.4600
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,0.6641
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,0.6200
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,1.9584
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,1.4400
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,1.4000
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,2.4596


In [10]:
def calc_point2(record):
    """Compute a score from a record's carat, with the formula chosen by its cut.

    Attribute-access counterpart for records such as the tuples produced by
    ``itertuples()``.

    Args:
        record: Object exposing ``cut`` and ``carat`` attributes.

    Returns:
        ``(carat + 1) ** 2 - 1`` when cut is 'Premium' or 'Ideal',
        ``2 * carat`` when cut is 'Good' or 'Very Good',
        and ``log(carat + 1)`` for any other cut.
    """
    cut_grade = record.cut

    if cut_grade in ('Premium', 'Ideal'):
        return (record.carat + 1.) ** 2 - 1.

    if cut_grade in ('Good', 'Very Good'):
        return 2. * record.carat

    # Remaining case: 'Fair'.
    return np.log(record.carat + 1.)

In [11]:
df = load_diamonds()

# Score every row; itertuples() yields one tuple per row, read by attribute
# inside calc_point2.
points = [calc_point2(record) for record in df.itertuples()]
df['point'] = points

df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,point
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43,0.5129
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31,0.4641
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31,0.4600
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63,0.6641
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75,0.6200
...,...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50,1.9584
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61,1.4400
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56,1.4000
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74,2.4596


In [12]:
# Reload the dataset so the next timing starts from a frame without 'point'.
df = load_diamonds()

In [13]:
%%timeit

# Benchmark: row-wise apply (axis=1 passes each row to calc_point).
df['point'] = df.apply(calc_point, axis=1)

445 ms ± 17.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Reload the dataset so the next timing starts from a frame without 'point'.
df = load_diamonds()

In [15]:
%%timeit

# Benchmark: score each row via iterrows() + calc_point, then attach the column.
points = []
for _, record in df.iterrows():
    point = calc_point(record)
    points.append(point)
df['point'] = points

2.09 s ± 35.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# Reload the dataset so the next timing starts from a frame without 'point'.
df = load_diamonds()

In [17]:
%%timeit

# Benchmark: score each row via itertuples() + calc_point2, then attach the column.
points = []
for record in df.itertuples():
    point = calc_point2(record)
    points.append(point)
df['point'] = points

61.3 ms ± 499 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [18]:
# Reload the dataset so the next timing starts from a frame without 'point'.
df = load_diamonds()

In [19]:
%%timeit

# Benchmark: vectorized scoring with whole-column arithmetic and boolean masks.
df['point'] = (df['carat'] + 1.) ** 2 - 1.  # First compute every row with the Premium/Ideal formula

# Overwrite the score only at the Good/Very Good positions
good_verygood_mask = (df['cut'] == 'Good') | (df['cut'] == 'Very Good')
df.loc[good_verygood_mask, 'point'] = 2. * df['carat']

# Overwrite the score only at the Fair positions
fair_mask = df['cut'] == 'Fair'
df.loc[fair_mask, 'point'] = np.log(df['carat'] + 1.)

3.11 ms ± 36 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
