## 3.2 自作関数を適用した操作

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

def load_diamonds():
    """Load seaborn's 'diamonds' dataset with ordered categorical grade columns.

    The 'cut', 'color' and 'clarity' columns are converted to the pandas
    'category' dtype and then given an explicit, ordered set of categories
    (in the order listed below).

    Returns:
        The loaded dataset with the three columns converted to ordered
        categoricals; all other columns are left as loaded.
    """
    # Category order for each column, exactly as it is passed to set_categories.
    ordered_levels = {
        'cut': ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal'],
        'color': ['J', 'I', 'H', 'G', 'F', 'E', 'D'],
        'clarity': ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF'],
    }

    # Cast all three columns to 'category' in one astype call.
    diamonds = sns.load_dataset('diamonds').astype(
        {column: 'category' for column in ordered_levels}
    )

    # Replace each column's categories with the explicit ordered list.
    for column, levels in ordered_levels.items():
        diamonds[column] = diamonds[column].cat.set_categories(levels, ordered=True)

    return diamonds

# Load the diamonds dataset with 'cut', 'color' and 'clarity' as ordered categoricals.
df = load_diamonds()

# Bare last expression: the notebook renders the frame as this cell's output.
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [2]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [3]:
def mm_to_cm(size):
    """Convert a length from millimetres to centimetres.

    Args:
        size: Length in millimetres; any value that supports division by a
            float.

    Returns:
        ``size`` divided by 10.
    """
    mm_per_cm = 10.
    return size / mm_per_cm

In [4]:
# Apply mm_to_cm to every element of the 'x' and 'y' columns.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and now emits a
# FutureWarning, so use DataFrame.map when it exists and fall back to
# applymap on older pandas versions. Both apply the function element-wise.
xy = df.loc[:, ['x', 'y']]
apply_elementwise = xy.map if hasattr(xy, 'map') else xy.applymap
df.loc[:, ['x', 'y']] = apply_elementwise(mm_to_cm)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,0.395,0.398,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,0.389,0.384,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,0.405,0.407,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,0.42,0.423,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,0.434,0.435,2.75


In [5]:
# A single column selected with .loc is a Series, so the element-wise method is Series.map.
df.loc[:, 'z'] = df.loc[:, 'z'].map(mm_to_cm)   # df.loc[:, 'z'].applymap would raise an error

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,0.395,0.398,0.243
1,0.21,Premium,E,SI1,59.8,61.0,326,0.389,0.384,0.231
2,0.23,Good,E,VS1,56.9,65.0,327,0.405,0.407,0.231
3,0.29,Premium,I,VS2,62.4,58.0,334,0.42,0.423,0.263
4,0.31,Good,J,SI2,63.3,58.0,335,0.434,0.435,0.275


In [6]:
# Reload the dataset, discarding the conversions applied to df in the earlier
# cells, then divide the 'x', 'y' and 'z' columns by 10 with one vectorized
# operation instead of applying mm_to_cm element by element.
df = load_diamonds()
df[['x', 'y', 'z']] /= 10.

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,0.395,0.398,0.243
1,0.21,Premium,E,SI1,59.8,61.0,326,0.389,0.384,0.231
2,0.23,Good,E,VS1,56.9,65.0,327,0.405,0.407,0.231
3,0.29,Premium,I,VS2,62.4,58.0,334,0.42,0.423,0.263
4,0.31,Good,J,SI2,63.3,58.0,335,0.434,0.435,0.275


In [7]:
def calc_point(record):
    """Compute a point score for one record from its 'cut' and 'carat' values.

    Args:
        record: Mapping-like object (e.g. a DataFrame row) that provides the
            'cut' and 'carat' keys.

    Returns:
        - 'Premium' or 'Ideal' cut: ``(carat + 1) ** 2 - 1``
        - 'Good' or 'Very Good' cut: ``2 * carat``
        - any other cut (i.e. 'Fair'): ``log(carat + 1)``
    """
    grade = record['cut']
    weight = record['carat']

    if grade in ('Premium', 'Ideal'):
        return (weight + 1.) ** 2. - 1.

    if grade in ('Good', 'Very Good'):
        return 2. * weight

    # Remaining case: the 'Fair' cut.
    return np.log(weight + 1.)

In [8]:
# axis=1 passes each row to calc_point; the returned values are stored in the new 'point' column.
df['point'] = df.apply(calc_point, axis=1)

df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z,point
0,0.23,Ideal,E,SI2,61.5,55.0,326,0.395,0.398,0.243,0.5129
1,0.21,Premium,E,SI1,59.8,61.0,326,0.389,0.384,0.231,0.4641
2,0.23,Good,E,VS1,56.9,65.0,327,0.405,0.407,0.231,0.46
3,0.29,Premium,I,VS2,62.4,58.0,334,0.42,0.423,0.263,0.6641
4,0.31,Good,J,SI2,63.3,58.0,335,0.434,0.435,0.275,0.62
