## データの前処理の仕方
- pandasでの前処理(欠損値の補間)
- sklearnのpreprocessing

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn import preprocessing
import pandas as pd
# Read the sample table; the "name" column becomes the row index.
df = pd.read_csv(
    "day8-sample.csv",
    index_col="name",
)
# A bare final expression makes the notebook render the frame.
df
# Tip: df.isnull() marks which cells are missing.

## 欠損値除去
pandasのメソッドを使う
- https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html

In [None]:
# A notebook renders only a cell's final expression, so the dropna result
# is shown explicitly; otherwise it is computed and silently discarded.
# Drop every row (axis=0) that has at least one missing value (how="any").
display(df.dropna(axis=0, how="any"))
# Replace every missing value with 0. Neither call modifies df itself;
# both return a new frame.
df.fillna(0)

In [None]:
def height_to_num(height):
    """Convert a height value to a float number of centimetres.

    Parameters
    ----------
    height : number or str
        A numeric value (returned as a float unchanged in value; NaN passes
        through), or a string such as "170cm", "1.7m" or "170". A string
        without a unit is taken to be centimetres already. Surrounding
        whitespace and letter case of the unit are ignored.

    Returns
    -------
    float
        The height in centimetres, or NaN for a blank string.

    Raises
    ------
    ValueError
        If the numeric part of a string cannot be parsed.
    TypeError
        If the value is neither a string nor convertible by ``float``.
    """
    if not isinstance(height, str):
        # Already numeric (this includes the NaN used for missing cells).
        return float(height)

    text = height.strip().lower()
    if not text:
        # A blank cell carries no information; treat it as missing.
        return float("nan")
    # Check "cm" before "m": every "...cm" string also ends with "m".
    if text.endswith("cm"):
        return float(text[:-2])
    if text.endswith("m"):
        # Metres -> centimetres.
        return float(text[:-1]) * 100
    # No unit suffix: interpret the bare number as centimetres.
    return float(text)

def weight_to_num(weight):
    """Convert a weight value to a float number of kilograms.

    Parameters
    ----------
    weight : number or str
        A numeric value (returned as a float unchanged in value; NaN passes
        through), or a string such as "60kg" or "60". Surrounding whitespace
        and letter case of the unit are ignored.

    Returns
    -------
    float
        The weight in kilograms, or NaN for a blank string.

    Raises
    ------
    ValueError
        If the numeric part of a string cannot be parsed.
    TypeError
        If the value is neither a string nor convertible by ``float``.
    """
    if not isinstance(weight, str):
        # Already numeric (this includes the NaN used for missing cells).
        return float(weight)

    text = weight.strip().lower()
    if not text:
        # A blank cell carries no information; treat it as missing.
        return float("nan")
    if text.endswith("kg"):
        # Strip the unit suffix, keeping only the numeric part.
        text = text[:-2]
    return float(text)


# Alternative: drop incomplete rows first with df.dropna(how="any").
# Normalise each unit-bearing column to plain numbers using its converter.
for column, converter in (("height", height_to_num), ("weight", weight_to_num)):
    df[column] = df[column].apply(converter)
print(df)

## 欠損値の補間
- `dropna`
- `fillna`
- `interpolate`

In [None]:
# Option 1 - drop: remove rows that contain a missing value.
# print(df.dropna(how="any"))

# Option 2 - fill: replace each missing value with its column's mean.
for column in ("height", "weight"):
    df[column] = df[column].fillna(df[column].mean())
print(df)

# Option 3 - interpolate: estimate missing values from neighbouring rows.
# print("orig\n",df.height.values)
# print("interpolate\n",df.height.interpolate(method='linear').values)
# print(df)

## スケーリング
データの大きさ、形状を調節することで、精度向上が見込めることが多い。

`sklearn.preprocessing`には、正規化を行う関数が多く実装されている。

In [None]:
# axis=0: statistics are taken within each column; axis=1: within each row.

print("ORIGINAL")
print(df)
# NOTE: DataFrame.std defaults to the sample estimate (ddof=1), whereas
# preprocessing.scale standardises with the population estimate (ddof=0),
# so the std printed here is not exactly the divisor used below.
print(
    "mean:{} std:{}".format(
        df.mean(axis=0),
        df.std(axis=0),
    )
)

# Standardise every column to zero mean and unit variance.
df_scaled = preprocessing.scale(df, axis=0)
print("SCALED")
print(df_scaled)
print(
    "mean:{} std:{}".format(
        df_scaled.mean(axis=0),
        df_scaled.std(axis=0),
    )
)

In [None]:
# StandardScaler
# Fitting stores the scaling constants, so data that arrives later can be
# transformed with the same fixed mean and scale.
scaler = preprocessing.StandardScaler()
scaler.fit(df)
# Inspect the learned constants.
print("mean:{} std:{}".format(scaler.mean_, scaler.scale_))
# Apply the scaling.
df_scaled = scaler.transform(df)
print(df_scaled)

In [None]:
# Min-max scaling: fit records each column's minimum and maximum.
scaler = preprocessing.MinMaxScaler()
scaler.fit(df)
print("max:{}, min:{}".format(scaler.data_max_, scaler.data_min_))
print("SCALED")
print(scaler.transform(df))

In [None]:
# FunctionTransformer wraps an arbitrary callable as a transformer.
from sklearn.preprocessing import FunctionTransformer

# np.log1p computes log(1 + x) element-wise.
transformer = FunctionTransformer(func=np.log1p)
transformer.transform(df)