# Chapter 7. Data Cleaning and Preparation

In [93]:
import numpy as np
import pandas as pd

In [94]:
# Detecting unusable entries: both None and NaN count as missing.
string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado']
)

# Overwrite the first entry with None so the Series holds both kinds of missing marker.
string_data[0] = None

# Boolean mask that is True wherever the entry is missing.
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

In [95]:
# Remove the unusable (missing) entries
string_data.dropna()

1    artichoke
3      avocado
dtype: object

In [96]:
# Filling in missing values.
# Build a 7x3 frame of standard-normal draws, then add a fourth column (label 3)
# that is entirely missing. np.nan is used instead of None so the new column is
# float64 rather than object; filling an object column relies on automatic
# downcasting that pandas 2.2 deprecates.
df = pd.DataFrame(np.random.randn(7, 3))
df[3] = np.nan
# Replace every missing cell with 0. The result is only displayed; df is not reassigned.
df.fillna(0)

Unnamed: 0,0,1,2,3
0,-0.277059,0.299788,0.283063,0
1,0.083983,0.037342,-1.330558,0
2,0.583178,-0.148875,0.08943,0
3,-0.616517,-0.092739,-0.165142,0
4,-0.088827,0.491183,0.490939,0
5,-0.231683,-1.514836,-0.930902,0
6,1.325745,1.367978,-0.828439,0


In [97]:
# Per-column fill values are passed as a {column label: value} dict.
# With inplace=True the fill is applied to df itself instead of returning a new frame.
# NOTE(review): df appears to have no column labelled 4, so that entry presumably has no effect — confirm.
df.fillna(value={3: 0.5, 4: 1.0}, inplace=True)

df

Unnamed: 0,0,1,2,3
0,-0.277059,0.299788,0.283063,0.5
1,0.083983,0.037342,-1.330558,0.5
2,0.583178,-0.148875,0.08943,0.5
3,-0.616517,-0.092739,-0.165142,0.5
4,-0.088827,0.491183,0.490939,0.5
5,-0.231683,-1.514836,-0.930902,0.5
6,1.325745,1.367978,-0.828439,0.5


In [98]:
# Fill with a value computed from the data (here: the column mean).
# Blank out column 3 from the fourth row onward; rows 0-2 keep their values.
df.iloc[3:, 3] = None
# mean() skips missing entries by default, so this is the mean of the values still present.
# The filled column is assigned back to df[3] rather than calling
# fillna(..., inplace=True) on df[3]: that form is chained assignment on an
# intermediate Series, which pandas warns about and which does not update df
# when Copy-on-Write is enabled.
df[3] = df[3].fillna(df[3].mean())
df

Unnamed: 0,0,1,2,3
0,-0.277059,0.299788,0.283063,0.5
1,0.083983,0.037342,-1.330558,0.5
2,0.583178,-0.148875,0.08943,0.5
3,-0.616517,-0.092739,-0.165142,0.5
4,-0.088827,0.491183,0.490939,0.5
5,-0.231683,-1.514836,-0.930902,0.5
6,1.325745,1.367978,-0.828439,0.5


In [99]:
# Dropping duplicate rows.
# k1 alternates 'one'/'two' three times and ends with an extra 'two'.
df = pd.DataFrame({
    'k1': ['one', 'two'] * 3 + ['two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
# Duplicates are judged on k1 only; the first row seen for each k1 value is kept.
df.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2
0,one,1
1,two,1


In [100]:
# Shuffling rows.
# 5x4 frame holding the integers 0..19 in row-major order.
data = pd.DataFrame(np.arange(20).reshape(5, 4))
data
# Random ordering of the row positions 0..4; take() selects rows by position in that order.
sample = np.random.permutation(len(data))
data.take(sample)
# Note: only the last expression of a cell is rendered, so the two bare
# expressions above produce no visible output.

# Random sampling: draw 4 of the rows at random.
data.sample(n=4)

Unnamed: 0,0,1,2,3
4,16,17,18,19
0,0,1,2,3
1,4,5,6,7
3,12,13,14,15
