# Chapter 7. Data Cleaning and Preparation

In [2]:
import numpy as np
import pandas as pd

In [3]:
# 使えないデータ(None,NAN)の検知
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

string_data[0]=None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [4]:
# 使えないデータの除去
string_data.dropna()

1    artichoke
3      avocado
dtype: object

In [5]:
# データの穴埋め
df = pd.DataFrame(np.random.randn(7, 3))
df[3]=None
df.fillna(0)

Unnamed: 0,0,1,2,3
0,0.593953,-1.42147,0.949889,0
1,-1.636973,0.392335,0.273322,0
2,1.051131,-0.614226,0.002865,0
3,-0.21082,0.975289,0.739398,0
4,0.556649,-0.468502,0.6637,0
5,1.595844,-1.001478,-0.987279,0
6,0.673011,-0.50364,0.745119,0


In [6]:
# collumごとに埋め方を指定 inplaceをTrueにすると、mutableな変更となる
df.fillna({3:0.5,4:1.0},inplace=True)

df 

Unnamed: 0,0,1,2,3
0,0.593953,-1.42147,0.949889,0.5
1,-1.636973,0.392335,0.273322,0.5
2,1.051131,-0.614226,0.002865,0.5
3,-0.21082,0.975289,0.739398,0.5
4,0.556649,-0.468502,0.6637,0.5
5,1.595844,-1.001478,-0.987279,0.5
6,0.673011,-0.50364,0.745119,0.5


In [7]:
# 何らかの関数で埋める値を指定
df.iloc[3:,3]=None
df[3].fillna(df[3].mean(),inplace=True)
df

Unnamed: 0,0,1,2,3
0,0.593953,-1.42147,0.949889,0.5
1,-1.636973,0.392335,0.273322,0.5
2,1.051131,-0.614226,0.002865,0.5
3,-0.21082,0.975289,0.739398,0.5
4,0.556649,-0.468502,0.6637,0.5
5,1.595844,-1.001478,-0.987279,0.5
6,0.673011,-0.50364,0.745119,0.5


In [8]:
# 重複データの除去
df=pd.DataFrame({'k1':['one','two']*3+['two'],'k2':[1, 1, 2, 3, 3, 4, 4]})
df.drop_duplicates(['k1'])

Unnamed: 0,k1,k2
0,one,1
1,two,1


In [9]:
# データのシャッフル
data=pd.DataFrame(np.arange(5*4).reshape((5,4)))
data
sample=np.random.permutation(5)
data.take(sample)

# ランダム抽出
data.sample(4)

Unnamed: 0,0,1,2,3
3,12,13,14,15
0,0,1,2,3
4,16,17,18,19
2,8,9,10,11


In [21]:
data={'key':list('bbacab'),'data':range(6)}
df=pd.DataFrame(data)
df

# ダミー変数に変換
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


In [23]:
# prefixをつけてjoin
dummpy_with_pre=pd.get_dummies(df['key'],prefix='ket')
df[['data']].join(dummpy_with_pre)

Unnamed: 0,data,ket_a,ket_b,ket_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


In [37]:
# movieデータのデータ処理
mnamses=['movie_id','title','genres']
movies=pd.read_table('./movie.dat',sep='::',header=None,names=mnamses)

# Series
type(movies['title'])
 # DataFrame
# type(movies[['title']])

movies[:10]

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [39]:
# 映画の全ジャンルを取得
all_genre=[]

for x in movies.genres:
    all_genre.extend(x.split('|'))
    
genres=pd.unique(all_genre)
genres

array(['Animation', "Children's", 'Comedy', 'Adventure', 'Fantasy',
       'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror',
       'Sci-Fi', 'Documentary', 'War', 'Musical', 'Mystery', 'Film-Noir',
       'Western'], dtype=object)

In [49]:
# ダミー変数を入れる用の行列
zero_matrix=np.zeros((len(movies),len(genres)))
dummies=pd.DataFrame(zero_matrix,columns=genres)

# 対応するジャンルがある場合1とする
for i,gen in enumerate(movies.genres):
    indces=dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i,indces]=1
    
dummies.head()

Unnamed: 0,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
movies.join(dummies.add_prefix('Genre')).head()

Unnamed: 0,movie_id,title,genres,GenreAnimation,GenreChildren's,GenreComedy,GenreAdventure,GenreFantasy,GenreRomance,GenreDrama,...,GenreCrime,GenreThriller,GenreHorror,GenreSci-Fi,GenreDocumentary,GenreWar,GenreMusical,GenreMystery,GenreFilm-Noir,GenreWestern
0,1,Toy Story (1995),Animation|Children's|Comedy,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),Comedy|Romance,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),Comedy|Drama,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),Comedy,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
# 区間ごとに応じたdummy化を行う
values=np.random.rand(10)

bins=np.arange(0,1,0.2)

pd.cut(values,bins)
pd.get_dummies(pd.cut(values,bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]"
0,1,0,0,0
1,1,0,0,0
2,0,0,0,1
3,0,1,0,0
4,0,0,1,0
5,0,0,0,0
6,1,0,0,0
7,0,0,0,1
8,0,0,0,0
9,0,0,0,1


In [7]:
# stringの処理

# splitしてwhitespaceを除去する
csvdata='a,b, v,  a, 　　fr'
list_data=[ x.strip() for x in csvdata.split(',')]

# セパレータを指定
'::'.join(list_data)

# indexは見つからないとエラーを吐くがfindは-1を返す
# csvdata.index('q')
csvdata.find('q')

# 対象charの除去
data='a,b,c,d'
data.replace(',','')

'abcd'