# Chapter 7 - Data Cleaning and Preparation

## 7.1 Handling Missing Data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small Series of strings with a single missing entry (NaN) at position 2.
string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado']
)

In [3]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# Python's None is treated as missing too: the isnull() check in the next cell
# reports True for index 0 after this assignment.
string_data[0]=None

In [5]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### Filtering Out Missing Data

In [6]:
# Short alias for NaN, taken from the numpy module that is already imported at
# the top of the notebook, so no import statement is buried mid-notebook.
NA = np.nan

In [11]:
# Numeric Series with missing values at positions 1 and 3.
data = pd.Series([1, NA, 3.5, NA, 7])

In [10]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [12]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [13]:
# 4x3 frame mixing a complete row, partially missing rows and a fully missing row.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])

In [16]:
# With no arguments, dropna() removes every row that contains at least one
# missing value. The result is bound to a new name, so `data` itself is shown
# unchanged in the next cell.
cleaned=data.dropna()

In [17]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [18]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [21]:
# how='all' drops only the rows in which every value is missing (row 2 here).
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [22]:
data[4]=NA

In [23]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [25]:
# axis=1 applies the same rule to columns: only column 4, which is entirely NA,
# is dropped.
data.dropna(axis=1,how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [26]:
# Draw the demo values from a locally seeded generator so that
# "Restart Kernel -> Run All" rebuilds exactly the same 7x3 frame every time;
# an unseeded global np.random call would produce different numbers on each run.
# Generator.random() samples uniformly from [0, 1), like np.random.rand().
df = pd.DataFrame(np.random.default_rng(42).random((7, 3)))

In [27]:
# Mark column 1 as missing in the first four rows (positions 0-3).
df.iloc[:4,1]=NA

In [28]:
# Mark column 2 as missing in the first two rows (positions 0-1).
df.iloc[:2,2]=NA

In [29]:
df

Unnamed: 0,0,1,2
0,0.127637,,
1,0.086947,,
2,0.844006,,0.216418
3,0.886952,,0.215757
4,0.949056,0.444579,0.46044
5,0.64922,0.356151,0.06784
6,0.358804,0.72661,0.352189


In [30]:
df.dropna()

Unnamed: 0,0,1,2
4,0.949056,0.444579,0.46044
5,0.64922,0.356151,0.06784
6,0.358804,0.72661,0.352189


In [31]:
# thresh=2 keeps only the rows that have at least two non-missing values, so
# rows 0 and 1 (one valid value each) are dropped.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,0.844006,,0.216418
3,0.886952,,0.215757
4,0.949056,0.444579,0.46044
5,0.64922,0.356151,0.06784
6,0.358804,0.72661,0.352189


### Filling In Missing Data

In [32]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.127637,0.0,0.0
1,0.086947,0.0,0.0
2,0.844006,0.0,0.216418
3,0.886952,0.0,0.215757
4,0.949056,0.444579,0.46044
5,0.64922,0.356151,0.06784
6,0.358804,0.72661,0.352189


In [33]:
# A dict maps column label -> fill value: missing entries in column 1 become
# 0.5 and those in column 2 become 0.
df.fillna({1:0.5,2:0})

Unnamed: 0,0,1,2
0,0.127637,0.5,0.0
1,0.086947,0.5,0.0
2,0.844006,0.5,0.216418
3,0.886952,0.5,0.215757
4,0.949056,0.444579,0.46044
5,0.64922,0.356151,0.06784
6,0.358804,0.72661,0.352189


In [34]:
# Fill the missing values with 0 and rebind the name instead of mutating the
# frame with inplace=True: in-place mutation cannot be chained and makes the
# cell's effect depend on hidden kernel state. No `_ =` prefix is needed either,
# because an assignment displays nothing (and `_` is IPython's last-output name).
df = df.fillna(0)

In [35]:
df

Unnamed: 0,0,1,2
0,0.127637,0.0,0.0
1,0.086947,0.0,0.0
2,0.844006,0.0,0.216418
3,0.886952,0.0,0.215757
4,0.949056,0.444579,0.46044
5,0.64922,0.356151,0.06784
6,0.358804,0.72661,0.352189


In [36]:
# Draw the demo values from a locally seeded generator so that
# "Restart Kernel -> Run All" rebuilds exactly the same 6x3 frame every time;
# an unseeded global np.random call would produce different numbers on each run.
# Generator.standard_normal() samples from N(0, 1), like np.random.randn().
df = pd.DataFrame(np.random.default_rng(42).standard_normal((6, 3)))

In [37]:
df.iloc[2:,1]=NA

In [38]:
df.iloc[4:,2]=NA

In [39]:
df

Unnamed: 0,0,1,2
0,0.325057,0.095411,0.528689
1,-0.449918,-0.513062,-0.471747
2,0.823076,,-0.621266
3,-0.082537,,0.23053
4,0.659132,,
5,2.783818,,


In [40]:
# Forward-fill: each missing value takes the last valid value above it in the
# same column. DataFrame.ffill() is used because the `method` argument of
# fillna() is deprecated in pandas 2.1+.
df.ffill()

Unnamed: 0,0,1,2
0,0.325057,0.095411,0.528689
1,-0.449918,-0.513062,-0.471747
2,0.823076,-0.513062,-0.621266
3,-0.082537,-0.513062,0.23053
4,0.659132,-0.513062,0.23053
5,2.783818,-0.513062,0.23053


In [41]:
# Forward-fill at most two consecutive missing values per gap, so rows 4 and 5
# of column 1 stay NaN. DataFrame.ffill() is used because the `method` argument
# of fillna() is deprecated in pandas 2.1+.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.325057,0.095411,0.528689
1,-0.449918,-0.513062,-0.471747
2,0.823076,-0.513062,-0.621266
3,-0.082537,-0.513062,0.23053
4,0.659132,,0.23053
5,2.783818,,0.23053


In [42]:
# Float Series with missing values at positions 1 and 3.
data = pd.Series([1., NA, 3.5, NA, 7])

In [43]:
# mean() skips the missing entries, so the fill value is (1 + 3.5 + 7) / 3.
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

## 7.2 Data Transformation

### Removing Duplicates

In [44]:
# Seven rows; the last row (two, 4) repeats the row directly before it.
data = pd.DataFrame({
    'k1': ['one', 'two'] * 3 + ['two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

In [45]:
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [46]:
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [47]:
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [48]:
data['v1']=range(7)

In [50]:
# Deduplicate on column 'k1' only; the first row seen for each k1 value is kept.
data.drop_duplicates(['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


In [51]:
# keep='last' retains the last occurrence of each (k1, k2) pair, so row 6 is
# kept instead of row 5.
data.drop_duplicates(['k1','k2'],keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Transforming Data Using a Function or Mapping

In [53]:
# Meat names (with inconsistent capitalisation) and their weights in ounces.
data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
        'Bacon', 'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})

In [54]:
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [55]:
# Lookup table from (lower-case) meat name to the animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

In [56]:
lowercased=data['food'].str.lower()

In [57]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [58]:
data['animal']=lowercased.map(meat_to_animal)

In [59]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [60]:
# Same mapping done in one step with a function: each food name is lower-cased
# and looked up in meat_to_animal. The lookup uses direct dict indexing, so a
# food that is absent from the dict would raise KeyError.
data['food'].map(lambda x: meat_to_animal[x.lower()])

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

### Replacing Values

In [61]:
# Float Series containing the values -999 and -1000 that the following cells replace.
data = pd.Series(
    [1., -999., 2., -999., -1000., 3.]
)

In [62]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

In [63]:
data.replace(-999,np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

In [64]:
data.replace([-999,-1000],[np.nan,0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

In [65]:
data.replace({-999:np.nan,-1000:0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

### Renaming Axis Indexes

In [66]:
# 3x4 frame holding 0..11 with state names as row labels.
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [67]:
def transform(x):
    """Return the first four characters of ``x`` converted to upper case.

    Inputs shorter than four characters are upper-cased in full.
    """
    # A named def (rather than a lambda bound to a name) gives the function a
    # proper __name__ in tracebacks and a place for this docstring.
    return x[:4].upper()

In [68]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')