In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Data Cleaning and Preparation

- Dec. 3, 2017

# Data Cleaning and Preparation

- 80/20 Rule
- loading, cleaning, transforming, rearranging

# Data Cleaning and Preparation

- 1\. **Handling Missing Data**
- 2\. Data Transformation: Duplicates, Replacing,Permutation, Sampling,Dummy

## 1. Handling Missing Data

- NaN: Not a Number (sentinel value)
- remove: Series, DataFrame, Time Series
- fill: `fillna`

## Handling Missing Data: *Series*


In [10]:
import pandas as pd
import numpy as np

In [11]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [12]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [13]:
string_data
string_data[0] = None
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

0         None
1    artichoke
2          NaN
3      avocado
dtype: object

In [19]:
string_data.isnull()
string_data[~string_data.isnull()]

0     True
1    False
2     True
3    False
dtype: bool

1    artichoke
3      avocado
dtype: object

### Handling Missing Data


![](http://oydgk2hgw.bkt.clouddn.com/pydata-book/zq0q8.png)

### Filtering Out Missing Data: *Series*

In [30]:
from numpy import nan as NA

In [64]:
data = pd.Series([1, NA, 3.5, NA, 7])

In [65]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [66]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

### Filtering Out Missing Data: *DataFrame*

In [105]:
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA],
                     [NA, NA, NA], [NA, 6.5, 3.]])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [106]:
cleaned = data.dropna()

In [107]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [108]:
help(pd.DataFrame.dropna)
data.dropna(how='all')

Help on function dropna in module pandas.core.frame:

dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False)
    Return object with labels on given axis omitted where alternately any
    or all of the data are missing
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, or tuple/list thereof
        Pass tuple or list to drop on multiple axes
    how : {'any', 'all'}
        * any : if any NA values are present, drop that label
        * all : if all values are NA, drop that label
    thresh : int, default None
        int value : require that many non-NA values
    subset : array-like
        Labels along other axis to consider, e.g. if you are dropping rows
        these would be a list of columns to include
    inplace : boolean, default False
        If True, do operation inplace and return None.
    
    Returns
    -------
    dropped : DataFrame
    
    Examples
    --------
    >>> df = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


- Columns: axis=1

In [109]:
data[4] = NA
data.dropna(axis=1)
data.dropna(axis=1, how='all')

0
1
2
3


Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


### Filtering Out Missing Data: *time series*

In [92]:
df = pd.DataFrame(np.random.randn(7, 3))

In [93]:
df

Unnamed: 0,0,1,2
0,-0.684452,0.068828,0.511438
1,0.596558,-0.72796,0.041876
2,-1.495368,0.194756,0.967676
3,-0.205534,0.426757,0.400607
4,1.303002,0.62397,-0.925309
5,-0.216826,-0.847332,-0.291069
6,-0.500764,-0.722916,0.971146


In [94]:
df.iloc[:4, 1] = NA
df

Unnamed: 0,0,1,2
0,-0.684452,,0.511438
1,0.596558,,0.041876
2,-1.495368,,0.967676
3,-0.205534,,0.400607
4,1.303002,0.62397,-0.925309
5,-0.216826,-0.847332,-0.291069
6,-0.500764,-0.722916,0.971146


In [95]:
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.684452,,
1,0.596558,,
2,-1.495368,,0.967676
3,-0.205534,,0.400607
4,1.303002,0.62397,-0.925309
5,-0.216826,-0.847332,-0.291069
6,-0.500764,-0.722916,0.971146


In [56]:
df.dropna()

Unnamed: 0,0,1,2
4,-2.509427,-2.116222,0.291337
5,-0.630893,-0.797022,-1.229166
6,0.022629,1.54423,1.458866


In [57]:
df.dropna(thresh=2) 

Unnamed: 0,0,1,2
2,-0.68892,,0.572164
3,0.775055,,1.310582
4,-2.509427,-2.116222,0.291337
5,-0.630893,-0.797022,-1.229166
6,0.022629,1.54423,1.458866


## Filling In Missing Data: `fillna`

In [86]:
df
df.fillna(0)

Unnamed: 0,0,1,2
0,-1.76674,,
1,-1.496573,,
2,0.303909,,0.054455
3,-0.87837,,-0.163486
4,3.144287,0.817124,0.885524
5,0.406212,0.736881,-0.364266
6,-1.130693,0.644432,0.647587


Unnamed: 0,0,1,2
0,-1.76674,0.0,0.0
1,-1.496573,0.0,0.0
2,0.303909,0.0,0.054455
3,-0.87837,0.0,-0.163486
4,3.144287,0.817124,0.885524
5,0.406212,0.736881,-0.364266
6,-1.130693,0.644432,0.647587


In [87]:
df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-1.76674,0.5,0.0
1,-1.496573,0.5,0.0
2,0.303909,0.5,0.054455
3,-0.87837,0.5,-0.163486
4,3.144287,0.817124,0.885524
5,0.406212,0.736881,-0.364266
6,-1.130693,0.644432,0.647587


In [96]:
df
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,-0.684452,,
1,0.596558,,
2,-1.495368,,0.967676
3,-0.205534,,0.400607
4,1.303002,0.62397,-0.925309
5,-0.216826,-0.847332,-0.291069
6,-0.500764,-0.722916,0.971146


Unnamed: 0,0,1,2
0,-0.684452,0.0,0.0
1,0.596558,0.0,0.0
2,-1.495368,0.0,0.967676
3,-0.205534,0.0,0.400607
4,1.303002,0.62397,-0.925309
5,-0.216826,-0.847332,-0.291069
6,-0.500764,-0.722916,0.971146


## Filling In Missing Data: interpolation with reindexing

In [110]:
df = pd.DataFrame(np.random.randn(6, 3))
df

Unnamed: 0,0,1,2
0,-0.003908,-0.983111,1.09549
1,0.060055,1.041495,1.0196
2,-0.128588,1.630826,-1.428473
3,-0.942703,-1.604473,-2.10677
4,-0.489515,0.632989,-0.830888
5,-0.263622,-1.094081,0.466595


In [111]:
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA

In [112]:
df.fillna(method='ffill')

Unnamed: 0,0,1,2
0,-0.003908,-0.983111,1.09549
1,0.060055,1.041495,1.0196
2,-0.128588,1.041495,-1.428473
3,-0.942703,1.041495,-2.10677
4,-0.489515,1.041495,-2.10677
5,-0.263622,1.041495,-2.10677


In [113]:
df.fillna(method='ffill', limit=2)

Unnamed: 0,0,1,2
0,-0.003908,-0.983111,1.09549
1,0.060055,1.041495,1.0196
2,-0.128588,1.041495,-1.428473
3,-0.942703,1.041495,-2.10677
4,-0.489515,,-2.10677
5,-0.263622,,-2.10677


In [115]:
data = pd.Series([1., NA, 3.5, NA, 7])
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

parameters for fillna

![](http://oydgk2hgw.bkt.clouddn.com/pydata-book/vtzrf.png)

# Data Cleaning and Preparation

- 1\. Handling Missing Data
- 2\. Data Transformation: Duplicates, Replacing,Permutation, Sampling,Dummy