In [127]:
import pandas as pd
import numpy as np

# Chapter 7

## 7.1 Handling Missing Data

For numeric data, pandas uses the floating-point value NaN (Not a Number) to represent missing data. We call this a _sentinel value_.

In [128]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [129]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [130]:
string_data[0] = None  # None is also treated as NA in object arrays

In [131]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### Filtering Out Missing Data

There are a few ways to filter out missing data. You can always do it by hand using .isnull and boolean indexing, but dropna can be useful. On a Series, it returns the Series with only the non-null data and index values:

In [132]:
from numpy import nan as NA

In [133]:
data = pd.Series([1, NA, 3.5, NA, 7])

In [134]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

**_equivalent to_**

In [135]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

DataFrames are more complex. dropna by default drops any row containing a missing value:

In [136]:
# 4x3 example frame: one complete row, two partially missing rows, one all-NA row.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])

In [137]:
cleaned = data.dropna()

In [138]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [139]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [140]:
data.dropna(how='all') #passing how='all' will only drop rows that are all NA

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


to drop columns in the same way, pass axis=1

In [141]:
data[4] = NA

In [142]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [143]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


suppose you want to keep only rows containing a certain number of observations. you can indicate this with the thresh argument:

In [144]:
df = pd.DataFrame(np.random.randn(7, 3))

In [145]:
df.iloc[:4, 1] = NA

In [146]:
df.iloc[:2, 2] = NA

In [147]:
df

Unnamed: 0,0,1,2
0,0.636638,,
1,-0.018214,,
2,-0.098054,,2.233822
3,-0.516327,,-1.419087
4,-0.807669,0.569404,0.409202
5,-0.760152,0.350907,0.788326
6,-1.540016,0.334777,0.618995


In [148]:
df.dropna()

Unnamed: 0,0,1,2
4,-0.807669,0.569404,0.409202
5,-0.760152,0.350907,0.788326
6,-1.540016,0.334777,0.618995


In [149]:
df.dropna(thresh=2)  # thresh=2 keeps only rows that have at least 2 non-NA values

Unnamed: 0,0,1,2
2,-0.098054,,2.233822
3,-0.516327,,-1.419087
4,-0.807669,0.569404,0.409202
5,-0.760152,0.350907,0.788326
6,-1.540016,0.334777,0.618995


### Filling In Missing Data

may want to fill "holes" in data rather than discard. Using fillna can use a constant to replace missing values

In [150]:
df.fillna(0)

Unnamed: 0,0,1,2
0,0.636638,0.0,0.0
1,-0.018214,0.0,0.0
2,-0.098054,0.0,2.233822
3,-0.516327,0.0,-1.419087
4,-0.807669,0.569404,0.409202
5,-0.760152,0.350907,0.788326
6,-1.540016,0.334777,0.618995


In [151]:
df.fillna({1: 0.5, 2: 0}) #calling fillna with a dict, you can use a different fill value for each column:

Unnamed: 0,0,1,2
0,0.636638,0.5,0.0
1,-0.018214,0.5,0.0
2,-0.098054,0.5,2.233822
3,-0.516327,0.5,-1.419087
4,-0.807669,0.569404,0.409202
5,-0.760152,0.350907,0.788326
6,-1.540016,0.334777,0.618995


fillna returns a new object, but you can modify the existing object in-place:

In [152]:
# inplace=True mutates df itself; the call's return value is discarded into _
_ = df.fillna(0, inplace=True)

In [153]:
df

Unnamed: 0,0,1,2
0,0.636638,0.0,0.0
1,-0.018214,0.0,0.0
2,-0.098054,0.0,2.233822
3,-0.516327,0.0,-1.419087
4,-0.807669,0.569404,0.409202
5,-0.760152,0.350907,0.788326
6,-1.540016,0.334777,0.618995


same interpolation methods available for reindexing can be used with fillna:


In [154]:
df = pd.DataFrame(np.random.randn(6, 3))

In [155]:
df.iloc[2:, 1] = NA

In [156]:
df.iloc[4:, 2] = NA

In [157]:
df

Unnamed: 0,0,1,2
0,-0.990417,-0.267845,-0.377046
1,0.09123,-0.240277,0.475625
2,0.691179,,-1.705068
3,0.038537,,0.797979
4,1.020961,,
5,-0.629596,,


In [158]:
# Forward-fill: propagate the last valid observation down each column.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill()

Unnamed: 0,0,1,2
0,-0.990417,-0.267845,-0.377046
1,0.09123,-0.240277,0.475625
2,0.691179,-0.240277,-1.705068
3,0.038537,-0.240277,0.797979
4,1.020961,-0.240277,0.797979
5,-0.629596,-0.240277,0.797979


In [159]:
# Forward-fill, but fill at most 2 consecutive missing values per gap.
# DataFrame.ffill(limit=...) replaces the deprecated fillna(method='ffill', limit=...).
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.990417,-0.267845,-0.377046
1,0.09123,-0.240277,0.475625
2,0.691179,-0.240277,-1.705068
3,0.038537,-0.240277,0.797979
4,1.020961,,0.797979
5,-0.629596,,0.797979


with fillna you can do lots of things, like pass the mean or median value of a Series:

In [160]:
data = pd.Series([1., NA, 3.5, 7])

In [161]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    7.000000
dtype: float64

## 7.2 Data Transformation

### Removing Duplicates

In [162]:
# Two-column example frame; the final row repeats the ('two', 4) pair before it.
data = pd.DataFrame({
    'k1': ['one', 'two'] * 3 + ['two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

#example

In [163]:
data # example of DF with dupes

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [164]:
data.duplicated()  # duplicated returns a boolean Series indicating whether each row is a duplicate or not

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [165]:
data.drop_duplicates() #drop_duplicates returns a DataFrame where the duplicated array is False:

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


both methods consider all of the columns by default. you can specify any subset of them to detect duplicates. 

In [166]:
data['v1'] = range(7)

In [167]:
data.drop_duplicates(['k1']) #filter duplicates only based on the 'k1' column

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


_duplicated_ and _drop_duplicates_ by default keep the first observed value combination. passing _keep='last'_ will return the last one

In [168]:
data.drop_duplicates(['k1', 'k2'], keep='last')

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
6,two,4,6


### Transforming Data Using a Function or Mapping

In [169]:
# One (food, ounces) record per row; note the inconsistent capitalisation
# of some food names ('Pastrami', 'Bacon').
data = pd.DataFrame(
    [
        ('bacon', 4),
        ('pulled pork', 3),
        ('bacon', 12),
        ('Pastrami', 6),
        ('corned beef', 7.5),
        ('Bacon', 8),
        ('pastrami', 3),
        ('honey ham', 5),
        ('nova lox', 6),
    ],
    columns=['food', 'ounces'],
)

In [170]:
data #example

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,Pastrami,6.0
4,corned beef,7.5
5,Bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


_add a column indicating the type of animal that each food came from_   
mapping of each distinct meat type to kind of animal

In [171]:
# Lookup from each (lower-case) meat name to the animal it comes from.
meat_to_animal = dict([
    ('bacon', 'pig'),
    ('pulled pork', 'pig'),
    ('pastrami', 'cow'),
    ('corned beef', 'cow'),
    ('honey ham', 'pig'),
    ('nova lox', 'salmon'),
])

The map method on a Series accepts a function or dict-like object containing a mapping, but the problem is that some meats are capitalized and others are not, so we need to convert each value to lowercase using the str.lower Series method:

In [172]:
lowercased = data['food'].str.lower()

In [173]:
lowercased

0          bacon
1    pulled pork
2          bacon
3       pastrami
4    corned beef
5          bacon
6       pastrami
7      honey ham
8       nova lox
Name: food, dtype: object

In [174]:
data['animal'] = lowercased.map(meat_to_animal)

In [175]:
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,6.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


using map is a convenient way to perform element-wise transformations and other data cleaning-related operations

### Replacing Values

replace provides simpler and more flexible way to modify a subset of values in an object

In [176]:
data = pd.Series([1., -999., 2., -999., -1000., 3.])

In [177]:
data

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64

the -999 values might be sentinel values for missing data. use replace to produce a new Series to replace -999 with NA

In [178]:
data.replace(-999, np.nan)

0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64

**replace multiple values at once by passing a list, then the substitute value:**


In [179]:
data.replace([-999, -1000], np.nan)

0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

**use a different replacement for each value, pass a list of substitutes**

In [180]:
data.replace([-999, -1000], [np.nan, 0])

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

**the argument passed can also be a dict**

In [181]:
data.replace({-999: np.nan, -1000: 0})

0    1.0
1    NaN
2    2.0
3    NaN
4    0.0
5    3.0
dtype: float64

**_.replace is distinct from .str.replace_**


### Renaming Axis Indexes

In [182]:
# 3x4 frame holding 0..11, with state names as the row labels.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [183]:
def transform(x):
    """Return the first four characters of ``x``, upper-cased.

    Used as the mapping function for ``data.index.map``: like a Series,
    the axis indexes have a ``map`` method.
    """
    return x[:4].upper()

In [184]:
data.index.map(transform)

Index(['OHIO', 'COLO', 'NEW '], dtype='object')

In [185]:
data.index = data.index.map(transform) #you can assign to index, modifying the DataFrame in place

In [186]:
data


Unnamed: 0,one,two,three,four
OHIO,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


In [187]:
data.rename(index=str.title, columns=str.upper) 
#create a transformed version of a dataset without modding the original by using rename

Unnamed: 0,ONE,TWO,THREE,FOUR
Ohio,0,1,2,3
Colo,4,5,6,7
New,8,9,10,11


rename can be used in conjunction with a dict-like object providing new values for a subset of the axis labels:

In [188]:
data.rename(index={'OHIO': 'INDIANA'}, columns={'three': 'peekaboo'})

Unnamed: 0,one,two,peekaboo,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


rename saves you from copying the DF manually and assigning to its index and columns attributes. To modify a dataset in place, pass inplace=True:

In [189]:
data.rename(index={'OHIO': 'INDIANA'}, inplace=True)

In [190]:
data

Unnamed: 0,one,two,three,four
INDIANA,0,1,2,3
COLO,4,5,6,7
NEW,8,9,10,11


### Discretization and Binning

continuous data is often discretized or separated into "bins" for analysis. for example, separating groups of people into age buckets for a study:

In [191]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

use _cut_ to separate into bins of 18 to 25, 26 to 35, 36 to 60, and 61 and older

In [192]:
bins = [18, 25, 35, 60, 100]

In [193]:
cats = pd.cut(ages, bins)

In [194]:
cats

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

The object pandas returns is a special Categorical object. Can treat it like an array of strings indicating the bin name. 

In [195]:
cats.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)

In [196]:
cats.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [197]:
# Count the members of each bin. The top-level pd.value_counts() is deprecated,
# so wrap the Categorical in a Series and use its value_counts() method.
pd.Series(cats).value_counts()  # a parenthesis marks an open (exclusive) edge, a bracket a closed (inclusive) one

(18, 25]     5
(25, 35]     3
(35, 60]     3
(60, 100]    1
dtype: int64

can change which side is closed by passing right=False

In [198]:
pd.cut(ages, [18, 26, 36, 61, 100], right=False)

[[18, 26), [18, 26), [18, 26), [26, 36), [18, 26), ..., [26, 36), [61, 100), [36, 61), [36, 61), [26, 36)]
Length: 12
Categories (4, interval[int64, left]): [[18, 26) < [26, 36) < [36, 61) < [61, 100)]

can pass your own bin names by passing a list or array to the labels option:

In [199]:
group_names = ['Youth', 'YoungAdult', 'MiddleAged', 'Senior']

In [200]:
pd.cut(ages, bins, labels=group_names)

['Youth', 'Youth', 'Youth', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']

passing an integer number of bins (instead of explicit bin edges) results in equal-length bins based on the min and max values in the data:


In [201]:
data = np.random.randn(20)

In [202]:
pd.cut(data, 4, precision=2)  # precision=2 limits the decimal precision of the bin labels to 2 digits

[(-1.63, -0.85], (-0.85, -0.073], (-0.85, -0.073], (-1.63, -0.85], (-0.85, -0.073], ..., (-0.073, 0.7], (-0.073, 0.7], (0.7, 1.48], (0.7, 1.48], (-0.85, -0.073]]
Length: 20
Categories (4, interval[float64, right]): [(-1.63, -0.85] < (-0.85, -0.073] < (-0.073, 0.7] < (0.7, 1.48]]

_qcut_ bins the data based on sample quantiles. you will obtain roughly equal-size bins:

In [203]:
data = np.random.randn(1000) #Normally distributed

In [204]:
cats = pd.qcut(data, 4) #cut into quantiles

In [205]:
cats

[(-0.605, 0.0236], (0.0236, 0.779], (0.779, 3.213], (0.779, 3.213], (-0.605, 0.0236], ..., (-0.605, 0.0236], (-0.605, 0.0236], (0.0236, 0.779], (-3.362, -0.605], (-0.605, 0.0236]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.362, -0.605] < (-0.605, 0.0236] < (0.0236, 0.779] < (0.779, 3.213]]

In [206]:
# Count the members of each quantile bin. The top-level pd.value_counts() is
# deprecated, so wrap the Categorical in a Series and use its method instead.
pd.Series(cats).value_counts()

(-3.362, -0.605]    250
(-0.605, 0.0236]    250
(0.0236, 0.779]     250
(0.779, 3.213]      250
dtype: int64

similar to cut, you can pass your own quantiles to qcut (numbers between 0 and 1 inclusive)

In [207]:
pd.qcut(data, [0, 0.1, 0.5, 0.9, 1])

[(-1.256, 0.0236], (0.0236, 1.33], (0.0236, 1.33], (0.0236, 1.33], (-1.256, 0.0236], ..., (-1.256, 0.0236], (-1.256, 0.0236], (0.0236, 1.33], (-1.256, 0.0236], (-1.256, 0.0236]]
Length: 1000
Categories (4, interval[float64, right]): [(-3.362, -1.256] < (-1.256, 0.0236] < (0.0236, 1.33] < (1.33, 3.213]]

### Detecting and Filtering Outliers

In [208]:
data = pd.DataFrame(np.random.randn(1000, 4))

In [209]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.013953,-0.0031,0.02809,-0.016543
std,0.966462,0.981472,1.009698,1.045315
min,-3.256492,-3.15526,-3.358117,-3.627054
25%,-0.661531,-0.67307,-0.649594,-0.755298
50%,0.019429,-0.007608,0.034155,0.024068
75%,0.693965,0.693501,0.718664,0.687248
max,3.44107,3.174414,3.313291,3.353091


In [210]:
col = data[2]

In [211]:
col[np.abs(col) > 3] #find values in one column exceeding 3 in abs value

204    3.263784
372   -3.023002
688    3.313291
964   -3.358117
Name: 2, dtype: float64

In [212]:
# Select every row that has at least one value exceeding 3 in absolute value:
# build a boolean DataFrame, then reduce it across each row with any(axis=1).
# axis is passed by keyword because newer pandas no longer accepts it positionally.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
179,-0.71394,1.640767,-0.196285,-3.627054
204,1.542168,0.083151,3.263784,0.593954
235,-3.230135,-1.074306,0.248617,-0.906297
325,0.934511,0.924462,0.069072,3.089383
372,1.047513,1.411962,-3.023002,-0.749524
388,-0.601606,0.172911,0.410354,3.353091
451,-1.143979,3.174414,-1.223736,0.327404
534,0.540777,-0.232513,1.231543,-3.067723
634,-0.250956,-0.706832,0.244392,3.060367
686,-0.328958,-3.15526,-0.028107,-1.857088


In [213]:
# Cap values outside [-3, 3]: every cell whose magnitude exceeds 3 is
# overwritten with 3 carrying that cell's own sign.
data[data.abs() > 3] = np.sign(data) * 3

In [214]:
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,0.013999,-0.003164,0.027894,-0.016033
std,0.963412,0.980297,1.006702,1.038754
min,-3.0,-3.0,-3.0,-3.0
25%,-0.661531,-0.67307,-0.649594,-0.755298
50%,0.019429,-0.007608,0.034155,0.024068
75%,0.693965,0.693501,0.718664,0.687248
max,3.0,3.0,3.0,3.0


statement np.sign(data) produces 1 and -1 values based on whether the values in data are positive or negative

In [215]:
np.sign(data).head()

Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,1.0
1,1.0,-1.0,-1.0,-1.0
2,1.0,-1.0,-1.0,-1.0
3,1.0,-1.0,1.0,-1.0
4,1.0,-1.0,1.0,1.0


### Permutation and Random Sampling

permuting (randomly reordering) a Series or the rows in a DataFrame is easy to do using the numpy.random.permutation function. calling permutation with the length of the axis you want to permute produces an array of integers indicating the new ordering:

In [216]:
df = pd.DataFrame(np.arange(5 * 4).reshape((5, 4)))

In [217]:
sampler = np.random.permutation(5)

In [218]:
sampler

array([0, 2, 3, 4, 1])

In [219]:
#that array can then be used in iloc based indexing or the equivalent take function:
df

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19


In [220]:
df.take(sampler)

Unnamed: 0,0,1,2,3
0,0,1,2,3
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
1,4,5,6,7


to select a random subset without replacement, you can use the sample method on Series and DataFrame:

In [221]:
df.sample(n=3)

Unnamed: 0,0,1,2,3
1,4,5,6,7
4,16,17,18,19
3,12,13,14,15


to generate a sample _with_ replacement(to allow repeat choices), pass replace=True

In [222]:
choices = pd.Series([5, 7, -1, 6, 4])

In [223]:
draws = choices.sample(n=10, replace=True)

In [224]:
draws

3    6
4    4
4    4
3    6
4    4
4    4
2   -1
0    5
4    4
4    4
dtype: int64

### Computing Indicator/Dummy Variables

statistical modeling or machine learning applications use another type of transformation - converting a categorical variable into a "dummy" or "indicator" matrix. if a column in a DF has k distinct values, you would derive a matrix or DataFrame with k columns containing all 1s and 0s. pandas has a get_dummies function.

In [225]:
# Small frame with a categorical 'key' column and a running counter.
df = pd.DataFrame({
    'key': list('bbacab'),
    'data1': range(6),
})

In [226]:
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


may want to add a prefix to the columns in the indicator DataFrame, which can then be merged with the other data. get_dummies has a prefix argument for doing this:

In [227]:
dummies = pd.get_dummies(df['key'], prefix='key')

In [228]:
df_with_dummy = df[['data1']].join(dummies)

In [229]:
df_with_dummy

Unnamed: 0,data1,key_a,key_b,key_c
0,0,0,1,0
1,1,0,1,0
2,2,1,0,0
3,3,0,0,1
4,4,1,0,0
5,5,0,1,0


things are more complicated if a row in DF belongs to multiple categories. example:

In [230]:
mnames = ['movie_id', 'title', 'genres']

In [231]:
import matplotlib.pyplot as plt

In [None]:
# '::' is a multi-character separator, which only the Python parsing engine
# supports; naming the engine explicitly avoids the ParserWarning fallback.
movies = pd.read_table('datasets/movielens/movies.dat', sep='::',
                       header=None, names=mnames, engine='python')

adding indicator variables for each genre requires a little wrangling. first extract the list of unique genres in the dataset:

In [None]:
all_genres = []

In [None]:
# Each genres entry is a '|'-separated string; append its pieces to all_genres.
for genre_field in movies.genres:
    all_genres += genre_field.split('|')

In [233]:
# Distinct genres in order of first appearance. Passing a plain list to
# pd.unique is deprecated, so wrap it in a Series first.
genres = pd.unique(pd.Series(all_genres))

In [None]:
genres

In [None]:
#one way to construct the indicator DF is to start with a DF of all 0s
zero_matrix = np.zeros((len(movies), len(genres)))

In [None]:
dummies = pd.DataFrame(zero_matrix, columns=genres)

now iterate through each movie and set entries in each row of dummies to 1. to do this we use dummies.columns to compute the column indices for each genre

In [None]:
gen = movies.genres[0]

In [None]:
gen.split('|')

In [None]:
dummies.columns.get_indexer(gen.split('|'))

In [None]:
# Then use .iloc to set values based on these indices: for each movie (row i),
# look up the column positions of its '|'-separated genres and flag them with 1.
# The column is named 'genres' (see mnames); 'movies.gen' does not exist.
for i, gen in enumerate(movies.genres):
    indices = dummies.columns.get_indexer(gen.split('|'))
    dummies.iloc[i, indices] = 1

In [None]:
# Then, as before, combine this with movies. add_prefix does not insert a
# separator on its own, so include the underscore to get 'Genre_<name>' columns
# (matching the 'key_a' style that get_dummies(prefix=...) produces).
movies_windic = movies.join(dummies.add_prefix('Genre_'))

In [None]:
movies_windic.iloc[0]

a useful recipe for statistical applications is to combine _get_dummies_ with a discretization function like cut:

In [234]:
np.random.seed(12345)

In [235]:
values = np.random.randn(10)

In [236]:
values

array([-0.20470766,  0.47894334, -0.51943872, -0.5557303 ,  1.96578057,
        1.39340583,  0.09290788,  0.28174615,  0.76902257,  1.24643474])

In [237]:
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]


In [238]:
pd.get_dummies(pd.cut(values, bins))

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,0,0,0,0
1,0,0,1,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0
5,0,0,0,0,0
6,1,0,0,0,0
7,0,1,0,0,0
8,0,0,0,1,0
9,0,0,0,0,0


we set the random seed with numpy.random.seed to make the example deterministic. 

## 7.3 String Manipulation