# Tidying Up your Data

In [34]:
# import numpy and pandas
import numpy as np
import pandas as pd

# used for dates
import datetime
from datetime import datetime, date

# Working with missing data

In [None]:
# create a DataFrame with 5 rows and 3 columns
df = pd.DataFrame(np.arange(0, 15).reshape(5, 3),
                  index=['a', 'b', 'c', 'd', 'e'],
                  columns=['c1', 'c2', 'c3'])
# add some columns and rows to the DataFrame
# column c4 with NaN values
df['c4'] = np.nan
# row 'f' with 15 through 18
df.loc['f'] = np.arange(15, 19)
# row 'g' with all NaN
df.loc['g'] = np.nan
# column 'c5' with NaN's
df['c5'] = np.nan
# change value in col 'c4' row 'a'
# use one .loc[row, column] call rather than chained indexing
# (df['c4']['a'] = ...), which may assign into a temporary copy
# and leave df unchanged
df.loc['a', 'c4'] = 20
df

## Determining NaN values in Series and DataFrame objects

In [None]:
# return a DataFrame that specify which items are NaN
df.isnull()

In [None]:
# count the number of NaN's in each column
df.isnull().sum()

In [None]:
# total count of NaN values
df.isnull().sum().sum()

In [None]:
# number of non-NaN values in each column
df.count()

In [None]:
# which items are not null?
df.notnull()

## Dropping missing data

In [None]:
# this gets all non NaN items in column c4
# dropna returns a copy with the values dropped
# the source DataFrame / column is not changed
df.c4.dropna()

In [None]:
# on a DataFrame this will drop entire rows
# where there is at least one NaN
# in this case, that is all rows
df.dropna()

In [None]:
# using how='all', only rows that have all values
# as NaN will be dropped
df.dropna(how = 'all')

In [None]:
# flip to drop columns instead of rows
df.dropna(how='all', axis=1) # say goodbye to c5

In [None]:
# now drop columns with any NaN values
df.dropna(how='any', axis=1)

In [None]:
# keep only the columns that have at least 5 non-NaN values
# (thresh is the minimum number of non-NaN values a column
# needs in order to be kept; all other columns are dropped)
df.dropna(thresh=5, axis=1)

## How pandas handles NaN's in mathematical operations

- When a NumPy function encounters a `NaN` value, it returns `NaN`
- Pandas functions typically ignore the `NaN` values and continue processing the function

In [None]:
# create a NumPy array with one NaN value
a = np.array([1, 2, np.nan, 3])
# create a Series from the array
s = pd.Series(a)
# the mean of each is different
a.mean(), s.mean()

In [None]:
# demonstrate sum, mean and cumsum handling of NaN
# get one column
s = df.c4
# no trailing comma here: a trailing comma would make the cell
# evaluate to a one-element tuple instead of the scalar sum
s.sum()  # NaN's treated as 0

In [None]:
s.mean() # NaN's are skipped (not counted as items), rather than treated as 0

In [None]:
# as 0 in the cumsum, but NaN's preserved in result Series
s.cumsum()

In [None]:
# in arithmetic, a NaN value will result in NaN
df.c4 + 1

## Filling in missing data

In [None]:
# return a new DataFrame with NaN's filled with 0
filled = df.fillna(0)
filled

In [None]:
# NaN's don't count as an item in calculating
# the means
df.mean()

In [None]:
# having replaced NaN with 0 can make
# operations such as mean have different results
filled.mean()

## Forward and backwards filling of missing values

In [26]:
df.c4

a    20.0
b     NaN
c     NaN
d     NaN
e     NaN
f    18.0
g     NaN
Name: c4, dtype: float64

In [24]:
# extract the c4 column and fill NaNs forward
# (each NaN takes the most recent non-NaN value before it)
# .ffill() is used instead of the deprecated fillna(method="ffill")
df.c4.ffill()

a    20.0
b    20.0
c    20.0
d    20.0
e    20.0
f    18.0
g    18.0
Name: c4, dtype: float64

In [25]:
# perform a backwards fill
# (each NaN takes the next non-NaN value after it; a trailing
# NaN with no later value stays NaN)
# .bfill() is used instead of the deprecated fillna(method="bfill")
df.c4.bfill()

a    20.0
b    18.0
c    18.0
d    18.0
e    18.0
f    18.0
g     NaN
Name: c4, dtype: float64

## Filling using index labels

In [27]:
# create a new Series of values to be 
# used to fill NaN's where index label matches
fill_values = pd.Series([100, 101, 102], index=['a', 'e', 'g'])
fill_values

a    100
e    101
g    102
dtype: int64

In [28]:
# using c4, fill using fill_values
# only NaN items are filled: e and g get the matching values,
# while a already holds a value (20) and is left unchanged
df.c4.fillna(fill_values)

a     20.0
b      NaN
c      NaN
d      NaN
e    101.0
f     18.0
g    102.0
Name: c4, dtype: float64

In [None]:
# fill NaN values in each column with the 
# mean of the values in that column
df.fillna(df.mean())

## Interpolation of missing values

In [30]:
# linear interpolate the NaN values from 1 through 2
s = pd.Series([1, np.nan, np.nan, np.nan, 2])
s.interpolate()

0    1.00
1    1.25
2    1.50
3    1.75
4    2.00
dtype: float64

In [35]:
# create a time series, but missing one date in the Series
ts = pd.Series([1, np.nan, 2], 
            index=[datetime(2014, 1, 1), 
                   datetime(2014, 2, 1),                   
                   datetime(2014, 4, 1)])
ts

2014-01-01    1.0
2014-02-01    NaN
2014-04-01    2.0
dtype: float64

In [36]:
# linear interpolate based on number of items in the Series
ts.interpolate()

2014-01-01    1.0
2014-02-01    1.5
2014-04-01    2.0
dtype: float64

In [37]:
# this accounts for the fact that we don't have
# an entry for 2014-03-01
ts.interpolate(method="time")

2014-01-01    1.000000
2014-02-01    1.344444
2014-04-01    2.000000
dtype: float64

In [45]:
# a Series to demonstrate index label based interpolation
s = pd.Series([0, np.nan, 100], index=[0, 1, 10])
s

0       0.0
1       NaN
10    100.0
dtype: float64

In [46]:
# linear interpolate
s.interpolate()

0       0.0
1      50.0
10    100.0
dtype: float64

In [42]:
# interpolate based upon the values in the index
s.interpolate(method="values")

0       0.0
1      10.0
10    100.0
dtype: float64

## Handling Duplicate Data

In [47]:
# a DataFrame with lots of duplicate data
data = pd.DataFrame({'a': ['x'] * 3 + ['y'] * 4, 
                     'b': [1, 1, 2, 3, 3, 4, 4]})
data

Unnamed: 0,a,b
0,x,1
1,x,1
2,x,2
3,y,3
4,y,3
5,y,4
6,y,4


In [48]:
# reports which rows are duplicates based upon
# if the data in all columns was seen before
data.duplicated()

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [49]:
# drop duplicate rows retaining first row of the duplicates
data.drop_duplicates()

Unnamed: 0,a,b
0,x,1
2,x,2
3,y,3
5,y,4


In [50]:
# drop duplicate rows, only keeping the last 
# instance of any data
data.drop_duplicates(keep='last')

Unnamed: 0,a,b
1,x,1
2,x,2
4,y,3
6,y,4


In [51]:
# add a column c with values 0..6
# this makes .duplicated() report no duplicate rows
data['c'] = range(7)
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
dtype: bool

In [None]:
# but if we specify duplicates to be dropped only in columns a & b
# they will be dropped
data.drop_duplicates(['a', 'b'])

## Mapping

In [52]:
# create two Series objects to demonstrate mapping
x = pd.Series({"one": 1, "two": 2, "three": 3})
y = pd.Series({1: "a", 2: "b", 3: "c"})
x

one      1
two      2
three    3
dtype: int64

In [None]:
y

In [54]:
# map values in x to values in y
# index labels of x but the values from y
x.map(y)

one      a
two      b
three    c
dtype: object

In [55]:
# three in x will not align / map to a value in y
x = pd.Series({"one": 1, "two": 2, "three": 3})
y = pd.Series({1: "a", 2: "b"})
x.map(y)

one        a
two        b
three    NaN
dtype: object

## Replacing values

In [65]:
# create a Series to demonstrate replace
s = pd.Series([0., 1., 2., 3., 2., 4.])
s

0    0.0
1    1.0
2    2.0
3    3.0
4    2.0
5    4.0
dtype: float64

In [57]:
# replace all items having the value 2 with the value 5
# (replace matches on values, not on index labels)
s.replace(2, 5)

0    0.0
1    1.0
2    5.0
3    3.0
4    5.0
5    4.0
dtype: float64

In [58]:
# replace all items with new values
s.replace([0, 1, 2, 3, 4], [4, 3, 2, 1, 0])

0    4.0
1    3.0
2    2.0
3    1.0
4    2.0
5    0.0
dtype: float64

In [59]:
# replace using entries in a dictionary
s.replace({0: 10, 1: 100})

0     10.0
1    100.0
2      2.0
3      3.0
4      2.0
5      4.0
dtype: float64

In [61]:
# DataFrame with two columns
df = pd.DataFrame({'a': [0, 1, 2, 3, 4], 'b': [5, 6, 7, 8, 9]})
df

Unnamed: 0,a,b
0,0,5
1,1,6
2,2,7
3,3,8
4,4,9


In [62]:
# specify different replacement values for each column
# replace value 1 in column a with value 100
# replace value 8 in column b with value 100
df.replace({'a': 1, 'b': 8}, 100)

Unnamed: 0,a,b
0,0,5
1,100,6
2,2,7
3,3,100
4,4,9


In [67]:
# demonstrate replacement with pad method
# set first item to 10, to have a distinct replacement value
s[0] = 10
s

0    10.0
1     1.0
2     2.0
3     3.0
4     2.0
5     4.0
dtype: float64

In [68]:
# replace items having the value 1, 2 or 3 with the most recent
# prior value that is not itself being replaced (10)
# the method= argument of replace is deprecated, so mask the
# matching values to NaN and forward fill them instead
s.mask(s.isin([1, 2, 3])).ffill()

0    10.0
1    10.0
2    10.0
3    10.0
4    10.0
5     4.0
dtype: float64

## Applying functions to transform data

In [None]:
# demonstrate applying a function to every item of a Series
s = pd.Series(np.arange(0, 5))
s.apply(lambda v: v * 2)

In [69]:
# demonstrate applying a sum on each column
df = pd.DataFrame(np.arange(12).reshape(4, 3), 
                  columns=['a', 'b', 'c'])
df

Unnamed: 0,a,b,c
0,0,1,2
1,3,4,5
2,6,7,8
3,9,10,11


In [70]:
# calculate the sum of items in each column
# (a single total per column, not a running / cumulative sum)
df.apply(lambda col: col.sum())

a    18
b    22
c    26
dtype: int64

In [72]:
# calculate sum of items in each row
df.apply(lambda row: row.sum(), axis=1)

0     3
1    12
2    21
3    30
dtype: int64

In [73]:
# create a new column 'interim' with a * b
df['interim'] = df.apply(lambda r: r.a * r.b, axis=1)
df

Unnamed: 0,a,b,c,interim
0,0,1,2,0
1,3,4,5,12
2,6,7,8,42
3,9,10,11,90


In [None]:
# replace column a with the sum of columns a, b and c
df.a = df.a + df.b + df.c
df

In [None]:
# create a 3x5 DataFrame
# only second row has a NaN
# the values are created as floats up front: NaN is a float, and
# writing it into an integer column relies on implicit upcasting
df = pd.DataFrame(np.arange(0, 15, dtype=float).reshape(3, 5))
df.loc[1, 2] = np.nan
df

In [None]:
# demonstrate applying a function to only rows having
# a count of 0 NaN values
df.dropna().apply(lambda x: x.sum(), axis=1)

In [None]:
# use applymap to format all items of the DataFrame
# each item is rendered as a string with two decimal places
# NOTE(review): newer pandas releases deprecate applymap in favor of
# DataFrame.map -- confirm against the pandas version in use
df.applymap(lambda x: '%.2f' % x)