In [2]:
###############################################################
# Examples below are (more or less) taken from Wes McKinney's
# "Python for Data Analysis".
# TOPICS COVERED:
# - Intro to Series
# - Intro to DataFrame
# - Statistics for DataFrame
# - Handling Missing Data
###############################################################

# This notebook is written in Python 3.

from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from numpy import nan as NA

## Introduction to Series
Series: a one-dimensional, array-like object made up of an array of data and an associated array of data labels (its index).  
A Series can be thought of as a fixed-length, ordered dict.

In [50]:
# Build a Series with explicit labels; the index is optional and defaults to [0, 1, ...]
obj = Series(data=[4, 5, 6, 7], index=list('abdc'))
# Show the Series, then its raw values, then its index, one after another
print(obj, obj.values, obj.index, sep='\n')

a    4
b    5
d    6
c    7
dtype: int64
[4 5 6 7]
Index(['a', 'b', 'd', 'c'], dtype='object')


In [51]:
# A Python dict maps straight onto a Series: keys become the index, values the data
obj = Series(dict(Ohio=1000, Texas=2000, Utah=5000))
print(obj)

Ohio     1000
Texas    2000
Utah     5000
dtype: int64


In [52]:
# isnull / notnull flag which entries are missing (NaN).
# 'California' is requested in the index but absent from the dict, so its value is NaN.
obj = Series(dict(Ohio=1000, Texas=2000, Utah=5000),
             index=['California', 'Ohio', 'Texas', 'Utah'])
print(obj)
obj.isnull()

California     NaN
Ohio          1000
Texas         2000
Utah          5000
dtype: float64


California     True
Ohio          False
Texas         False
Utah          False
dtype: bool

In [53]:
# Arithmetic between two Series aligns the data on the index labels automatically;
# a label that appears in only one operand produces NaN in the result.
obj2 = Series(dict(Ohio=500, Texas=20, Utah=50, Wyoming=2))
print(obj, obj2, obj + obj2, sep='\n')

California     NaN
Ohio          1000
Texas         2000
Utah          5000
dtype: float64
Ohio       500
Texas       20
Utah        50
Wyoming      2
dtype: int64
California     NaN
Ohio          1500
Texas         2020
Utah          5050
Wyoming        NaN
dtype: float64


In [54]:
# Both the index and the Series itself carry an optional name attribute
obj.index.name = "State"
obj.name = "Population"
obj

State
California     NaN
Ohio          1000
Texas         2000
Utah          5000
Name: Population, dtype: float64

In [80]:
# Sorting a Series by value. Missing values are put at the end by default.
obj = Series([4, np.nan, 7, np.nan, -3, 2])
print(obj)
# Series.order() was deprecated and later removed from pandas;
# sort_values() is its replacement and returns a new, sorted Series.
print(obj.sort_values())

# Ranking a Series (entries that are NaN stay NaN in the result)
print(obj.rank())
# If there are ties, they are all given the mean rank.
# To break ties by the order in which they appear: obj.rank(method='first')
# To rank in descending order: obj.rank(ascending=False, method='max')

0     4
1   NaN
2     7
3   NaN
4    -3
5     2
dtype: float64
4    -3
5     2
0     4
2     7
1   NaN
3   NaN
dtype: float64
0     3
1   NaN
2     4
3   NaN
4     1
5     2
dtype: float64




## Introduction to DataFrame
DataFrame represents a tabular, spreadsheet-like data structure.
Can be thought of as a dict of Series (all sharing the same index).

In [55]:
# Build a DataFrame from a dict of equal-length lists: one column per key
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)
frame = DataFrame(data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [56]:
# Column order and row labels can be given explicitly; a requested column that
# is not present in the data ('debt') shows up filled with NaN.
# NOTE: Row labels need not be unique!
frame2 = DataFrame(data,
                   index=[1, 2, 3, 4, 5],
                   columns=['year', 'state', 'pop', 'debt'])
print(frame2)

# A whole column can be set by assignment
frame2['debt'] = np.arange(5, dtype=float)
print(frame2)

   year   state  pop debt
1  2000    Ohio  1.5  NaN
2  2001    Ohio  1.7  NaN
3  2002    Ohio  3.6  NaN
4  2001  Nevada  2.4  NaN
5  2002  Nevada  2.9  NaN
   year   state  pop  debt
1  2000    Ohio  1.5     0
2  2001    Ohio  1.7     1
3  2002    Ohio  3.6     2
4  2001  Nevada  2.4     3
5  2002  Nevada  2.9     4


In [57]:
# Assigning to a column that doesn't exist creates a new column.
# Here a Series is assigned: it is aligned on the index labels, and rows whose
# label is missing from the Series (2 and 4) receive NaN.
frame2['debt'] = Series([1, 2, 3], index=[1, 3, 5])
print(frame2)

# A boolean expression can be used to build a column of True/False values
frame2['eastern'] = frame2['state'] == 'Ohio'
print(frame2)

   year   state  pop  debt
1  2000    Ohio  1.5     1
2  2001    Ohio  1.7   NaN
3  2002    Ohio  3.6     2
4  2001  Nevada  2.4   NaN
5  2002  Nevada  2.9     3
   year   state  pop  debt eastern
1  2000    Ohio  1.5     1    True
2  2001    Ohio  1.7   NaN    True
3  2002    Ohio  3.6     2    True
4  2001  Nevada  2.4   NaN   False
5  2002  Nevada  2.9     3   False


In [58]:
# del removes a column from the DataFrame in place
del frame2['eastern']
print(frame2)

# drop returns a new object without the given row labels
tempframe = frame2.drop([4, 5], axis=0)
print(tempframe)

   year   state  pop  debt
1  2000    Ohio  1.5     1
2  2001    Ohio  1.7   NaN
3  2002    Ohio  3.6     2
4  2001  Nevada  2.4   NaN
5  2002  Nevada  2.9     3
   year state  pop  debt
1  2000  Ohio  1.5     1
2  2001  Ohio  1.7   NaN
3  2002  Ohio  3.6     2


In [59]:
# Two equivalent ways of pulling a single column out of a DataFrame
print(frame2.columns)   # the column labels (an Index object, not a plain list)
print(frame2['state'])  # state column via dict-style indexing
print(frame2.state)     # the same state column via attribute access

Index(['year', 'state', 'pop', 'debt'], dtype='object')
1      Ohio
2      Ohio
3      Ohio
4    Nevada
5    Nevada
Name: state, dtype: object
1      Ohio
2      Ohio
3      Ohio
4    Nevada
5    Nevada
Name: state, dtype: object


In [60]:
# Rows can be retrieved by label using loc (iloc selects by integer position).
# The .ix indexer originally used here has been removed from pandas; frame2 has
# an integer index, so .ix looked 3 up as a label, which is what .loc does.
print(frame2)
frame2.loc[3]

   year   state  pop  debt
1  2000    Ohio  1.5     1
2  2001    Ohio  1.7   NaN
3  2002    Ohio  3.6     2
4  2001  Nevada  2.4   NaN
5  2002  Nevada  2.9     3


year     2002
state    Ohio
pop       3.6
debt        2
Name: 3, dtype: object

In [61]:
# The values attribute returns the data as a 2D ndarray.
# If the DataFrame's columns have different dtypes,
# the dtype of the values array is chosen to accommodate all the columns
# (dtype=object in the output below).
frame2.values

array([[2000, 'Ohio', 1.5, 1.0],
       [2001, 'Ohio', 1.7, nan],
       [2002, 'Ohio', 3.6, 2.0],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, 3.0]], dtype=object)

### Some Functionality of DataFrames

In [63]:
# reindex builds a new object whose data is conformed to the requested index;
# a label that was not in the original ('e') is introduced with a NaN value
obj = Series([4.5, 7.2, -5.3, 3.6], index=list('dbac'))
print(obj)
print(obj.reindex(list('abcde')))

# a DataFrame's columns can also be reindexed: e.g. frame.reindex(columns=...)

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64


In [64]:
# Different ways to populate the gaps that reindexing introduces
obj3 = Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)
print(obj3.reindex(range(6)))                  # gaps are left as NaN
print(obj3.reindex(range(6), fill_value=0))    # gaps take a constant value
print(obj3.reindex(range(6), method='ffill'))  # gaps take the previous value (forward fill)
# 'ffill' and 'pad' are interchangeable
# for a backward fill, use 'bfill' or 'backfill'

0      blue
2    purple
4    yellow
dtype: object
0      blue
1       NaN
2    purple
3       NaN
4    yellow
5       NaN
dtype: object
0      blue
1         0
2    purple
3         0
4    yellow
5         0
dtype: object
0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object


In [69]:
# Slicing by label differs from ordinary Python slicing in that
# the endpoint is included.
obj = Series(np.arange(4, dtype=float), index=list('abcd'))
print(obj)
print(obj.loc['b':'c'])

a    0
b    1
c    2
d    3
dtype: float64
b    1
c    2
dtype: float64


In [73]:
# NumPy broadcasting on a plain 2D array: subtracting the first row from the
# whole array applies the subtraction to every row.
arr = np.arange(12, dtype=float).reshape(3, 4)
print(arr)
print(arr[0])
print(arr - arr[0])  # arr[0] is broadcast across the rows

[[  0.   1.   2.   3.]
 [  4.   5.   6.   7.]
 [  8.   9.  10.  11.]]
[ 0.  1.  2.  3.]
[[ 0.  0.  0.  0.]
 [ 4.  4.  4.  4.]
 [ 8.  8.  8.  8.]]


In [76]:
# apply runs a function along one axis of the DataFrame
frame = DataFrame(np.random.randn(4, 3),
                  index=['Utah', 'Ohio', 'Texas', 'Oregon'],
                  columns=['b', 'd', 'e'])
print(frame)


def f(x):
    """Return the largest value in x."""
    return x.max()


print(frame.apply(f))          # default axis: one answer per column
print(frame.apply(f, axis=1))  # axis=1: one answer per row

               b         d         e
Utah    1.771692 -0.512530  0.585855
Ohio   -1.201704 -0.308852 -0.081640
Texas   0.768470 -0.130008 -0.363920
Oregon  0.528494 -0.690417 -0.911682
b    1.771692
d   -0.130008
e    0.585855
dtype: float64
Utah      1.771692
Ohio     -0.081640
Texas     0.768470
Oregon    0.528494
dtype: float64


In [77]:
# applymap applies a function to every element of the DataFrame
def format(x):
    """Render x as a string with two digits after the decimal point."""
    return '%.2f' % x


frame.applymap(format)

Unnamed: 0,b,d,e
Utah,1.77,-0.51,0.59
Ohio,-1.2,-0.31,-0.08
Texas,0.77,-0.13,-0.36
Oregon,0.53,-0.69,-0.91


In [94]:
# Sorting a DataFrame, either by its labels or by the values in its columns
np.random.seed(13)  # fixed seed so the random integers are reproducible
frame = DataFrame(np.random.randint(3, size=12).reshape((3, 4)),
                  index=['three', 'one', 'two'],
                  columns=['d', 'a', 'b', 'c'])
print(frame)
print(frame.sort_index())        # sort the rows by label (lexicographical)
print(frame.sort_index(axis=1))  # sort the columns by label (lexicographical)
# sort_index(by=...) is no longer accepted by pandas; sorting by the values of
# one or more columns is done with sort_values(by=...).
print(frame.sort_values(by='d'))         # sort by column 'd'
print(frame.sort_values(by=['d', 'a']))  # sort by column 'd', then 'a'

       d  a  b  c
three  2  0  2  0
one    2  2  0  1
two    0  2  2  0
       d  a  b  c
one    2  2  0  1
three  2  0  2  0
two    0  2  2  0
       a  b  c  d
three  0  2  0  2
one    2  0  1  2
two    2  2  0  0
       d  a  b  c
two    0  2  2  0
three  2  0  2  0
one    2  2  0  1
       d  a  b  c
two    0  2  2  0
three  2  0  2  0
one    2  2  0  1




## Statistics for DataFrame

In [3]:
# A small frame with missing values to demonstrate the reductions
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
print(df)
print(df.sum())         # gives column sums
# Row sums. min_count=1 asks for at least one non-NA value per row, so the
# all-NA row 'c' yields NaN; without it, current pandas returns 0 for that row.
print(df.sum(axis=1, min_count=1))

# NAs are skipped by default (skipna=True).
# This can be disabled, in which case any NA in a row makes its sum NaN:
print(df.sum(axis=1, skipna=False))

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3
one    9.25
two   -5.80
dtype: float64
a    1.40
b    2.60
c     NaN
d   -0.55
dtype: float64
a     NaN
b    2.60
c     NaN
d   -0.55
dtype: float64


In [4]:
# describe() produces multiple summary statistics in one shot
# (count, mean, std, min, quartiles and max for each numeric column)
print(df)
df.describe()

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3


Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [5]:
# For non-numeric data, describe reports count, unique, top and freq instead
obj = Series(list('aabc') * 4)
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

## Handling Missing Data

In [6]:
# pandas represents missing data with NaN in floating-point and
# non-floating-point arrays alike.
# Python's built-in None value is treated as NA as well.

string_data = Series(['aardvark', 'artichoke', np.nan, 'avocado', None])
print(string_data, string_data.isnull(), sep='\n')

0     aardvark
1    artichoke
2          NaN
3      avocado
4         None
dtype: object
0    False
1    False
2     True
3    False
4     True
dtype: bool


In [7]:
# dropna returns the Series with its missing entries removed
data = Series([1, NA, 3.5, NA, 7], dtype=float)
data.dropna()

# equivalent to data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [8]:
# On a DataFrame, dropna by default removes every row that contains a missing value.
# To drop columns instead, pass the argument axis=1.
data = DataFrame([[1., 6.5, 3.],
                  [1., NA, NA],
                  [NA, NA, NA],
                  [NA, 6.5, 3.]])
cleaned1 = data.dropna(how='any')  # the default: a row goes if any value is NA
cleaned2 = data.dropna(how='all')  # a row goes only if all of its values are NA
cleaned3 = data.dropna(thresh=2)   # a row goes if it has fewer than 2 non-NA values
print(data, cleaned1, cleaned2, cleaned3, sep='\n')

    0    1   2
0   1  6.5   3
1   1  NaN NaN
2 NaN  NaN NaN
3 NaN  6.5   3
   0    1  2
0  1  6.5  3
    0    1   2
0   1  6.5   3
1   1  NaN NaN
3 NaN  6.5   3
    0    1  2
0   1  6.5  3
3 NaN  6.5  3


In [9]:
# fillna fills in NA values.
# fillna returns a new object. To fill in place, pass the argument inplace=True.

df = DataFrame(np.random.randn(7, 3))
# The .ix indexer originally used here has been removed from pandas. On this
# default integer index it resolved the slices by label, endpoint included,
# which is what .loc does.
df.loc[:4, 1] = NA  # rows 0 through 4 of column 1
df.loc[:2, 2] = NA  # rows 0 through 2 of column 2
print(df)
print(df.fillna(0))             # Fill NAs with value 0
print(df.fillna({1: 4, 2: 3}))  # Fill NAs in col 1 with 4, col 2 with 3

          0         1         2
0  0.134889       NaN       NaN
1 -0.257648       NaN       NaN
2  0.456481       NaN       NaN
3 -2.632616       NaN -1.092949
4  1.157239       NaN  0.478331
5 -0.578996  0.192436 -1.443022
6  0.481761  0.395612  0.966585
          0         1         2
0  0.134889  0.000000  0.000000
1 -0.257648  0.000000  0.000000
2  0.456481  0.000000  0.000000
3 -2.632616  0.000000 -1.092949
4  1.157239  0.000000  0.478331
5 -0.578996  0.192436 -1.443022
6  0.481761  0.395612  0.966585
          0         1         2
0  0.134889  4.000000  3.000000
1 -0.257648  4.000000  3.000000
2  0.456481  4.000000  3.000000
3 -2.632616  4.000000 -1.092949
4  1.157239  4.000000  0.478331
5 -0.578996  0.192436 -1.443022
6  0.481761  0.395612  0.966585
