# Getting Started w/ Pandas

- provides high-level data structures (`Series` and `DataFrame`)
- built on top of NumPy

In [6]:
#from pandas import Series, Dataframe
import pandas as pd
import numpy as np

from pandas import Series, DataFrame

## Pandas Data Structures

### Series
- a one-dimensional array-like object containing an array of data and an associated array of data labels, called its index

In [4]:
# Build a Series from a plain list; no index is given, so pandas labels the entries 0..3.
obj = Series(data=[4, 7, -5, 3])

In [6]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [8]:
# Same values as before, but with explicit string labels instead of the default integers.
obj2 = Series(data=[4, 7, -5, 3], index=list('abcd'))
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [10]:
obj2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [15]:
# Select several entries at once by passing a list of index labels.
obj2[['a', 'b']]

a    4
b    7
dtype: int64

In [16]:
# Boolean filtering: keep only the entries whose value is positive; labels are preserved.
obj2[obj2>0]

a    4
b    7
d    3
dtype: int64

In [18]:
# Arithmetic with a scalar is applied element-wise; the index is carried along unchanged.
obj2*2

a     8
b    14
c   -10
d     6
dtype: int64

In [19]:
# `in` tests membership against the index labels (like a dict's keys), not the values.
'b' in obj2

True

In [20]:
'e' in obj2

False

### Create a Series from Python Dict

In [21]:
# Plain mapping of letter -> number, used below to build a Series.
data = dict(a=1, b=2, c=3, d=4)

In [22]:
# Dict keys become the index labels and dict values become the data.
obj3 = Series(data)

In [23]:
obj3

a    1
b    2
c    3
d    4
dtype: int64

In [27]:
# Passing an explicit index together with a dict: labels absent from the dict
# (here 'e') get NaN, which is why the resulting dtype is float64 rather than int64.
obj4 = Series(data, index=['a', 'b', 'c', 'd','e'])
obj4

a    1.0
b    2.0
c    3.0
d    4.0
e    NaN
dtype: float64

### How to identify missing values in Pandas?

In [28]:
# Boolean Series that is True wherever the value is missing (NaN).
pd.isnull(obj4)

a    False
b    False
c    False
d    False
e     True
dtype: bool

In [30]:
# The inverse check: True wherever a value is present.
pd.notnull(obj4)

a     True
b     True
c     True
d     True
e    False
dtype: bool

### How to set Series attributes?

In [31]:
# Name the Series itself; the name appears in the footer when the Series is displayed.
obj4.name = "Letters Mapping"

In [32]:
# Name the index; the name appears as a header above the labels when displayed.
obj4.index.name = 'Letters'

In [33]:
obj4

Letters
a    1.0
b    2.0
c    3.0
d    4.0
e    NaN
Name: Letters Mapping, dtype: float64

## DataFrame
- a tabular, spreadsheet-like data structure
- it has both a row index and a column index


### Two ways to create a DataFrame

### 1. The most common way is from a dict of equal-length lists or arrays

In [39]:
# Each dict key becomes a column name; every list must have the same length.
data = {
    'letters': list('abcd'),
    'numbers': [1, 2, 3, 4],
    'year': [2010, 2011, 2012, 2013],
}
frame = DataFrame(data)
frame

Unnamed: 0,letters,numbers,year
0,a,1,2010
1,b,2,2011
2,c,3,2012
3,d,4,2013


In [43]:
# Pass `columns=` to choose the order in which the dict's keys appear as columns.
DataFrame(data, columns=['year', 'letters', 'numbers'])

Unnamed: 0,year,letters,numbers
0,2010,a,1
1,2011,b,2
2,2012,c,3
3,2013,d,4


### 2. From a nested dict (outer keys become columns, inner keys become row labels)

In [44]:
# Outer keys -> columns, inner keys -> row labels; year/column pairs that are
# missing from the inner dicts show up as NaN in the resulting frame.
nested_data = {
    'Male': {2010: 2.4, 2011: 2.5},
    'Female': {2010: 2.6, 2012: 3},
}
DataFrame(nested_data)

Unnamed: 0,Female,Male
2010,2.6,2.4
2011,,2.5
2012,3.0,


In [46]:
# An explicit index controls which row labels appear; 2009 is not a key in either
# inner dict, so its row is entirely NaN.
DataFrame(nested_data, index=[2009, 2010, 2011, 2012])

Unnamed: 0,Female,Male
2009,,
2010,2.6,2.4
2011,,2.5
2012,3.0,


## Fundamental Mechanics of interacting with the data in Series and DataFrame

### Reindexing

In [48]:
# Labels are deliberately out of alphabetical order so that reindexing visibly rearranges the data.
obj = Series(data=[1, 2, 3, 4], index=list('bdca'))
obj

b    1
d    2
c    3
a    4
dtype: int64

In [50]:
# reindex() returns a new Series whose entries follow the requested label order;
# each value stays attached to its original label.
new_obj = obj.reindex(['a', 'b', 'c', 'd'])
new_obj

a    4
b    1
c    3
d    2
dtype: int64

In [51]:
# 'e' is not an existing label; fill_value=0 is used for it instead of NaN,
# so the result keeps its integer dtype.
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value=0)

a    4
b    1
c    3
d    2
e    0
dtype: int64

In [54]:
# For ordered data such as time series: a sparse integer index (0, 2, 4) with gaps
# that the next cell fills in while reindexing.
obj3 = Series(data=list('abc'), index=[0, 2, 4])
obj3

0    a
2    b
4    c
dtype: object

In [57]:
# Reindex onto 0..5; method='ffill' fills each new label with the last preceding value.
obj3.reindex(range(6), method='ffill')

# Exercise: try method='bfill' and compare the result.

0    a
1    a
2    b
3    b
4    c
5    c
dtype: object

In [60]:
import numpy as np

In [62]:
# 3x3 frame holding the integers 0..8: rows labelled 'a'-'c', columns labelled by year (as strings).
df = DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('abc'),
    columns=['2010', '2011', '2012'],
)
df

Unnamed: 0,2010,2011,2012
a,0,1,2
b,3,4,5
c,6,7,8


In [63]:
# Reindex the rows: the new label 'd' has no data, so its row is all NaN
# and the values are displayed as floats.
df.reindex(['a','b','c', 'd'])

Unnamed: 0,2010,2011,2012
a,0.0,1.0,2.0
b,3.0,4.0,5.0
c,6.0,7.0,8.0
d,,,


In [66]:
# Reindex the columns instead by using the `columns=` keyword; '2013' does not exist,
# so that column is all NaN.
df.reindex(columns=['2013', '2012', '2011', '2010'])

Unnamed: 0,2013,2012,2011,2010
a,,2,1,0
b,,5,4,3
c,,8,7,6


In [67]:
df

Unnamed: 0,2010,2011,2012
a,0,1,2
b,3,4,5
c,6,7,8


In [69]:
# Succinct way: reorder/extend rows and columns in a single call.
# The `.ix` indexer originally used here was removed in pandas 1.0, and `.loc`
# raises KeyError for labels that do not exist ('d', '2013'), so reindex() is the
# supported way to get NaN-filled rows/columns for missing labels.
df.reindex(index=['b', 'c', 'a', 'd'], columns=['2013', '2012', '2011', '2010'])

Unnamed: 0,2013,2012,2011,2010
b,,5.0,4.0,3.0
c,,8.0,7.0,6.0
a,,2.0,1.0,0.0
d,,,,


### Dropping entries from an axis

In [72]:
# Series example: float values 0.0-4.0 labelled 'a'-'e'.
series_obj = Series(np.arange(5, dtype=float), index=list('abcde'))
series_obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [74]:
# Delete a single label; drop() returns a new Series.
# NOTE: this operates on `obj` (the int64 Series created in the Reindexing section),
# not on `series_obj` defined in the previous cell.
new_obj = obj.drop('c')
new_obj

b    1
d    2
a    4
dtype: int64

In [77]:
# Delete several labels at once by passing a list. `obj` still contains 'c' here
# because the earlier drop() returned a new Series instead of modifying `obj`.
new_obj = obj.drop(['a', 'b'])
new_obj

d    2
c    3
dtype: int64

In [79]:
# DataFrame example: a 4x4 grid of 0..15 with years as row labels and letters as columns.
# Note: this rebinds `data`, which earlier in the notebook held a plain dict.
data = DataFrame(
    np.arange(16).reshape(4, 4),
    index=[2010, 2011, 2012, 2013],
    columns=list('abcd'),
)
data

Unnamed: 0,a,b,c,d
2010,0,1,2,3
2011,4,5,6,7
2012,8,9,10,11
2013,12,13,14,15


In [81]:
# Drop several rows by label; the result is a new frame and `data` itself keeps all four rows.
data.drop([2011, 2012])

Unnamed: 0,a,b,c,d
2010,0,1,2,3
2013,12,13,14,15


In [83]:
# Drop several columns by label: axis=1 makes drop() look at column names instead of row labels.
data.drop(['b', 'c'], axis=1)

Unnamed: 0,a,d
2010,0,3
2011,4,7
2012,8,11
2013,12,15


# Indexing

In [23]:
# Series used for the indexing examples: float values 0.0-3.0 labelled 'a'-'d'.
obj = Series(np.arange(4, dtype=float), index=list('abcd'))
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [24]:
# Look up a single value by its index label; the result is a scalar.
obj['b']

1.0

In [27]:
obj[0:2] # integer slice selects by position: the first two entries (end position 2 is excluded)

a    0.0
b    1.0
dtype: float64

In [28]:
obj[['b', 'c', 'd']] # select several entries by passing a list of index labels

b    1.0
c    2.0
d    3.0
dtype: float64

In [30]:
# Select the 2nd and 4th entries by integer position. `.iloc` makes the positional
# intent explicit: plain obj[[1, 3]] on a label-indexed Series relies on a deprecated
# fallback and is interpreted as label lookup by newer pandas versions.
obj.iloc[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [35]:
obj[obj < 2] # retrieves all elements less than 2

a    0.0
b    1.0
dtype: float64

In [37]:
obj['b':'c'] # slicing by label includes the end label, unlike positional slicing

b    1.0
c    2.0
dtype: float64

In [40]:
obj['b':'c'] = 5 # assigns 5 to every entry in the label slice 'b'..'c' (modifies obj in place)
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

## DataFrame Indexing

In [44]:
# 4x4 frame holding the integers 0..15: rows labelled 'a'-'d', columns labelled by year (as strings).
df = DataFrame(
    np.arange(16).reshape(4, 4),
    index=list('abcd'),
    columns=['2010', '2011', '2012', '2013'],
)
df

Unnamed: 0,2010,2011,2012,2013
a,0,1,2,3
b,4,5,6,7
c,8,9,10,11
d,12,13,14,15


In [46]:
df['2011'] # select one column by name; the result is a Series named after the column

a     1
b     5
c     9
d    13
Name: 2011, dtype: int64

In [47]:
df[['2011', '2012']] # a list of column names selects several columns and returns a DataFrame

Unnamed: 0,2011,2012
a,1,2
b,5,6
c,9,10
d,13,14


In [53]:
df[2:4] # an integer slice selects rows by position (here the 3rd and 4th rows)

Unnamed: 0,2010,2011,2012,2013
c,8,9,10,11
d,12,13,14,15


In [55]:
df[ df['2012'] > 8] # boolean Series as the key: keep only rows where column '2012' is greater than 8

Unnamed: 0,2010,2011,2012,2013
c,8,9,10,11
d,12,13,14,15


In [58]:
df < 8 # element-wise comparison against a scalar yields a boolean DataFrame of the same shape

Unnamed: 0,2010,2011,2012,2013
a,True,True,True,True
b,True,True,True,True
c,False,False,False,False
d,False,False,False,False


In [60]:
df[df < 8] = 0 # set every value less than 8 to 0 (modifies df in place)
df

Unnamed: 0,2010,2011,2012,2013
a,0,0,0,0
b,0,0,0,0
c,8,9,10,11
d,12,13,14,15


In [62]:
# Label-based indexer `.loc` selects a subset of rows and columns in one step:
# here row 'c' and the columns '2011' and '2013'.
# (`.ix`, used in older tutorials, was deprecated in pandas 0.20 and removed in 1.0.)
df.loc['c', ['2011', '2013']]

2011     9
2013    11
Name: c, dtype: int64

In [63]:
# Pass lists of row labels and column labels; rows come back in the order requested.
# `.loc` replaces the removed `.ix` indexer for label-based selection.
df.loc[['c', 'b'], ['2011', '2012']]

Unnamed: 0,2011,2012
c,9,10
b,0,0


In [68]:
# Retrieve the 3rd row by integer position. `.iloc` is the positional indexer;
# the removed `.ix` used to guess between labels and positions.
df.iloc[2]

2010     8
2011     9
2012    10
2013    11
Name: c, dtype: int64

In [71]:
# Label slicing on both axes with `.loc` (replacement for the removed `.ix`);
# label slices include their end points, so row 'c' and column '2012' are part of the result.
df.loc[:'c', :'2012']

Unnamed: 0,2010,2011,2012
a,0,0,0
b,0,0,0
c,8,9,10


In [73]:
# Rows where column '2011' is greater than 10, restricted to the first three columns.
# The removed `.ix` allowed mixing a boolean mask with a positional slice; with `.loc`
# the positional part is expressed as labels via df.columns[:3].
df.loc[df['2011'] > 10, df.columns[:3]]

Unnamed: 0,2010,2011,2012
d,12,13,14
