# 6.26 - Pandas DataFrames, part I

In [1]:
import numpy as np
import pandas as pd

In [2]:
from numpy.random import randn

In [20]:
np.random.seed(101) # fix the random seed so the 'random' numbers are reproducible and match the video

### Creating DataFrames

In [21]:
# 5x4 frame of random draws: rows labelled a-e, columns labelled w-z
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('abcde'),
    columns=list('wxyz'),
)

In [22]:
df # nuhhh

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


#### Selecting columns

In [23]:
df['w'] # selecting a single column by name returns a Series

a    2.706850
b    0.651118
c   -2.018168
d    0.188695
e    0.190794
Name: w, dtype: float64

In [24]:
type(df['w']) # confirms that a single column is a pandas Series

pandas.core.series.Series

In [25]:
type(df)

pandas.core.frame.DataFrame

In [26]:
df.w # attribute-style access; returns the same Series as df['w'] (only works when the column name is a valid identifier that does not clash with a DataFrame attribute)

a    2.706850
b    0.651118
c   -2.018168
d    0.188695
e    0.190794
Name: w, dtype: float64

In [27]:
df[['w','z']] # a list of column names selects several columns and returns a DataFrame

Unnamed: 0,w,z
a,2.70685,0.503826
b,0.651118,0.605965
c,-2.018168,-0.589001
d,0.188695,0.955057
e,0.190794,0.683509


### Adding and dropping columns

Adding a column: assign to a column name that does not exist yet.

In [41]:
df['new'] = df['w'] + df['y'] # assigning to an unused column name creates it; values are the row-wise sum of w and y

In [42]:
df # oh dang we just made something!

Unnamed: 0,w,x,y,z,new
a,2.70685,0.628133,0.907969,0.503826,3.614819
b,0.651118,-0.319318,-0.848077,0.605965,-0.196959
c,-2.018168,0.740122,0.528813,-0.589001,-1.489355
d,0.188695,-0.758872,-0.933237,0.955057,-0.744542
e,0.190794,1.978757,2.605967,0.683509,2.796762


Dropping a column returns a new DataFrame; the original is left unchanged.

In [43]:
# axis must be passed by keyword: positional `axis` was deprecated in pandas 1.3
# and is rejected (TypeError) by pandas 2.x.
# axis=1 targets columns (the default axis=0 targets rows); a new DataFrame is returned.
df.drop('new', axis=1)

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [44]:
df # but wait! we didn't change the original

Unnamed: 0,w,x,y,z,new
a,2.70685,0.628133,0.907969,0.503826,3.614819
b,0.651118,-0.319318,-0.848077,0.605965,-0.196959
c,-2.018168,0.740122,0.528813,-0.589001,-1.489355
d,0.188695,-0.758872,-0.933237,0.955057,-0.744542
e,0.190794,1.978757,2.605967,0.683509,2.796762


To modify the original, use the `inplace=True` option (or reassign the result: `df = df.drop('new', axis=1)`).

In [45]:
df.drop('new',axis=1,inplace=True) # inplace=True mutates df itself instead of returning a new frame; re-running this cell raises KeyError because 'new' is already gone

In [46]:
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [48]:
df.drop('e') # drops the ROW labelled 'e' (default axis=0); returns a new frame, df itself is unchanged

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057


In [49]:
df.shape

(5, 4)

#### Selecting rows

In [51]:
df.loc['a'] # rows are also series

w    2.706850
x    0.628133
y    0.907969
z    0.503826
Name: a, dtype: float64

In [54]:
df.iloc[2] # integer-position lookup: the third row, i.e. the one labelled 'c'

w   -2.018168
x    0.740122
y    0.528813
z   -0.589001
Name: c, dtype: float64

### Selecting subsets

In [55]:
df.loc['b','y']

-0.84807698340363147

In [56]:
df.loc['a':'b','x':'y'] # label slices on rows and columns, similar to matrix selection; both endpoints are included

Unnamed: 0,x,y
a,0.628133,0.907969
b,-0.319318,-0.848077


In [58]:
df.loc[['a','b'],['x','y']] # individual row-column selection

Unnamed: 0,x,y
a,0.628133,0.907969
b,-0.319318,-0.848077


# 6.27 - Pandas DataFrames, part II

### Conditional selection

In [60]:
booldf = df > 0
booldf

Unnamed: 0,w,x,y,z
a,True,True,True,True
b,True,False,False,True
c,False,True,True,False
d,True,False,False,True
e,True,True,True,True


In [63]:
df[booldf] # also `df[df>0]`; cells where the mask is False come back as NaN

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,,,0.605965
c,,0.740122,0.528813,
d,0.188695,,,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [64]:
df['w']>0

a     True
b     True
c    False
d     True
e     True
Name: w, dtype: bool

In [66]:
df[df['w']>0] # a boolean Series acts as a row filter: only the rows where w > 0 are kept (row 'c' is dropped)

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [67]:
df[df['z']<0]

Unnamed: 0,w,x,y,z
c,-2.018168,0.740122,0.528813,-0.589001


In [71]:
resultdf = df[df['w']>0] # step 1: keep the rows where w > 0
resultdf['x'] # step 2: take column x from that filtered frame

a    0.628133
b   -0.319318
d   -0.758872
e    1.978757
Name: x, dtype: float64

In [74]:
# Row filter and column selection in one .loc call instead of chained indexing.
# For several columns put a list (['x','y']) or a label slice ('x':'z') in the column slot.
df.loc[df['w']>0, 'x']

a    0.628133
b   -0.319318
d   -0.758872
e    1.978757
Name: x, dtype: float64

### Selection using multiple conditions

In [76]:
df[(df['w']>0) & (df['y']>1)] # element-wise AND: use a single `&`, not `and`, and keep each comparison in its own parentheses

Unnamed: 0,w,x,y,z
e,0.190794,1.978757,2.605967,0.683509


In [77]:
df[(df['w']>0) | (df['y']>1)] # pipe operator for `or`

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


### Changing the index

In [79]:
df.reset_index() # moves the current index into an 'index' column and numbers the rows 0..4; returns a new frame (reassign it, or specify `inplace=True`, to change the original dataset)

Unnamed: 0,index,w,x,y,z
0,a,2.70685,0.628133,0.907969,0.503826
1,b,0.651118,-0.319318,-0.848077,0.605965
2,c,-2.018168,0.740122,0.528813,-0.589001
3,d,0.188695,-0.758872,-0.933237,0.955057
4,e,0.190794,1.978757,2.605967,0.683509


In [83]:
# One state code per row of df, in row order
newind = ['CO', 'ME', 'NH', 'MT', 'NY']

In [84]:
df['States'] = newind

In [85]:
df.set_index('States') # use the 'States' column as the row index; the result is only displayed here, not assigned back to df

Unnamed: 0_level_0,w,x,y,z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CO,2.70685,0.628133,0.907969,0.503826
ME,0.651118,-0.319318,-0.848077,0.605965
NH,-2.018168,0.740122,0.528813,-0.589001
MT,0.188695,-0.758872,-0.933237,0.955057
NY,0.190794,1.978757,2.605967,0.683509


# 6.28 - Pandas DataFrames, part III

### Creating MultiIndex and index hierarchies

In [91]:
# Index levels: outer group label and inner number, paired position by position
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
# Build the (outer, inner) tuples and the MultiIndex in one expression so that
# hier_index is only ever bound to a MultiIndex.
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))

In [102]:
# 6x2 frame of random draws indexed by the two-level hier_index
# NOTE(review): every run of this cell draws fresh values; the saved outputs below
# come from different runs (execution counts are out of order), so their numbers differ.
dfh = pd.DataFrame(data=randn(6, 2), index=hier_index, columns=['a', 'b'])

In [101]:
dfh # hnngggh

Unnamed: 0,Unnamed: 1,a,b
G1,1,0.302665,1.693723
G1,2,-1.706086,-1.159119
G1,3,-0.134841,0.390528
G2,1,0.166905,0.184502
G2,2,0.807706,0.07296
G2,3,0.638787,0.329646


In [100]:
dfh.loc['G2'].loc[2] # two steps: select the outer group 'G2', then the inner label 2 within it

a    0.807706
b    0.072960
Name: 2, dtype: float64

In [104]:
dfh.index.names = ['Groups','Num'] # name the two index levels (outer, inner); this modifies dfh in place
dfh

Unnamed: 0_level_0,Unnamed: 1_level_0,a,b
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,-0.497104,-0.75407
G1,2,-0.943406,0.484752
G1,3,-0.116773,1.901755
G2,1,0.238127,1.996652
G2,2,-0.993263,0.1968
G2,3,-1.136645,0.000366


In [105]:
# Single .loc call instead of three chained lookups:
# the row key is the (outer, inner) tuple, the column key is the label.
dfh.loc[('G2', 2), 'b']

0.19679950499134005

In [106]:
# Single .loc call instead of three chained lookups:
# the row key is the (outer, inner) tuple, the column key is the label.
dfh.loc[('G1', 3), 'a']

-0.11677331646707445

### Cross-sections

In [111]:
dfh.xs(1,level='Num') # cross-section: the rows whose inner 'Num' level equals 1, one per outer group

Unnamed: 0_level_0,a,b
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,-0.497104,-0.75407
G2,0.238127,1.996652
