# DataFrames II

In [2]:
import numpy as np

In [3]:
import pandas as pd

In [4]:
from numpy.random import randn

In [5]:
# Seed NumPy's global random state so the randn() draws used below are repeatable.
np.random.seed(101)

In [6]:
# Build a 5x4 frame of random draws: rows labelled 'a'..'e', columns 'w'..'z'.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('abcde'),
    columns=list('wxyz'),
)

In [7]:
# A bare name as the cell's last expression makes the notebook render the frame.
df

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
c,-2.018168,0.740122,0.528813,-0.589001
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


# Conditional Selection
An important feature of pandas is conditional selection using bracket notation, very similar to numpy:

In [8]:
# Let's look at different examples of conditional selection.

In [9]:
# Comparing a column with a scalar yields a boolean Series: True for each row
# where the value in column 'w' is greater than 0, False otherwise.
df['w']>0

a     True
b     True
c    False
d     True
e     True
Name: w, dtype: bool

In [10]:
# Keep the row mask (True where 'w' > 0) in a variable so later cells can reuse it.
bool1 = df['w'].gt(0)

In [11]:
# Indexing with the boolean Series keeps only the rows whose mask value is True,
# so every row where 'w' is not greater than 0 is dropped from the result.
df.loc[bool1]

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,-0.319318,-0.848077,0.605965
d,0.188695,-0.758872,-0.933237,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [24]:
# Pick the filtered rows and the 'w' column in a single .loc call instead of
# chaining two [] lookups; the result is the same Series.
df.loc[bool1, 'w']

a    2.706850
b    0.651118
d    0.188695
e    0.190794
Name: w, dtype: float64

In [26]:
# Same row filter; passing a list of column labels returns a DataFrame holding
# just 'w' and 'x'. One .loc call replaces the chained [] lookups.
df.loc[bool1, ['w', 'x']]

Unnamed: 0,w,x
a,2.70685,0.628133
b,0.651118,-0.319318
d,0.188695,-0.758872
e,0.190794,1.978757


In [28]:
# Store the filtered two-column frame (rows where 'w' > 0, columns 'w' and 'x'),
# selected in one .loc step rather than through chained [] lookups.
acc = df.loc[bool1, ['w', 'x']]

In [30]:
# Label-based row lookup on the already-filtered frame: keep rows 'a' and 'b'.
acc.loc[['a','b']]

Unnamed: 0,w,x
a,2.70685,0.628133
b,0.651118,-0.319318


In [16]:
# Let us try to apply conditional selection on the whole dataframe

In [17]:
# Element-wise comparison over the whole frame: the result is a frame of the
# same shape holding True/False for each cell.
df.gt(0)

Unnamed: 0,w,x,y,z
a,True,True,True,True
b,True,False,False,True
c,False,True,True,False
d,True,False,False,True
e,True,True,True,True


In [18]:
# Keep the values where the condition holds; cells that fail it come back as NaN.
df.where(df > 0)

Unnamed: 0,w,x,y,z
a,2.70685,0.628133,0.907969,0.503826
b,0.651118,,,0.605965
c,,0.740122,0.528813,
d,0.188695,,,0.955057
e,0.190794,1.978757,2.605967,0.683509


In [19]:
# For multiple conditions, combine the boolean Series with '&' (and) or '|' (or),
# and wrap each comparison in its own parentheses. Let's see some examples.

In [None]:
# Rows must satisfy both conditions; '&' combines the two boolean Series element-wise.
df.loc[df['w'].gt(3) & df['y'].gt(0.5)]

In [21]:
# Rows where 'w' > 0.2 and 'y' > 0.5, restricted to the two columns involved.
# A single .loc[rows, columns] call replaces the chained [] lookups.
df.loc[(df['w'] > 0.2) & (df['y'] > 0.5), ['w', 'y']]

Unnamed: 0,w,y
a,2.70685,0.907969


In [22]:
# Let's discuss some more features of indexing, including resetting the index or setting it to something else.
# We'll also talk about index hierarchy!

In [23]:
# reset_index() returns a new frame with a default integer index (0..4) and the
# old labels ('a'..'e') moved into an 'index' column. The result is not assigned,
# so df itself keeps its original index.
df.reset_index()

Unnamed: 0,index,w,x,y,z
0,a,2.70685,0.628133,0.907969,0.503826
1,b,0.651118,-0.319318,-0.848077,0.605965
2,c,-2.018168,0.740122,0.528813,-0.589001
3,d,0.188695,-0.758872,-0.933237,0.955057
4,e,0.190794,1.978757,2.605967,0.683509


In [24]:
# We can even set an entirely new index; start with one label per row.
new_index = ['uk', 'us', 'ussr', 'italy', 'japan']

In [25]:
# Show the list of labels built in the previous cell.
new_index

['uk', 'us', 'ussr', 'italy', 'japan']

In [26]:
# Attach the labels as a regular column. This modifies df in place, so every
# cell executed afterwards sees the extra 'country' column.
df['country']= new_index

In [27]:
# Display df again: it now carries the 'country' column added above.
df

Unnamed: 0,w,x,y,z,country
a,2.70685,0.628133,0.907969,0.503826,uk
b,0.651118,-0.319318,-0.848077,0.605965,us
c,-2.018168,0.740122,0.528813,-0.589001,ussr
d,0.188695,-0.758872,-0.933237,0.955057,italy
e,0.190794,1.978757,2.605967,0.683509,japan


In [28]:
# set_index() returns a new frame that uses the 'country' values as row labels.
# The result is not assigned, so df keeps its 'a'..'e' index and 'country' column.
df.set_index('country')

Unnamed: 0_level_0,w,x,y,z
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
uk,2.70685,0.628133,0.907969,0.503826
us,0.651118,-0.319318,-0.848077,0.605965
ussr,-2.018168,0.740122,0.528813,-0.589001
italy,0.188695,-0.758872,-0.933237,0.955057
japan,0.190794,1.978757,2.605967,0.683509


# Some more operations on DataFrame indexing

In [29]:
# Small demo table: each dict key becomes a column, and the explicit index
# supplies one row label per record.
data = dict(
    name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    year=[2012, 2012, 2013, 2014, 2014],
    reports=[4, 24, 31, 2, 3],
    coverage=[25, 94, 57, 62, 70],
)
df2 = pd.DataFrame(
    data,
    index=['Cochice', 'Pima', 'Santa Cruz', 'Maricopa', 'Yuma'],
)
df2

Unnamed: 0,coverage,name,reports,year
Cochice,25,Jason,4,2012
Pima,94,Molly,24,2012
Santa Cruz,57,Tina,31,2013
Maricopa,62,Jake,2,2014
Yuma,70,Amy,3,2014


In [30]:
# .loc takes row labels first and column labels second; the result follows the
# order of the requested labels ('Yuma' before 'Maricopa').
df2.loc[['Yuma','Maricopa'],['coverage','year']]

Unnamed: 0,coverage,year
Yuma,70,2014
Maricopa,62,2014


In [31]:
# df2 contains a text column ('name'); comparing strings with 15 raises
# TypeError on Python 3, so only the numeric columns are tested against 15.
# Numeric entries that are not greater than 15 become NaN, while non-numeric
# columns pass through untouched.
# Note: despite its name, bool2 holds the masked values, not booleans.
numeric_cols = df2.select_dtypes(include='number').columns
keep_mask = pd.DataFrame(True, index=df2.index, columns=df2.columns)
keep_mask[numeric_cols] = df2[numeric_cols].gt(15)
bool2 = df2.where(keep_mask)

In [32]:
# Display the masked frame: entries that failed the > 15 test show up as NaN.
bool2

Unnamed: 0,coverage,name,reports,year
Cochice,25,Jason,,2012
Pima,94,Molly,24.0,2012
Santa Cruz,57,Tina,31.0,2013
Maricopa,62,Jake,,2014
Yuma,70,Amy,,2014


In [33]:
# Label-based selection works on the masked frame too: two rows, two columns.
bool2.loc[['Pima','Santa Cruz'],['coverage','year']]

Unnamed: 0,coverage,year
Pima,94,2012
Santa Cruz,57,2013


# Practice: important conditional selection

In [35]:
# Display the frame the practice cell below works on.
df

Unnamed: 0,w,x,y,z,country
a,2.70685,0.628133,0.907969,0.503826,uk
b,0.651118,-0.319318,-0.848077,0.605965,us
c,-2.018168,0.740122,0.528813,-0.589001,ussr
d,0.188695,-0.758872,-0.933237,0.955057,italy
e,0.190794,1.978757,2.605967,0.683509,japan


In [42]:
# Practice run: build the row mask for 'w' > 0, filter the frame with it,
# then show only the 'x' and 'z' columns of the rows that remain.
resfst = df['w'].gt(0)
ressec = df.loc[resfst]
cols = ['x', 'z']
ressec[cols]

Unnamed: 0,x,z
a,0.628133,0.503826
b,-0.319318,0.605965
d,-0.758872,0.955057
e,1.978757,0.683509


a     True
b     True
c    False
d     True
e     True
Name: w, dtype: bool