# Pandas Data Frames Part 2

### Conditional Selection

In [1]:
import numpy as np
import pandas as pd

In [2]:
from numpy.random import randn

In [3]:
np.random.seed(101)

In [4]:
# Create a Pandas DataFrame with 5 rows and 4 columns:
# Rows must be A,B,C,D,E and columns W,X,Y,Z
# The elements must be generated from samples of a random normal distribution function
df = pd.DataFrame(randn(5,4), ['A','B','C','D','E'], ['W','X','Y','Z'])

In [5]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [6]:
boolean_data_frame = df > 0
boolean_data_frame

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [7]:
df[boolean_data_frame]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [8]:
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [9]:
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [12]:
df[df['W'] > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [21]:
result_df = df[df['Z'] > 0]
result_df['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [26]:
df[df['Z'] > 0]['X'] # This might be intimidating for beginners

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [28]:
df[df['Z'] > 0][['Y','X']] # This might be intimidating for beginners

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


In [31]:
boolean_series = df['W'] > 0
boolean_series

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [33]:
result = df[boolean_series]
result

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [35]:
result[['Y','X']]

Unnamed: 0,Y,X
A,0.907969,0.628133
B,-0.848077,-0.319318
D,-0.933237,-0.758872
E,2.605967,1.978757


### Multiple conditional selection

In [36]:
df[ df['W'] > 0 ]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [39]:
# To compare series use "&" instead of "and" keyword
df[ (df['W'] > 0) & (df['Y'] > 0) ]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
E,0.190794,1.978757,2.605967,0.683509


### Resetting the index or setting it to something else

In our original data frame we had 5 rows and 4 columns:

In [41]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [43]:
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [45]:
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [47]:
df.reset_index(inplace=True)

In [49]:
df

Unnamed: 0,level_0,index,W,X,Y,Z
0,0,A,2.70685,0.628133,0.907969,0.503826
1,1,B,0.651118,-0.319318,-0.848077,0.605965
2,2,C,-2.018168,0.740122,0.528813,-0.589001
3,3,D,0.188695,-0.758872,-0.933237,0.955057
4,4,E,0.190794,1.978757,2.605967,0.683509


### Creating new custom indexes

In [53]:
new_indexes = 'CA NY WY OR CO'.split() # this is a nice way to create a list, instead of having to type in every comma.

In [54]:
new_indexes

['CA', 'NY', 'WY', 'OR', 'CO']

In [56]:
df['States'] = new_indexes
df

Unnamed: 0,level_0,index,W,X,Y,Z,States
0,0,A,2.70685,0.628133,0.907969,0.503826,CA
1,1,B,0.651118,-0.319318,-0.848077,0.605965,NY
2,2,C,-2.018168,0.740122,0.528813,-0.589001,WY
3,3,D,0.188695,-0.758872,-0.933237,0.955057,OR
4,4,E,0.190794,1.978757,2.605967,0.683509,CO


In [57]:
df.set_index('States')

Unnamed: 0_level_0,level_0,index,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
CA,0,A,2.70685,0.628133,0.907969,0.503826
NY,1,B,0.651118,-0.319318,-0.848077,0.605965
WY,2,C,-2.018168,0.740122,0.528813,-0.589001
OR,3,D,0.188695,-0.758872,-0.933237,0.955057
CO,4,E,0.190794,1.978757,2.605967,0.683509


### References

* [Pandas Data Frames][1]

[1]:https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html#pandas.DataFrame