# Filtering DataFrames in Pandas

In [2]:
import pandas as pd

In [3]:
# Load the Titanic passenger data from a CSV file in the working directory.
titanic = pd.read_csv(filepath_or_buffer="titanic.csv")

In [4]:
# Display the loaded DataFrame (the notebook renders a truncated preview).
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.2500,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,S,
3,1,1,female,35.0,1,0,53.1000,S,C
4,0,3,male,35.0,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,
887,1,1,female,19.0,0,0,30.0000,S,B
888,0,3,female,,1,2,23.4500,S,
889,1,1,male,26.0,0,0,30.0000,C,C


In [10]:
# Boolean Series: True for every row whose `sex` column equals 'male'.
titanic['sex'].eq('male')

0       True
1      False
2      False
3      False
4       True
       ...  
886     True
887    False
888    False
889     True
890     True
Name: sex, Length: 891, dtype: bool

# Filtering DataFrames with One Condition

In [27]:
# Keep only the rows for male passengers by indexing with a boolean mask.
titanic.loc[titanic['sex'] == 'male']

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.2500,S,
4,0,3,male,35.0,0,0,8.0500,S,
5,0,3,male,,0,0,8.4583,Q,
6,0,1,male,54.0,0,0,51.8625,S,E
7,0,3,male,2.0,3,1,21.0750,S,
...,...,...,...,...,...,...,...,...,...
883,0,2,male,28.0,0,0,10.5000,S,
884,0,3,male,25.0,0,0,7.0500,S,
886,0,2,male,27.0,0,0,13.0000,S,
889,1,1,male,26.0,0,0,30.0000,C,C


In [26]:
# Select the fare and age columns for female passengers only.
# A single .loc[rows, columns] call replaces the chained
# .loc[rows][columns] lookup, so no intermediate frame is built.
titanic.loc[titanic.sex == 'female', ['fare', 'age']]

Unnamed: 0,fare,age
1,71.2833,38.0
2,7.9250,26.0
3,53.1000,35.0
8,11.1333,27.0
9,30.0708,14.0
...,...,...
880,26.0000,25.0
882,10.5167,22.0
885,29.1250,39.0
887,30.0000,19.0


# Filtering DataFrames with Multiple Conditions (AND)

# Scenario: hypothesis analysis of the survival rate of adult males (age > 14)

In [28]:
# Re-display the dataset for reference before building the masks.
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.2500,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,S,
3,1,1,female,35.0,1,0,53.1000,S,C
4,0,3,male,35.0,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,
887,1,1,female,19.0,0,0,30.0000,S,B
888,0,3,female,,1,2,23.4500,S,
889,1,1,male,26.0,0,0,30.0000,C,C


In [30]:
# Boolean mask: True where the passenger is male.
mask1 = titanic['sex'].eq('male')

In [33]:
# Boolean mask: True where age is strictly greater than 14.
# Passengers aged exactly 14 are excluded, and rows with a missing age
# compare as False, so they are excluded as well.
mask2 = titanic['age'].gt(14)

In [35]:
# Preview the first five values of the combined mask (male AND age > 14).
(mask1 & mask2).head(5)

0     True
1    False
2    False
3    False
4     True
dtype: bool

In [39]:
# Rows: males older than 14. Columns: the four fields used in the analysis.
male_surv = titanic.loc[
    mask1 & mask2,
    ["survived", "pclass", "sex", "age"],
]

In [40]:
# All male passengers older than 14 (survived, pclass, sex, age columns only).
male_surv

Unnamed: 0,survived,pclass,sex,age
0,0,3,male,22.0
4,0,3,male,35.0
6,0,1,male,54.0
12,0,3,male,20.0
13,0,3,male,39.0
...,...,...,...,...
883,0,2,male,28.0
884,0,3,male,25.0
886,0,2,male,27.0
889,1,1,male,26.0


In [43]:
# Summary statistics for males older than 14.
# The mean of `survived` (0.173913) means about 17.4% of them survived.
male_surv.describe()

Unnamed: 0,survived,pclass,age
count,414.0,414.0,414.0
mean,0.173913,2.309179,33.129227
std,0.379493,0.829868,12.922177
min,0.0,1.0,15.0
25%,0.0,2.0,23.0
50%,0.0,3.0,30.0
75%,0.0,3.0,40.0
max,1.0,3.0,80.0


In [44]:
# For comparison: across all 891 passengers (men, women and children),
# the mean of `survived` is about 38%.
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


# Filtering DataFrames with Multiple Conditions (OR)

# Scenario: hypothesis analysis of the survival rate of females and children (age < 14)

In [5]:
# Re-display the dataset for reference before building the masks.
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.2500,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,S,
3,1,1,female,35.0,1,0,53.1000,S,C
4,0,3,male,35.0,0,0,8.0500,S,
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,
887,1,1,female,19.0,0,0,30.0000,S,B
888,0,3,female,,1,2,23.4500,S,
889,1,1,male,26.0,0,0,30.0000,C,C


In [6]:
# Boolean mask: True where the passenger is female.
female_passengers = titanic['sex'].eq('female')

In [7]:
# Inspect the mask: True where sex == 'female'.
female_passengers

0      False
1       True
2       True
3       True
4      False
       ...  
886    False
887     True
888     True
889    False
890    False
Name: sex, Length: 891, dtype: bool

In [8]:
# Boolean mask: True where age is strictly less than 14.
# Passengers aged exactly 14 are not counted as children, and rows with a
# missing age compare as False.
children_passengers = titanic['age'].lt(14)

In [9]:
# Inspect the mask: True where age < 14.
children_passengers

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: age, Length: 891, dtype: bool

In [10]:
# Preview the first five values of the combined mask (female OR age < 14).
(female_passengers | children_passengers).head(5)

0    False
1     True
2     True
3     True
4    False
dtype: bool

In [11]:
# Rows: passengers who are female OR younger than 14.
# Columns: the four fields used in the analysis.
women_or_child = titanic.loc[
    female_passengers | children_passengers,
    ['survived', 'pclass', 'sex', 'age'],
]

In [12]:
# Display the filtered frame of women and children.
women_or_child

Unnamed: 0,survived,pclass,sex,age
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
7,0,3,male,2.0
8,1,3,female,27.0
...,...,...,...,...
880,1,2,female,25.0
882,0,3,female,22.0
885,0,3,female,39.0
887,1,1,female,19.0


In [13]:
# Summary statistics for passengers who are female or younger than 14.
# The mean of `survived` is about 72%, far above the rate for all passengers.
# The `age` count (298) is lower than the row count (351) because rows with a
# missing age are still selected when the passenger is female.
women_or_child.describe()

Unnamed: 0,survived,pclass,age
count,351.0,351.0,298.0
mean,0.723647,2.205128,25.039161
std,0.447832,0.847232,15.314631
min,0.0,1.0,0.42
25%,0.0,1.0,14.125
50%,1.0,2.0,24.0
75%,1.0,3.0,35.0
max,1.0,3.0,63.0


In [14]:
# For comparison: the mean of `survived` across all passengers is about 38%.
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292
