In [1]:
#importing libraries
import pandas as pd
import numpy as np

# Load the training data. pd.read_csv already returns a DataFrame, so it is
# used directly instead of being wrapped in another pd.DataFrame(...) call.
DATA_PATH = '/content/train (1).csv'
df = pd.read_csv(DATA_PATH)

# Shape of the dataframe as (rows, columns). This is the last expression of
# the cell, so it is the only value the notebook displays.
df.shape

(891, 12)

In [2]:
# Count the missing values in every column
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Find the columns in which more than 50% of the rows are null.
# The per-column null counts are computed once and then filtered.
null_counts = df.isnull().sum()
half_of_rows = 50 / 100 * df.shape[0]
drop_col = null_counts[null_counts > half_of_rows]
drop_col

Cabin    687
dtype: int64

In [7]:
# Columns that are mostly null carry little information, so drop them and
# re-check the null counts of the remaining columns.
# errors='ignore' keeps this cell re-runnable: an in-place drop raises
# KeyError on a second run because the columns are already gone.
df = df.drop(columns=drop_col.index, errors='ignore')
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

In [8]:
# Cabin has been dropped. Fill the remaining nulls in numeric columns with the
# column mean. Embarked is a string column, so it has no mean and stays
# unfilled here.
# numeric_only=True is required: on pandas >= 2.0, df.mean() raises a
# TypeError when the frame contains string columns (Name, Sex, Ticket, ...).
numeric_means = df.mean(numeric_only=True)
df = df.fillna(numeric_means)
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [9]:
# Summary of the Embarked column: non-null count, distinct values, most frequent value and its frequency
embarked = df['Embarked']
embarked.describe()

count     889
unique      3
top         S
freq      644
Name: Embarked, dtype: object

In [12]:
# Fill the missing Embarked values with the most frequent port.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on df['Embarked'] is chained assignment, which is deprecated and does not
# modify df when pandas Copy-on-Write is enabled.
most_frequent_port = df['Embarked'].mode().iloc[0]  # 'S' for this dataset
df['Embarked'] = df['Embarked'].fillna(most_frequent_port)
df.isnull().sum()  # now all the null values have been filled

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [13]:
# Pairwise Pearson correlation between the numeric columns; values range from -1 to 1.
# numeric_only=True is required: on pandas >= 2.0, df.corr() raises a
# ValueError when the frame contains string columns.
df.corr(numeric_only=True)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.033207,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.069809,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.331339,0.083081,0.018443,-0.5495
Age,0.033207,-0.069809,-0.331339,1.0,-0.232625,-0.179191,0.091566
SibSp,-0.057527,-0.035322,0.083081,-0.232625,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.179191,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.091566,0.159651,0.216225,1.0


In [14]:
# Fare and Pclass show a fairly strong negative correlation (about -0.55):
# lower Pclass numbers go together with higher fares.

# Combine the two family-count columns into a single FamilySize feature,
# then drop the originals.
df['FamilySize'] = df['SibSp'] + df['Parch']
df = df.drop(columns=['SibSp', 'Parch'])
# numeric_only=True avoids the ValueError pandas >= 2.0 raises on string columns.
# FamilySize vs Survived is only about 0.017, i.e. almost no linear relationship.
df.corr(numeric_only=True)

Unnamed: 0,PassengerId,Survived,Pclass,Age,Fare,FamilySize
PassengerId,1.0,-0.005007,-0.035144,0.033207,0.012658,-0.040143
Survived,-0.005007,1.0,-0.338481,-0.069809,0.257307,0.016639
Pclass,-0.035144,-0.338481,1.0,-0.331339,-0.5495,0.065997
Age,0.033207,-0.069809,-0.331339,1.0,0.091566,-0.248512
Fare,0.012658,0.257307,-0.5495,0.091566,1.0,0.217138
FamilySize,-0.040143,0.016639,0.065997,-0.248512,0.217138,1.0


In [15]:
# Flag passengers travelling without family, to compare survival chances:
# Alone is 0 when FamilySize is greater than 0, otherwise 1.
# np.where does this in one vectorized pass instead of a Python loop with
# per-row label lookups.
df['Alone'] = np.where(df['FamilySize'] > 0, 0, 1)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Embarked,FamilySize,Alone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,A/5 21171,7.25,S,1,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,PC 17599,71.2833,C,1,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,STON/O2. 3101282,7.925,S,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,113803,53.1,S,1,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,373450,8.05,S,0,1


In [16]:
# Mean survival rate per Alone group; the rate is lower for passengers travelling alone (Alone == 1)
df['Survived'].groupby(df['Alone']).mean()

Alone
0    0.505650
1    0.303538
Name: Survived, dtype: float64

In [17]:
#fare if a person is alone
df[['Alone', 'Fare']].corr() #we see that if the person is not alone, chances of ticket prices are higher

Unnamed: 0,Alone,Fare
Alone,1.0,-0.271832
Fare,-0.271832,1.0


In [18]:
# Encode Sex numerically: 0 for male, 1 for everything else (female).
# Matching on both 'male' and 0 keeps the cell idempotent: re-running it on the
# already-encoded column leaves the values unchanged instead of turning every
# row into 1.
df['Sex'] = np.where(df['Sex'].isin(['male', 0]), 0, 1)
df.groupby(['Sex'])['Survived'].mean()  # survival rate is much higher for female passengers (Sex == 1)

Sex
0    0.188908
1    0.742038
Name: Survived, dtype: float64

In [19]:
# Mean survival rate per port of embarkation; passengers from Cherbourg (C) have the highest rate
df.groupby('Embarked')['Survived'].mean()

Embarked
C    0.553571
Q    0.389610
S    0.339009
Name: Survived, dtype: float64

CONCLUSION

1. Passengers in a higher class (wealthier passengers) had a higher survival rate than the others, which suggests that the class hierarchy was followed while saving the passengers.
2. Passengers travelling with their family had a higher survival rate than those travelling alone.
3. Passengers travelling alone tended to pay lower fares.
4. Female passengers were prioritized while saving.
5. Passengers who boarded the ship at Cherbourg survived more in proportion than the others.