In [77]:
import pandas as pd 
import numpy as np 

# Read data and describe it

In [78]:
titanic = pd.read_csv("../Titanic/titanic_dataset.csv")

In [79]:
titanic.head()

Unnamed: 0.1,Unnamed: 0,pclass,survived,name,gender,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,$211.34,B5,S,2.0,,"St Louis, MO"
1,1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,$151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,$151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,$151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,$151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [80]:

titanic.describe()

Unnamed: 0.1,Unnamed: 0,pclass,survived,age,sibsp,parch,body
count,1309.0,1309.0,1309.0,1046.0,1309.0,1309.0,121.0
mean,654.0,2.294882,0.381971,29.881135,0.498854,0.385027,160.809917
std,378.020061,0.837836,0.486055,14.4135,1.041658,0.86556,97.696922
min,0.0,1.0,0.0,0.1667,0.0,0.0,1.0
25%,327.0,2.0,0.0,21.0,0.0,0.0,72.0
50%,654.0,3.0,0.0,28.0,0.0,0.0,155.0
75%,981.0,3.0,1.0,39.0,1.0,0.0,256.0
max,1308.0,3.0,1.0,80.0,8.0,9.0,328.0


# Find columns with missingdata

In [81]:
titanic.columns[titanic.isna().sum() >0]

Index(['age', 'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'], dtype='object')

# Print shape of dataset

In [82]:
titanic.shape

(1309, 15)

# drop columns with more than 25% missing data

In [83]:
percent_missing = titanic.isna().sum() * 100 /len(titanic)

In [84]:
titanic_columns = titanic.columns[percent_missing > 25]
titanic_columns

Index(['cabin', 'boat', 'body', 'home.dest'], dtype='object')

In [85]:
titanic.drop(titanic_columns, axis=1, inplace=True)

In [86]:

titanic.head()

Unnamed: 0.1,Unnamed: 0,pclass,survived,name,gender,age,sibsp,parch,ticket,fare,embarked
0,0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,$211.34,S
1,1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,$151.55,S
2,2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,$151.55,S
3,3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,$151.55,S
4,4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,$151.55,S


# drop columns having independent values(which do not affect the survival rate)

In [87]:
# columns with missing values is already dropped 
# from the description ticket column has no significance in predecting the servival rate ticket column can be dropped

In [88]:
titanic.drop(["ticket"], axis=1, inplace=True)

# check data types of all columns

In [89]:
titanic.dtypes

Unnamed: 0      int64
pclass          int64
survived        int64
name           object
gender         object
age           float64
sibsp           int64
parch           int64
fare           object
embarked       object
dtype: object

# convert price to numeric

In [90]:
titanic.fare = pd.to_numeric(titanic.fare.replace(to_replace= r'[$]',value="", regex=True))
#titanic.fare = titanic.fare.astype(np.float32)
titanic.dtypes

Unnamed: 0      int64
pclass          int64
survived        int64
name           object
gender         object
age           float64
sibsp           int64
parch           int64
fare          float64
embarked       object
dtype: object

# find columns still having missing/na values and also count of missing data

In [91]:
titanic.isna().sum()

Unnamed: 0      0
pclass          0
survived        0
name            0
gender          0
age           263
sibsp           0
parch           0
fare            1
embarked        2
dtype: int64

# fill na with mean for fare and age column column.

In [92]:
titanic[['age', 'fare']] = titanic[['age', 'fare']].fillna(titanic.mean())

In [93]:
titanic.isna().sum()

Unnamed: 0    0
pclass        0
survived      0
name          0
gender        0
age           0
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

# drop na values for embarked column.

In [94]:
titanic.isna().sum()

Unnamed: 0    0
pclass        0
survived      0
name          0
gender        0
age           0
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [95]:
titanic[titanic.embarked.isna()]

Unnamed: 0.1,Unnamed: 0,pclass,survived,name,gender,age,sibsp,parch,fare,embarked
168,168,1,1,"Icard, Miss. Amelie",female,38.0,0,0,80.0,
284,284,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,80.0,


In [96]:
titanic.drop(labels=titanic[titanic.embarked.isna()].index, inplace=True)

In [97]:
titanic[titanic.embarked.isna()]

Unnamed: 0.1,Unnamed: 0,pclass,survived,name,gender,age,sibsp,parch,fare,embarked


In [98]:
titanic.isna().sum()

Unnamed: 0    0
pclass        0
survived      0
name          0
gender        0
age           0
sibsp         0
parch         0
fare          0
embarked      0
dtype: int64

# dump the dataframe to a csv file 'titanic_filtered.csv'

In [99]:
titanic.to_csv('titanic_filtered.csv')

# for surviced column replace 0 with D and 1 with A

In [100]:
titanic.survived.replace('0', 'D', inplace=True)
titanic.survived.replace('1', 'A', inplace=True)
titanic.head()

Unnamed: 0.1,Unnamed: 0,pclass,survived,name,gender,age,sibsp,parch,fare,embarked
0,0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,211.34,S
1,1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,151.55,S
2,2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,151.55,S
3,3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,151.55,S
4,4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,151.55,S


# find the frequency of different values in survived column

In [101]:
titanic.survived.value_counts()

0    809
1    498
Name: survived, dtype: int64

# group by gender and survived and see the counts in each category

In [102]:
titanic.groupby(['gender', 'survived']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 0,pclass,name,age,sibsp,parch,fare,embarked
gender,survived,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
female,0,127,127,127,127,127,127,127,127
female,1,337,337,337,337,337,337,337,337
male,0,682,682,682,682,682,682,682,682
male,1,161,161,161,161,161,161,161,161


In [103]:
# female survival rate is more

# find different pclass and no of people in each class

In [104]:
titanic.pclass.value_counts()

3    709
1    321
2    277
Name: pclass, dtype: int64

# find top 5 people with highest values of age. Count no of male and females in the top 5

In [105]:
df = titanic.sort_values(by='age',ascending=False).head(5).groupby(['gender']).count()

In [106]:
df

Unnamed: 0_level_0,Unnamed: 0,pclass,survived,name,age,sibsp,parch,fare,embarked
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
female,1,1,1,1,1,1,1,1,1
male,4,4,4,4,4,4,4,4,4


# find max age male and female who survived

In [121]:
df1 = titanic.sort_values(by='age',ascending=False).head(5).groupby(['gender']).max()
df1

Unnamed: 0_level_0,Unnamed: 0,pclass,survived,name,age,sibsp,parch,fare,embarked
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
female,61,1,1,"Cavendish, Mrs. Tyrell William (Julia Florence...",76.0,1,0,78.85,S
male,1235,3,1,"Svensson, Mr. Johan",80.0,0,0,49.5,S


In [130]:
print(f"female condidate is {df1[df1.index == 'female']['name']}")

female condidate is gender
female    Cavendish, Mrs. Tyrell William (Julia Florence...
Name: name, dtype: object


In [131]:
print(f"male condidate is {df1[df1.index == 'male']['name']}")

male condidate is gender
male    Svensson, Mr. Johan
Name: name, dtype: object


# get average age by gender

In [133]:
titanic.groupby('gender')['age'].mean()

gender
female    28.795931
male      30.430716
Name: age, dtype: float64

# get average age by people survived vs not-survived

In [134]:
titanic.groupby('survived')['age'].mean()

survived
0    30.389368
1    28.974711
Name: age, dtype: float64