In [1]:
import pandas as pd
import numpy as np

In [None]:
# Load the Titanic dataset from CSV and print summary statistics.
# include='all' makes describe() report on every column, not only numeric ones.
df = pd.read_csv('titanic_dataset.csv')
summary_stats = df.describe(include='all')
print(summary_stats)

In [None]:
# Columns with missing data.
# The boolean NA mask is computed once and reused for every report below.
na_mask = df.isna()
has_missing = na_mask.any()  # per-column flag: True if at least one value is missing
print(has_missing)
print()
# has_missing is already boolean, so it can index the column labels directly
# (no `== True` comparison needed).
print(df.columns[has_missing])

# Fraction of missing values in each column (missing count / number of rows).
print(na_mask.sum()/df.shape[0])

In [None]:
# Report the dataset dimensions as a (rows, columns) tuple.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

In [None]:
# Drop columns with more than 25% missing data.
# thresh is the minimum number of non-missing values a column needs in order
# to be kept, so it is set to 75% of the row count.
min_non_missing = len(df) * 0.75
df.dropna(axis=1, thresh=min_non_missing, inplace=True)
print(df.head(2))

In [None]:
# Drop columns that the analysis treats as unrelated to the survival rate.
# Note: this cell is not re-runnable on the same frame; a second run raises
# KeyError because the columns are already gone.
unused_columns = ['Unnamed: 0', 'name', 'ticket']
df.drop(columns=unused_columns, inplace=True)
print(df.head(2))

In [None]:
# Inspect the dtype of every remaining column.
column_dtypes = df.dtypes
print(column_dtypes)

In [None]:
# Convert the fare column to a numeric dtype by stripping the '$' sign.
# (assumes fare is stored as text such as '$7.25' — confirm against the dtypes printed above)

# method-1: Series.replace with a regular expression. The '$' is escaped
# because an unescaped '$' is the end-of-string anchor in a regex.
c1 = df['fare'].replace(r'\$', '', regex=True)
print(c1.head())

# method-2: the .str accessor with a literal pattern.
# regex=False is passed explicitly: pandas < 2.0 defaulted to regex=True, in
# which case '$' is treated as an anchor and no dollar sign is removed.
# str.replace has no inplace option, so the result must be assigned back.
c2 = df['fare'].str.replace('$', '', regex=False)
print(c2.head())

# Here method-1 is applied. The result is assigned back to the column instead
# of calling replace(..., inplace=True) on df.fare: that form operates on a
# temporary Series and does not update df under pandas Copy-on-Write.
df['fare'] = df['fare'].replace(r'\$', '', regex=True)
print(df.head())

# Finally change the datatype.
print()
df['fare'] = df['fare'].astype('float64')
print(df.dtypes)

In [None]:
# Find the columns that still contain missing values: first a per-column
# True/False flag, then the per-column count of missing entries.
remaining_na = df.isna()
print(remaining_na.any())
print()
print(remaining_na.sum())

In [None]:
# Fill missing values in the fare and age columns with each column's mean.
# The filled Series is assigned back to the column rather than using
# df.fare.fillna(..., inplace=True): the in-place call acts on a temporary
# Series and leaves df unchanged under pandas Copy-on-Write.
df['fare'] = df['fare'].fillna(df['fare'].mean())
df['age'] = df['age'].fillna(df['age'].mean())
print(df.isna().sum())

In [None]:
# Remove the rows whose 'embarked' value is missing, then print the
# per-column missing counts again to confirm the result.
required_columns = ['embarked']
df.dropna(subset=required_columns, inplace=True)
na_counts = df.isna().sum()
print(na_counts)

In [None]:
# Write the cleaned DataFrame to the CSV file 'titanic_filtered.csv'.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column — confirm this is intended.
output_path = 'titanic_filtered.csv'
df.to_csv(output_path)

In [None]:
# For the survived column replace 0 with 'D' and 1 with 'A'
# (presumably Dead / Alive — confirm with the dataset description).
# The recoded Series is assigned back to the column rather than using
# df.survived.replace(..., inplace=True): the in-place call acts on a
# temporary Series and leaves df unchanged under pandas Copy-on-Write.
# Values other than 0 and 1 are left as they are.
df['survived'] = df['survived'].replace({0: 'D', 1: 'A'})
print(df['survived'].head())

In [None]:
# Frequency of each distinct value in the survived column.
survived_counts = df['survived'].value_counts()
print(survived_counts)

In [None]:
# Group by gender and survived and show the counts in each category.
# count() reports the number of non-missing values per remaining column for
# every (gender, survived) combination; it is left as the last expression so
# the notebook renders the resulting table.
group_keys = ['gender', 'survived']
g1 = df.groupby(group_keys)
g1.count()

In [None]:
# List the distinct pclass values, then the number of people in each class.
pclass_values = df['pclass']
print(pclass_values.unique())
print()
print(pclass_values.value_counts())

In [None]:
# Find the 5 people with the highest age values, then count how many of
# those 5 are male and how many are female.
res = df.nlargest(n=5, columns='age')
print(res)
top5_gender_counts = res['gender'].value_counts()
print(top5_gender_counts)

In [None]:
# Find the oldest male and the oldest female among the survivors.
# Relies on the survived column already being recoded so that 'A' marks
# the rows selected here.
survivor_mask = df['survived'] == 'A'
for gender_label in ('male', 'female'):
    # Restrict to survivors of this gender, then print the single row with the largest age.
    res = df[survivor_mask & (df['gender'] == gender_label)]
    print(res.nlargest(1, 'age'))

In [None]:
# Average age for each gender.
mean_age_by_gender = df.groupby('gender')['age'].mean()
print(mean_age_by_gender)

In [None]:
# Average age of people who survived versus those who did not.
mean_age_by_survival = df.groupby('survived')['age'].mean()
print(mean_age_by_survival)