In [1]:
#reading titanic dataset 
import pandas as pd
import numpy as np
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
#size info
df.shape

(891, 12)

In [3]:
#details
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# check for duplicates in the dataset
df.duplicated().sum()

0

In [5]:
# check if there are null datas in the dataset
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

detected null for Age,Cabin, and Embarked data 

In [6]:
#check for negative age
any(df['Age']<0)

False

In [7]:
# check the number of unique values in each column
df.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [8]:
#Imputing missing values 

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
imp_S = SimpleImputer(strategy='most_frequent')
imp_I = IterativeImputer(max_iter=10, random_state=0,min_value=0,max_value=100)
df[['Survived','Pclass','Age','SibSp','Parch','Fare']]=imp_I.fit_transform(df[['Survived','Pclass','Age','SibSp','Parch','Fare']])
df[['Cabin','Embarked']]=imp_S.fit_transform(df[['Cabin','Embarked']])

In [9]:
# check if there are null datas in the dataset
df.isnull().sum()


PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [10]:
#Classifing Age to 5 different catagories

from sklearn.preprocessing import FunctionTransformer
bins = [-1, 1, 13, 20, 60, np.inf]
labels = ['infant', 'kid', 'teen', 'adult', 'senior citizen']
age_trans = FunctionTransformer(pd.cut, kw_args={'bins': bins, 'labels': labels, 'retbins': False})
df['Age']=age_trans.fit_transform(df['Age'])

In [11]:
# testing catagories
df['Age'].unique()
#df.isnull().sum()

['adult', 'kid', 'teen', 'senior citizen', 'infant']
Categories (5, object): ['infant' < 'kid' < 'teen' < 'adult' < 'senior citizen']

In [12]:
df.dtypes


PassengerId       int64
Survived        float64
Pclass          float64
Name             object
Sex              object
Age            category
SibSp           float64
Parch           float64
Ticket           object
Fare            float64
Cabin            object
Embarked         object
dtype: object

In [13]:
#Converting object variables to catagorical
df["Survived"]=df["Survived"].astype("category")
df["Pclass"]=df["Pclass"].astype("category")
df["Sex"]=df["Sex"].astype("category")
df["Embarked"]=df["Embarked"].astype("category")
df.dtypes

PassengerId       int64
Survived       category
Pclass         category
Name             object
Sex            category
Age            category
SibSp           float64
Parch           float64
Ticket           object
Fare            float64
Cabin            object
Embarked       category
dtype: object

A new variable Ticket_type can be engineered (since Ticket is not unique for each passengers,multiple passengers share same ticket)

In [14]:
#groupng based on Ticket
Group_ticket=pd.DataFrame(df.groupby('Ticket')['Ticket'].count().reset_index(name='person_per_Ticket'))
Group_ticket['Ticket_type']=Group_ticket['person_per_Ticket'].apply(lambda x: 'multi_person' if x>1 else 'single_person')
Group_ticket

Unnamed: 0,Ticket,person_per_Ticket,Ticket_type
0,110152,3,multi_person
1,110413,3,multi_person
2,110465,2,multi_person
3,110564,1,single_person
4,110813,1,single_person
...,...,...,...
676,W./C. 6608,4,multi_person
677,W./C. 6609,1,single_person
678,W.E.P. 5734,1,single_person
679,W/C 14208,1,single_person


In [15]:
df=df.merge(Group_ticket, how='left', left_on='Ticket', right_on='Ticket')

In [16]:
#df['Ticket_type']=0
#for i in range(0,891):
 #   for j in range(0,681):
  #      if(df['Ticket'][i]==g['Ticket'][j]):
   #         df['Ticket_type'][i]=g['Ticket_type'][j]

In [17]:
# new variables person_per_Ticket and Ticket_type are added
df.tail(20)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,person_per_Ticket,Ticket_type
871,872,1.0,1.0,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,adult,1.0,1.0,11751,52.5542,D35,S,2,multi_person
872,873,0.0,1.0,"Carlsson, Mr. Frans Olof",male,adult,0.0,0.0,695,5.0,B51 B53 B55,S,1,single_person
873,874,0.0,3.0,"Vander Cruyssen, Mr. Victor",male,adult,0.0,0.0,345765,9.0,B96 B98,S,1,single_person
874,875,1.0,2.0,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,adult,1.0,0.0,P/PP 3381,24.0,B96 B98,C,2,multi_person
875,876,1.0,3.0,"Najib, Miss. Adele Kiamie ""Jane""",female,teen,0.0,0.0,2667,7.225,B96 B98,C,1,single_person
876,877,0.0,3.0,"Gustafsson, Mr. Alfred Ossian",male,teen,0.0,0.0,7534,9.8458,B96 B98,S,2,multi_person
877,878,0.0,3.0,"Petroff, Mr. Nedelio",male,teen,0.0,0.0,349212,7.8958,B96 B98,S,1,single_person
878,879,0.0,3.0,"Laleff, Mr. Kristo",male,adult,0.0,0.0,349217,7.8958,B96 B98,S,1,single_person
879,880,1.0,1.0,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,adult,0.0,1.0,11767,83.1583,C50,C,2,multi_person
880,881,1.0,2.0,"Shelley, Mrs. William (Imanita Parrish Hall)",female,adult,0.0,1.0,230433,26.0,B96 B98,S,2,multi_person


In [18]:
df.shape

(891, 14)