In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()


# Load the Data


In [2]:
traindata=pd.read_csv('train.csv')
traindata.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
traindata.shape

(891, 12)

In [4]:
traindata.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [5]:
data_cleaned = traindata.copy()
data_cleaned['Embarked'] = data_cleaned['Embarked'].fillna(value = traindata['Embarked'].mode()[0])
data_cleaned['Age'] = data_cleaned['Age'].fillna(value = data_cleaned['Age'].mean().round())
data_cleaned['Sex']=data_cleaned['Sex'].map({'male':0,'female':1}) 
data_cleaned['Pclass']=data_cleaned['Pclass'].map({1:0,2:0,3:1}) 
data_cleaned=data_cleaned.drop({'PassengerId','Name','Ticket','Cabin'},axis=1)
data_cleaned.head(20)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,1,0,22.0,1,0,7.25,S
1,1,0,1,38.0,1,0,71.2833,C
2,1,1,1,26.0,0,0,7.925,S
3,1,0,1,35.0,1,0,53.1,S
4,0,1,0,35.0,0,0,8.05,S
5,0,1,0,30.0,0,0,8.4583,Q
6,0,0,0,54.0,0,0,51.8625,S
7,0,1,0,2.0,3,1,21.075,S
8,1,1,1,27.0,0,2,11.1333,S
9,1,0,1,14.0,1,0,30.0708,C


In [6]:
data_cleaned.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

# Dummies variable for Embarked

In [7]:
data_cleaned.columns.values

array(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'], dtype=object)

In [8]:
pd.unique(data_cleaned['Embarked'])

array(['S', 'C', 'Q'], dtype=object)

In [9]:
embarked_columns=(pd.get_dummies(data_cleaned['Embarked']))
embarked_columns['check']=embarked_columns.sum(axis=1)
embarked_columns.sum()

C        168
Q         77
S        646
check    891
dtype: int64

In [10]:
embarked_columns=pd.get_dummies(data_cleaned['Embarked'],drop_first=True)
embarked_columns

Unnamed: 0,Q,S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1
...,...,...
886,0,1
887,0,1
888,0,1
889,0,0


In [11]:
Q=embarked_columns.loc[:,'Q':'Q'].max(axis=1)
S=embarked_columns.loc[:,'S':].max(axis=1)


In [12]:
data_cleaned=pd.concat([data_cleaned,Q,S], axis=1)
data_cleaned.head()


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,0,1
0,0,1,0,22.0,1,0,7.25,S,0,1
1,1,0,1,38.0,1,0,71.2833,C,0,0
2,1,1,1,26.0,0,0,7.925,S,0,1
3,1,0,1,35.0,1,0,53.1,S,0,1
4,0,1,0,35.0,0,0,8.05,S,0,1


In [13]:
data_cleaned=data_cleaned.drop({'Embarked'},axis=1)

In [14]:
data_cleaned.columns.values

array(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 0, 1],
      dtype=object)

In [15]:
new_columns=['Target','Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Q','S']
data_cleaned.columns=new_columns


In [16]:
data_cleaned.head()

Unnamed: 0,Target,Pclass,Sex,Age,SibSp,Parch,Fare,Q,S
0,0,1,0,22.0,1,0,7.25,0,1
1,1,0,1,38.0,1,0,71.2833,0,0
2,1,1,1,26.0,0,0,7.925,0,1
3,1,0,1,35.0,1,0,53.1,0,1
4,0,1,0,35.0,0,0,8.05,0,1


# Dummies for Sibsp,Parch

In [17]:
data_cleaned.isnull().sum()

Target    0
Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
Fare      0
Q         0
S         0
dtype: int64

In [18]:
pdata=data_cleaned.copy()
pdata.to_csv('Titanic_preprocessed.csv', index=False)