# Titanic Survival Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("train.csv")
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


#### Cabin has many missing values. Plus I couldn't find how it would effect the survival.
#### Name and Ticket too wont matter much.
#### So Name Ticket Column are dropped

In [4]:
y_train = data['Survived']
data = data.drop(['Name', 'Ticket', 'Cabin', 'PassengerId', 'Survived'], axis = 1)

In [5]:
data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB


##### Create a attribute selector to supply attributes to be transformed in Pipeline.

In [6]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
  def __init__(self, attribute_names):
    self.attribute_names = attribute_names
  def fit(self, X, y = None):
    return self
  def transform(self, X):
    return data[self.attribute_names]

##### Imputer for missing age values. Replacing them with median.

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

num_pipeline = Pipeline([
                         ("select_numeric", DataFrameSelector(['Age', 'Fare', 'SibSp', 'Parch'])),
                         ("imputer", SimpleImputer(strategy = "median")),
])

In [8]:
w = num_pipeline.fit_transform(data)
len(w)

891

##### Replacing Embarked's missing values with most frequent as only 2 values are missing out of **891**

In [9]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
  def fit(self, X, y = None):
    self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                  index = X.columns)
    return self
  def transform(self, X, y = None):
    return X.fillna(self.most_frequent_)

##### Constructing 0 1 columns for categorical attributes

In [10]:
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
                         ("select_cat", DataFrameSelector(['Embarked', 'Sex', 'Pclass'])),
                         ("imputer", MostFrequentImputer()),
                         ("cat_encoder", OneHotEncoder(sparse = False)),
])

In [11]:
cat_pipeline.fit_transform(data)

array([[0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 0., 0., 1.],
       ...,
       [0., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 0., 1.]])

##### Combining pipelines, through Feature Union

In [12]:
from sklearn.pipeline import FeatureUnion

preprocess_pipeline = FeatureUnion(
    transformer_list = [
                        ("num_pipeline", num_pipeline),
                        ("cat_pipeline", cat_pipeline),
    ]
)

In [13]:
X_train = data

print(X_train.info())
print("-----------")
print(y_train)
X_train = preprocess_pipeline.fit_transform(data)
print("-----------")
print(X_train.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    891 non-null    int64  
 1   Sex       891 non-null    object 
 2   Age       714 non-null    float64
 3   SibSp     891 non-null    int64  
 4   Parch     891 non-null    int64  
 5   Fare      891 non-null    float64
 6   Embarked  889 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 48.9+ KB
None
-----------
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64
-----------
(891, 12)


##### Support Vector Machine Classifier

In [14]:
test_data = pd.read_csv('test.csv')
o = test_data
test_data = test_data.drop(['Name', 'Ticket', 'Cabin', 'PassengerId'], axis = 1)
print(test_data.info())
g = test_data[['Age', 'Fare']]
test_data = test_data.drop(['Age', 'Fare'], axis = 1)
imputer = SimpleImputer(strategy="median")
t = imputer.fit(g)
t = imputer.transform(g)
print(t.shape)
test_data = pd.concat([test_data, pd.DataFrame(t)], axis=1)

test_data = test_data.rename(columns={0: "Age", 1: "Fare"})
test_data.info()
#y_pred = svm_clf.predict(X_test)
#print(len(y_pred))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   Age       332 non-null    float64
 3   SibSp     418 non-null    int64  
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
 6   Embarked  418 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 23.0+ KB
None
(418, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    418 non-null    int64  
 1   Sex       418 non-null    object 
 2   SibSp     418 non-null    int64  
 3   Parch     418 non-null    int64  
 4   Embarked  418 non-null    object 
 5   Age       418 non-null    float64
 6   Fare      418 non-null    float64
dtypes: float

In [15]:
g = test_data[['Pclass', 'Sex', 'Embarked']]
test_data = test_data.drop(['Pclass', 'Sex', 'Embarked'], axis = 1)
ohe = OneHotEncoder(sparse = False)
t = ohe.fit_transform(g)
print(t.shape)
test_data = test_data.join(pd.DataFrame(t))

#test_data = test_data.rename(columns={0: "Age", 1: "Fare"})
X_test = test_data.to_numpy()
X_train.shape
y_train.shape
#y_pred = svm_clf.predict(X_test)
#print(len(y_pred))

(418, 8)


(891,)

In [16]:
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_clf.fit(X_train, y_train)

svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()

0.7329588014981274

##### Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier

forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_clf.fit(X_train, y_train)
y_pred_rfc = forest_clf.predict(X_test)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)

forest_scores.mean()

0.8160049937578029

In [18]:
len(y_pred_rfc)

418

In [19]:
result = pd.DataFrame(columns=['PassengerId'])

result[['PassengerId']] = o[['PassengerId']]
result.index = test_data.index
result

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896
...,...
413,1305
414,1306
415,1307
416,1308


In [20]:
result = pd.concat([result, pd.DataFrame(y_pred_rfc)], axis = 1)



In [21]:
result = result.rename(columns={"Survivor":"Survived"})

In [22]:
result

Unnamed: 0,PassengerId,0
0,892,1
1,893,1
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [23]:
doct = result.to_csv('final_submission.csv', index = False)