In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl 
import matplotlib.pyplot as plt

In [26]:
# Load the training set; the path is relative to the notebook's working directory.
train_data = pd.read_csv('titanic/train.csv')

In [45]:
# Column dtypes and non-null counts: a quick way to spot which columns have missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [28]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
train_data.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [64]:
# Boolean mask marking the rows whose Embarked value is missing.
train_data.Embarked.isnull()

0      False
1      False
2      False
3      False
4      False
       ...  
886    False
887    False
888    False
889    False
890    False
Name: Embarked, Length: 891, dtype: bool

In [65]:
# Frequency of each Embarked value (value_counts leaves out missing values by default).
train_data.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [30]:
# Survival outcomes (0/1) restricted to female passengers.
women = train_data.loc[train_data["Sex"] == "female", "Survived"]
# The mean of a 0/1 column is the fraction of survivors (a ratio in [0, 1], not a percentage).
rate_women = women.mean()

print("% of women who survived:", rate_women)

% of women who survived: 0.7420382165605095


In [31]:
# Survival outcomes (0/1) restricted to male passengers.
men = train_data.loc[train_data["Sex"] == "male", "Survived"]
# The mean of a 0/1 column is the fraction of survivors (a ratio in [0, 1], not a percentage).
rate_men = men.mean()

print("% of men who survived:", rate_men)

% of men who survived: 0.18890814558058924


In [33]:
# Pairwise Pearson correlation between the numeric columns.
# numeric_only=True is required on pandas >= 2.0: corr() no longer drops the text
# columns (Name, Sex, Ticket, Cabin, Embarked) silently and raises ValueError instead.
corr_matrix = train_data.corr(numeric_only=True)
# Rank every numeric column by its linear correlation with the target.
corr_matrix["Survived"].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

In [46]:
# Peek at the first ten rows to see what the raw values look like.
train_data.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [86]:
# Columns used as model inputs (numeric columns first, then the two categorical ones).
# NOTE(review): PassengerId looks like a sequential row identifier and has ~0 correlation
# with Survived in the output above; confirm whether it should really be a feature.
features = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Sex']
# .copy() gives X its own data, so the later assignment into X (missing-Embarked fill)
# does not raise SettingWithCopyWarning and can never write through to train_data.
X = train_data[features].copy()

In [87]:
#take numeric column and feed it to pipeline
# Keep only the numeric features; the two categorical columns are encoded separately.
X_num = X.drop(columns=['Embarked', 'Sex'])

In [88]:
X_num.info()  # Age is the only numeric column with missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Age          714 non-null    float64
 3   SibSp        891 non-null    int64  
 4   Parch        891 non-null    int64  
 5   Fare         891 non-null    float64
dtypes: float64(2), int64(4)
memory usage: 41.9 KB


In [89]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric preprocessing: fill missing values with the column median, then standardize.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("std_scaler", StandardScaler()),
    ]
)

# Fit the pipeline on the numeric training columns and transform them in one step.
X_num_train = num_pipeline.fit_transform(X_num)


In [90]:
# Wrap the transformed values in a DataFrame again, reusing the original column names.
X_train_num = pd.DataFrame(X_num_train, columns=X_num.columns)

In [91]:
# Confirm that no numeric column has missing values after imputation.
X_train_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    float64
 1   Pclass       891 non-null    float64
 2   Age          891 non-null    float64
 3   SibSp        891 non-null    float64
 4   Parch        891 non-null    float64
 5   Fare         891 non-null    float64
dtypes: float64(6)
memory usage: 41.9 KB


In [92]:
# Fill missing Embarked values with 'S', the most frequent value in the training data.
# assign() builds a new frame instead of writing into X in place: X was sliced out of
# train_data, so an in-place .loc assignment raises SettingWithCopyWarning.
X = X.assign(Embarked=X["Embarked"].fillna("S"))


In [103]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column names routed to each branch of the preprocessor.
num_attribs = list(X_train_num)
cat_attribs = ["Embarked", "Sex"]

# Numeric columns go through impute + scale; categorical columns are one-hot encoded.
# handle_unknown="ignore" encodes a category that was not seen during fit as all zeros
# instead of raising, so the fitted pipeline can be applied safely to other data
# (it is reused on the test set later in the notebook).
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
])
X_train = full_pipeline.fit_transform(X)

In [107]:
# Compare the feature matrix shape before and after preprocessing.
for matrix in (X, X_train):
    print(matrix.shape)

(891, 8)
(891, 11)


In [78]:
# Target vector: the Survived column of the training data.
y_train = train_data["Survived"]

In [108]:
# Start with a k-nearest-neighbours classifier (k=3) as a first model.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [109]:
from sklearn.model_selection import cross_val_predict
# Predictions from 3-fold cross-validation; they feed the confusion matrix in the next cell.
y_train_pred = cross_val_predict(neigh, X_train, y_train, cv=3)


In [110]:
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_train, y_train_pred)

array([[461,  88],
       [106, 236]], dtype=int64)

In [112]:
from sklearn.model_selection import cross_val_score
# Mean score of the KNN model over 10 cross-validation folds.
knnscores = cross_val_score(neigh, X_train, y_train, cv=10)
knnscores.mean()

0.7890636704119851

In [119]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed so the cross-validation scores are repeatable.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()

0.8215855181023721

In [127]:
# Fit on the full training set; cross_val_score fits clones, so forest_clf itself is still unfitted here.
forest_clf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [118]:
# 10-fold cross-validated predictions for the forest.
# NOTE: this reuses y_train_pred, replacing the KNN predictions computed earlier.
y_train_pred = cross_val_predict(forest_clf, X_train, y_train, cv=10)
confusion_matrix(y_train, y_train_pred)

array([[482,  67],
       [ 92, 250]], dtype=int64)

In [115]:
# Support vector classifier, fitted on the full training set.
from sklearn.svm import SVC

svm_clf = SVC(gamma="auto")
svm_clf.fit(X_train, y_train)

SVC(gamma='auto')

In [116]:
# Mean score of the SVM over 10 cross-validation folds.
svc_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svc_scores.mean()

0.8260549313358302

In [117]:
# 10-fold cross-validated predictions for the SVM.
# NOTE: this reuses y_train_pred, replacing the predictions stored by earlier cells.
y_train_pred = cross_val_predict(svm_clf, X_train, y_train, cv=10)
confusion_matrix(y_train, y_train_pred)

array([[491,  58],
       [ 97, 245]], dtype=int64)

In [120]:
# Load the test set; the path is relative to the notebook's working directory.
test_data = pd.read_csv('titanic/test.csv')

In [121]:
# Same missing-Embarked fill as on the training features: use 'S' wherever the value is absent.
test_data["Embarked"] = test_data["Embarked"].fillna("S")


In [122]:
# Apply the preprocessing already fitted on the training data (transform only, no refit).
X_test = full_pipeline.transform(test_data)


In [None]:
# Predict test-set labels with the SVM.
# NOTE(review): this cell has no execution count, and the last cell of the notebook
# reassigns y_pred from forest_clf; confirm which model the submission should use.
y_pred = svm_clf.predict(X_test)

In [131]:
# Build the two-column submission frame and write it to CSV without the index column.
# NOTE(review): this uses whichever y_pred was assigned most recently; the execution
# counts suggest it last ran after the forest_clf prediction cell. Verify before submitting.
output = pd.DataFrame({'PassengerId': test_data.PassengerId,'Survived': y_pred})
output.to_csv('submission.csv', index=False)

In [130]:
# Predict test-set labels with the random forest.
# NOTE(review): in top-to-bottom order this runs after submission.csv has been written,
# so a fresh Run All never uses these predictions; move it above the export if the
# forest is the intended model.
y_pred = forest_clf.predict(X_test)