In [314]:
import os

# Data manipulation
import pandas as pd
import numpy as np
import random as rn

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning
from sklearn.neighbors import KNeighborsClassifier

## Acquire data

In [315]:
# scan the files in the ./data path
obj = os.scandir('./data')

csv_files = []

# Select csv files from the data folder
for entry in obj:
    if entry.name[-3:] == 'csv':
        csv_files.append(entry.name)

# create variable for each csv file

csv_variables = []
for file in csv_files:
    globals()[file[0:-4]] = pd.read_csv('./data/'+file)
    csv_variables.append(globals()[file[0:-4]])

# columns to lowcase

for dataframe in csv_variables:
    dataframe.columns = [column.lower() for column in dataframe.columns]

# Combine both datasets
combine = [train, test]

## Analyze by describing data

In [316]:
print(train.columns.values)
print(test.columns.values)

['passengerid' 'survived' 'pclass' 'name' 'sex' 'age' 'sibsp' 'parch'
 'ticket' 'fare' 'cabin' 'embarked']
['passengerid' 'pclass' 'name' 'sex' 'age' 'sibsp' 'parch' 'ticket' 'fare'
 'cabin' 'embarked']


In [317]:
train.head()

# The categorical features: Survived, Sex, Embarked, Pclass
# The numerical features: Age, Fare, SibSp, Parch

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [318]:
train.info()
print('_'*40)
test.info()

# There is NaN values in both datasets

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passengerid  891 non-null    int64  
 1   survived     891 non-null    int64  
 2   pclass       891 non-null    int64  
 3   name         891 non-null    object 
 4   sex          891 non-null    object 
 5   age          714 non-null    float64
 6   sibsp        891 non-null    int64  
 7   parch        891 non-null    int64  
 8   ticket       891 non-null    object 
 9   fare         891 non-null    float64
 10  cabin        204 non-null    object 
 11  embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
________________________________________
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   passenger

In [319]:
# The cabin and ticker features are not relevant to machine learning model

train = train.drop(['ticket', 'cabin'], axis=1)
test = test.drop(['ticket', 'cabin'], axis=1)
combine = [train, test]

In [320]:
for dataset in combine:
    dataset['title'] = dataset.name.str.extract(' ([A-Za-z]+)\.', expand=False)

pd.crosstab(train['title'], train['sex'])

sex,female,male
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Capt,0,1
Col,0,2
Countess,1,0
Don,0,1
Dr,1,6
Jonkheer,0,1
Lady,1,0
Major,0,2
Master,0,40
Miss,182,0


In [321]:
for dataset in combine:
    dataset['title'] = dataset['title'].replace(['Lady', 'Countess','Capt', 'Col',\
 	'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    dataset['title'] = dataset['title'].replace('Mlle', 'Miss')
    dataset['title'] = dataset['title'].replace('Ms', 'Miss')
    dataset['title'] = dataset['title'].replace('Mme', 'Mrs')
    
train[['title', 'survived']].groupby(['title'], as_index=False).mean()

Unnamed: 0,title,survived
0,Master,0.575
1,Miss,0.702703
2,Mr,0.156673
3,Mrs,0.793651
4,Rare,0.347826


In [322]:
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
for dataset in combine:
    dataset['title'] = dataset['title'].map(title_mapping)
    dataset['title'] = dataset['title'].fillna(0)

train.head()

Unnamed: 0,passengerid,survived,pclass,name,sex,age,sibsp,parch,fare,embarked,title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,7.25,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,71.2833,C,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,7.925,S,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,53.1,S,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,8.05,S,1


In [323]:
train = train.drop(['name', 'passengerid'], axis=1)
test = test.drop(['name'], axis=1)
combine = [train, test]
train.shape, test.shape

((891, 9), (418, 9))

In [324]:
train = pd.get_dummies(train, columns=['pclass','sibsp', 'parch', 'embarked', 'title'])
test = pd.get_dummies(test, columns=['pclass','sibsp', 'parch', 'embarked', 'title'])

In [325]:
train.age.fillna(value=train.age.mean(), inplace=True)
test.age.fillna(value=train.age.mean(), inplace=True)

In [326]:
train.fare.fillna(value=train.fare.mean(), inplace=True)
test.fare.fillna(value=train.fare.mean(), inplace=True)

In [327]:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

ct = ColumnTransformer(transformers=[('num', scaler, ['age', 'fare'])])
train_scale = ct.fit_transform(train)
test_scale = ct.fit_transform(test)

df_test_scale = pd.DataFrame(test_scale, columns=['age_scaled', 'fare_scaled'])
df_train_scale = pd.DataFrame(train_scale, columns=['age_scaled', 'fare_scaled'])
train = train.join(df_train_scale)
test = test.join(df_test_scale)

In [328]:
train.replace({'male': 1, 'female': 0}, inplace=True)
test.replace({'male': 1, 'female': 0}, inplace=True)

In [329]:
train.drop(columns=['age', 'fare'], inplace=True)
test.drop(columns=['age', 'fare'], inplace=True)

In [330]:
train['parch_9'] = 0
train.shape, test.shape

((891, 30), (418, 30))

In [331]:
train

Unnamed: 0,survived,sex,pclass_1,pclass_2,pclass_3,sibsp_0,sibsp_1,sibsp_2,sibsp_3,sibsp_4,...,embarked_Q,embarked_S,title_1,title_2,title_3,title_4,title_5,age_scaled,fare_scaled,parch_9
0,0,1,0,0,1,0,1,0,0,0,...,0,1,1,0,0,0,0,-0.592481,-0.502445,0
1,1,0,1,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0.638789,0.786845,0
2,1,0,0,0,1,1,0,0,0,0,...,0,1,0,1,0,0,0,-0.284663,-0.488854,0
3,1,0,1,0,0,0,1,0,0,0,...,0,1,0,0,1,0,0,0.407926,0.420730,0
4,0,1,0,0,1,1,0,0,0,0,...,0,1,1,0,0,0,0,0.407926,-0.486337,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,1,0,1,0,1,0,0,0,0,...,0,1,0,0,0,0,1,-0.207709,-0.386671,0
887,1,0,1,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,-0.823344,-0.044381,0
888,0,0,0,0,1,0,1,0,0,0,...,0,1,0,1,0,0,0,0.000000,-0.176263,0
889,1,1,1,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,-0.284663,-0.044381,0


In [332]:
test

Unnamed: 0,passengerid,sex,pclass_1,pclass_2,pclass_3,sibsp_0,sibsp_1,sibsp_2,sibsp_3,sibsp_4,...,embarked_C,embarked_Q,embarked_S,title_1,title_2,title_3,title_4,title_5,age_scaled,fare_scaled
0,892,1,0,0,1,1,0,0,0,0,...,0,1,0,1,0,0,0,0,0.344284,-0.498258
1,893,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,1,0,0,1.334655,-0.513125
2,894,1,0,1,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,2.523099,-0.464940
3,895,1,0,0,1,1,0,0,0,0,...,0,0,1,1,0,0,0,0,-0.249938,-0.483317
4,896,0,0,0,1,0,1,0,0,0,...,0,0,1,0,0,1,0,0,-0.646086,-0.418323
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,1305,1,0,0,1,1,0,0,0,0,...,0,0,1,1,0,0,0,0,-0.036088,-0.494299
414,1306,0,1,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0.700817,1.313894
415,1307,1,0,0,1,1,0,0,0,0,...,0,0,1,1,0,0,0,0,0.661203,-0.508643
416,1308,1,0,0,1,1,0,0,0,0,...,0,0,1,1,0,0,0,0,-0.036088,-0.494299


### KNN Model

In [333]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA

y_train = train['survived']
X_train = train.drop('survived', axis=1)
X_test = test.drop('passengerid', axis=1)

In [334]:
grid_params = {
    'n_neighbors': range(1, 10)
}

gs = GridSearchCV(
    KNeighborsClassifier(),
    grid_params,
    verbose=1,
    cv=3,
    n_jobs=-1
)

gs_results = gs.fit(X_train, y_train)
gs_results.best_params_, gs_results.best_estimator_, gs_results.best_score_

Fitting 3 folds for each of 9 candidates, totalling 27 fits


({'n_neighbors': 7}, KNeighborsClassifier(n_neighbors=7), 0.8058361391694725)

In [336]:
pca = PCA(n_components=3)

pca.fit(X_test)

PCA(n_components=3)

In [None]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
y_test = knn.predict(X_test)
y_test.shape

acc_knn = knn.score(X_train, y_train)
acc_knn

0.8507295173961841

In [None]:
PassengerId = np.array(test['passengerid'])
Survived = y_test

submission = [PassengerId, Survived]

In [None]:
submission_df = pd.DataFrame({'PassengerId': PassengerId, 'Survived': Survived})

In [None]:
submission_df.to_csv('./data/submission.csv', index=False)