In [None]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# pd.isnull(df).sum()

In [None]:
# Read the training data; the path is relative to the notebook's working directory.
train_csv_path = 'dataset/train.csv'
df = pd.read_csv(train_csv_path)
# Display three randomly drawn rows as a quick sanity check.
df.sample(n=3)

In [None]:
# Female passengers, ordered by descending SibSp; show the first 20 rows.
df.loc[df['Sex'] == 'female'].sort_values('SibSp', ascending=False).head(20)

In [None]:
# Family size: parents/children + siblings/spouses + the passenger themselves.
df['Family_size'] = df['Parch'] + df['SibSp'] + 1

# Title is the text between the comma and the first period of the name
# (a name of the form "Surname, Title. Given names" yields "Title").
df['Title'] = df['Name'].str.split('.').str[0].str.split(',').str[1].str.strip()

# Fold spelling variants into the common titles. "Mme" (Madame) denotes a
# married woman, so it is grouped with "Mrs" rather than "Miss".
df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Collapse the infrequent titles into a single bucket.
rare_titles = ['Dr', 'Rev', 'Col', 'Major', 'Don', 'Dona', 'Lady',
               'the Countess', 'Capt', 'Sir', 'Jonkheer']
df.loc[df['Title'].isin(rare_titles), 'Title'] = 'Rare Title'

# 1 when the passenger has no family aboard (Family_size == 1), else 0.
df['IsAlone'] = (df['Family_size'] == 1).astype(int)

# Remove the columns that are not used as model features:
# Name, Cabin, Ticket, PassengerId and Embarked.
df = df.drop(columns=['Name', 'Cabin', 'Ticket', 'PassengerId', 'Embarked'])
df.sample(3)

In [None]:
# Put the features in a fixed, explicit order with the target ('Survived') last.
# Selecting by name instead of by position means re-running this cell cannot
# shuffle the columns again, and a missing column fails loudly with a KeyError.
column_order = ['Title', 'Age', 'Sex', 'Pclass', 'IsAlone', 'Family_size',
                'Parch', 'SibSp', 'Fare', 'Survived']
df = df[column_order]
df.sample(5)

## Converting character variables to numeric

In [None]:
# Replace the text categories with integer codes.
sex_codes = {'male': 0, 'female': 1}
title_codes = {'Mr': 0, 'Miss': 1, 'Mrs': 2, 'Master': 3, 'Rare Title': 4}

for column, codes in (('Sex', sex_codes), ('Title', title_codes)):
    # astype(int) raises if any value was not covered by the mapping (NaN).
    df[column] = df[column].map(codes).astype(int)

In [None]:
# First five rows of the frame (head() defaults to n=5).
df.head()

## Age Imputation - Comparing linear-regression predictions with linear interpolation for the missing ages (the interpolated values are the ones kept)

In [None]:
# Number of missing values in each column.
print(df.isnull().sum())

In [None]:
# To list only the rows with a missing age instead: df[df['Age'].isnull()]
df.head(5)

In [None]:

from sklearn.linear_model import LinearRegression

# Rows whose age is known; these are the rows the regression is fitted on.
age_known = df['Age'].notna()

# Predict Age from the remaining feature columns. 'Survived' is excluded: it is
# the target of the final classifier, so it must not feed a feature's imputation.
age_features = [c for c in df.columns if c not in ('Age', 'Survived')]
X_train = df.loc[age_known, age_features]
y_train = df.loc[age_known, 'Age']

regressor = LinearRegression()
regressor.fit(X_train, y_train)

# Side-by-side view of the two imputation candidates.
age_comparison = df[['Age']].copy()
age_comparison['is_null'] = ~age_known

# Linear interpolation along the row order. limit_direction='both' also fills a
# missing value in the very first row(s), which the default would leave as NaN.
age_comparison['Interpolation'] = df['Age'].interpolate(limit_direction='both')

# Regression estimate where Age is missing; observed ages are kept as they are.
lr_estimate = regressor.predict(df[age_features])
age_comparison['LR_prediction'] = np.where(age_known, df['Age'], lr_estimate)

# Exactly one imputation is applied to the data: the interpolated ages.
# The regression estimate stays available in age_comparison for inspection.
df['Age'] = age_comparison['Interpolation']


In [None]:
# Age bucket edges; pd.cut builds right-closed intervals: (0, 2], (2, 12], ...
bins = [0, 2, 12, 18, 25, 54, 65, np.inf]
# Integer codes standing for:
# baby, child, teenager, young adult, adult, senior, older person
labels = [0, 1, 2, 3, 4, 5, 6]
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels)

# Mean age per bucket. 'Age_Group' is categorical, so `observed` is passed
# explicitly: the result (one row per bucket, empty ones included) then does
# not depend on the pandas version's default.
df.groupby(['Age_Group'], observed=False)['Age'].mean()

In [None]:
# Move 'Age_Group' so it sits directly after 'Age'. Working with column names
# rather than fixed positions keeps this cell idempotent: re-running it leaves
# the order unchanged instead of permuting the columns a second time.
new_order = [c for c in df.columns if c != 'Age_Group']
new_order.insert(new_order.index('Age') + 1, 'Age_Group')
df = df[new_order]
df.sample(5)

In [None]:
from sklearn.model_selection import train_test_split
# Separate the target column from the feature matrix.
y = df['Survived']
X = df.drop(columns=['Survived'])

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

def evaluate_algorithm(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print an evaluation summary.

    Prints the training and test scores (as returned by ``model.score``),
    followed by the confusion matrix and the classification report computed
    from the predictions on the test split.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict`` and ``score``.
    X_train, y_train : data the model is fitted on.
    X_test, y_test : held-out data used for the test metrics.

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Both scores are computed first, then printed in training/test order.
    scores = (
        ("Training Accuracy = {:.3f}", model.score(X_train, y_train)),
        ("Test Accuracy = {:.3f}", model.score(X_test, y_test)),
    )
    for template, value in scores:
        print(template.format(value))

    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))
    return model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB


# print("-"*25,"LOGISTIC REGRESSION","-"*25)
# logreg = LogisticRegression()
# logreg = evaluate_algorithm(logreg, X_train, X_test, y_train, y_test)

# print("-"*25,"SUPPORT VECTOR MACHINES","-"*25)
# svc = SVC()
# svc = evaluate_algorithm(svc, X_train, X_test, y_train, y_test)

print("-"*25,"RANDOM FOREST","-"*25)
# Seed the forest (same seed as the train/test split) so the reported metrics,
# and any model pickled from this run, are reproducible across kernel restarts.
rf = RandomForestClassifier(random_state=42)
rf = evaluate_algorithm(rf, X_train, X_test, y_train, y_test)

# print("-"*25,"KNN","-"*25)
# knn = KNeighborsClassifier(n_neighbors = 5)
# knn = evaluate_algorithm(knn, X_train, X_test, y_train, y_test)

# print("-"*25,"NAIVE BAYES CLASSIFIER","-"*25)
# nbc = GaussianNB()
# nbc = evaluate_algorithm(nbc, X_train, X_test, y_train, y_test)

In [None]:
# import pickle
# pickle.dump(rf, open('model.pkl','wb'))

In [None]:
import pickle

# NOTE: pickle.load can execute arbitrary code contained in the file. Only load
# a model.pkl that comes from a trusted source (e.g. one dumped by this notebook).
# The context manager closes the file handle once the model is loaded.
with open('model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

# A single passenger to score. The keys follow the feature column order used
# when training in this notebook.
dc = {
    'Title':0,
    'Age':16,
    'Age_Group':2,
    'Sex':0,
    'Pclass':2,
    'IsAlone':1,
    'Family_size':1,
    'Parch':0,
    'SibSp':0,
    'Fare':26,
}
# Alternative sample:
# dc = {
#     'Title':0,
#     'Age':20,
#     'Age_Group':4,
#     'Sex':0,
#     'Pclass':3,
#     'IsAlone':0,
#     'Family_size':4,
#     'Parch':2,
#     'SibSp':1,
#     'Fare':7.25,
# }

# Build a one-row DataFrame so the feature names travel with the values
# instead of relying on a bare positional list.
row = pd.DataFrame([dc])

# Probability of the first class in model.classes_ for this passenger.
model.predict_proba(row)[0][0]