### Author: Kubam Ivo
### Date: 8/15/2020
### Purpose: Titanic Kaggle competition

In [33]:
#Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


In [54]:
#Importing the clean dataset
train_data = pd.read_csv("train_clean1")
#train_data = train_data.drop(["Fare","Age"],axis=1)
train_data.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,22.0,1,0,7.25,0,0,1,0,1,0,0,1
1,1,38.0,1,0,71.2833,1,0,0,1,0,1,0,0
2,1,26.0,0,0,7.925,0,0,1,1,0,0,0,1
3,1,35.0,1,0,53.1,1,0,0,1,0,0,0,1
4,0,35.0,0,0,8.05,0,0,1,0,1,0,0,1


In [55]:
# Importing the test data set
test_data = pd.read_csv("test.csv")

In [56]:
# Extracting the features
X = train_data.iloc[:,1:]
#Extracting the labels
y = train_data["Survived"]

In [57]:
# Hyper paramenter Tuning
# Set the parameters by cross-validation

tuned_parameters = [{'criterion': ['gini'], 'max_depth':[2,5,7,10],'n_estimators':[100]},
                    {'criterion': ['entropy'], 'max_depth':[2,5,7,10],'n_estimators':[100]}]
scores = ['roc_auc_score']

clf = GridSearchCV(RandomForestClassifier(),tuned_parameters,scoring='recall',cv=5)
clf.fit(X,y).best_params_

{'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 100}

In [58]:
#Initialising model class
forest = RandomForestClassifier(n_estimators=100, max_depth=10, criterion='entropy', random_state=1)

In [59]:
forest.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 1,
 'verbose': 0,
 'warm_start': False}

In [60]:
# 5 fold cross validation
y_pred = cross_val_predict(forest, X, y, cv=5)

In [61]:
# Confusion matrix
confusion_matrix(y,y_pred, labels=[0, 1])

array([[495,  54],
       [ 92, 250]], dtype=int64)

In [53]:
print(classification_report(y, y_pred, labels=[0,1]))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       549
           1       0.78      0.68      0.73       342

   micro avg       0.81      0.81      0.81       891
   macro avg       0.80      0.78      0.79       891
weighted avg       0.80      0.81      0.80       891



In [22]:
#AUC
roc_auc_score(y,y_pred)

0.8163167481545393

In [23]:
# Fitting the logistic regression model
forest.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [24]:
test_data1 = pd.get_dummies(test_data,columns=["Pclass","Sex","Embarked"])


In [25]:
test_data1 = test_data1.drop(["PassengerId","Name", "Ticket", "Cabin"], axis=1)


In [26]:
test_data1.head(5)

Unnamed: 0,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,34.5,0,0,7.8292,0,0,1,0,1,0,1,0
1,47.0,1,0,7.0,0,0,1,1,0,0,0,1
2,62.0,0,0,9.6875,0,1,0,0,1,0,1,0
3,27.0,0,0,8.6625,0,0,1,0,1,0,0,1
4,22.0,1,1,12.2875,0,0,1,1,0,0,0,1


In [27]:
test_data1.isnull().sum()

Age           86
SibSp          0
Parch          0
Fare           1
Pclass_1       0
Pclass_2       0
Pclass_3       0
Sex_female     0
Sex_male       0
Embarked_C     0
Embarked_Q     0
Embarked_S     0
dtype: int64

In [28]:
# Handling missing values
#test_data1["Age"] = test_data1["Age"].fillna(29.7) # imputing the mean value of 29.7 for all missing ages
test_data1["Fare"] = test_data1["Fare"].fillna(32.2)

# Handling missing values in age column usng linear regression
from sklearn.linear_model import LinearRegression

# Extracting the features
X = train_data.iloc[:,2:]
#Extracting the labels
y = train_data["Age"]

reg = LinearRegression().fit(X,y)
print(reg.score(X,y))


x1 = pd.isnull(test_data1["Age"]) # Extracting rows with null values for Age
x2 = test_data1[x1].index #Extracting index for all rows with null values for age
x_test = test_data1.iloc[x2,1:] # extracting test dataset where age is null
y_pred = reg.predict(x_test) #Predicting Age
test_data1.iloc[x2,0]=y_pred #replacing all null ages from  original dataset with predicted values
test_data1.isnull().sum()


0.32237586235082505


Age           0
SibSp         0
Parch         0
Fare          0
Pclass_1      0
Pclass_2      0
Pclass_3      0
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [29]:
test_data1.isnull().sum()

Age           0
SibSp         0
Parch         0
Fare          0
Pclass_1      0
Pclass_2      0
Pclass_3      0
Sex_female    0
Sex_male      0
Embarked_C    0
Embarked_Q    0
Embarked_S    0
dtype: int64

In [30]:
pred = forest.predict(test_data1)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': pred})
output.to_csv('my_submissionForest2.csv', index=False)