### Author: Kubam Ivo
### Date: 8/12/2020
### Purpose: Titanic Kaggle competition

In [1]:
#Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score


In [46]:
#Importing the clean dataset
train_data = pd.read_csv("train_clean")
train_data.head()

Unnamed: 0,Survived,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Age,Fare
0,0,0,0,1,0,1,0,0,1,0.949757,0.312988
1,1,1,0,0,1,0,1,0,0,0.470417,0.882444
2,1,0,0,1,1,0,0,0,1,0.956551,0.291564
3,1,1,0,0,1,0,0,0,1,0.550338,0.834942
4,0,0,0,1,0,1,0,0,1,0.974555,0.224148


In [30]:
# Importing the test data set
test_data = pd.read_csv("test.csv")

In [31]:
# Extracting the features
X = train_data.iloc[:,1:]
#Extracting the labels
y = train_data["Survived"]

In [32]:
#Initialising model class
logistic = LogisticRegression()

In [33]:
# 5 fold cross validation
y_pred = cross_val_predict(logistic, X, y, cv=5)

In [34]:
# Confusion matrix
confusion_matrix(y,y_pred, labels=[0, 1])

array([[469,  80],
       [112, 230]], dtype=int64)

In [35]:
#AUC
roc_auc_score(y,y_pred)

0.7633975649506279

In [36]:
# Fitting the logistic regression model
logistic.fit(X,y)

LogisticRegression()

In [37]:
test_data1 = pd.get_dummies(test_data,columns=["Pclass","Sex","Embarked"])


In [38]:
test_data1 = test_data1.drop(["PassengerId","Name", "SibSp", "Parch","Ticket", "Cabin"], axis=1)


In [39]:
def missing_zero_values_table(df):
        zero_val = (df == 0.00).astype(int).sum(axis=0)
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mz_table = pd.concat([zero_val, mis_val, mis_val_percent], axis=1)
        mz_table = mz_table.rename(
        columns = {0 : 'Zero Values', 1 : 'Missing Values', 2 : '% of Total Values'})
        mz_table['Total Zero Missing Values'] = mz_table['Zero Values'] + mz_table['Missing Values']
        mz_table['% Total Zero Missing Values'] = 100 * mz_table['Total Zero Missing Values'] / len(df)
        mz_table['Data Type'] = df.dtypes
        mz_table = mz_table[
            mz_table.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns and " + str(df.shape[0]) + " Rows.\n"      
            "There are " + str(mz_table.shape[0]) +
              " columns that have missing values.")
#         mz_table.to_excel('D:/sampledata/missing_and_zero_values.xlsx', freeze_panes=(1,0), index = False)
        return mz_table

missing_zero_values_table(test_data1)

Your selected dataframe has 10 columns and 418 Rows.
There are 2 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type
Age,0,86,20.6,86,20.6,float64
Fare,2,1,0.2,3,0.7,float64


In [40]:
# Handling missing values
test_data1["Age"] = test_data1["Age"].fillna(29.7) # imputing the mean value of 29.7 for all missing ages
test_data1["Fare"] = test_data1["Fare"].fillna(32.2) # imputing the mean value of 32.2 for all missing Fare

In [41]:
missing_zero_values_table(test_data1)

Your selected dataframe has 10 columns and 418 Rows.
There are 0 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type


In [45]:
test_data1.head(5)

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,34.5,7.8292,0,0,1,0,1,0,1,0
1,47.0,7.0,0,0,1,1,0,0,0,1
2,62.0,9.6875,0,1,0,0,1,0,1,0
3,27.0,8.6625,0,0,1,0,1,0,0,1
4,22.0,12.2875,0,0,1,1,0,0,0,1


In [43]:
pred = logistic.predict(test_data1)
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': pred})
output.to_csv('my_submission.csv', index=False)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,