In [None]:
# Import data analysis modules
import numpy as np
import pandas as pd
# Import visualization modules
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

plt.ion() # enable matplotlib interactive mode (this is not the `%matplotlib inline` magic)
# pd.options.plotting.backend = 'plotly'

In [None]:
# Load the Titanic training set from its CSV file into a DataFrame
train = pd.read_csv('data/train.csv')
# test = pd.read_csv('data/test.csv')
# Preview the first 20 rows
train.head(20)

In [None]:
# Show pandas' default descriptive statistics for the training frame
train.describe()

In [None]:
# Build a boolean mask that is True wherever a value is missing
missing_values = train.isna()

# Draw the mask as a seaborn heatmap so that missing cells stand out per column
# first argument -> the data to create the heatmap from
# cmap -> colormap used for the cells
# cbar=False -> do not draw a colorbar
# yticklabels=False -> do not draw the per-row labels on the y axis
sns.heatmap(missing_values, cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Bar chart of how many passengers fall under each 'Survived' value
sns.countplot(x='Survived', data=train)

In [None]:
# Use countplot() to show passenger counts per 'Survived' value, broken down by passenger class
# x -> column whose values are counted
# data -> DataFrame that holds the columns
# hue -> second column used to subdivide each bar
sns.countplot(x='Survived', data=train, hue='Pclass')

In [None]:
# Identify outliers: box plot of 'Age' within each passenger class
# train.plot(kind="box")
fig, ax = plt.subplots(figsize=(10, 7))
sns.boxplot(data=train, x='Pclass', y='Age', ax=ax)

In [None]:
# Create function to impute the age value if it is null
def impute_age(cols):
    """Return the passenger's age, filling a missing value from the passenger class.

    Parameters
    ----------
    cols : sequence or pandas.Series
        At least two values in this order: age, then passenger class.
        Any further values are ignored.

    Returns
    -------
    The original age when it is present; otherwise 37 for class 1,
    29 for class 2 and 24 for any other class.
    """
    # Unpack by position instead of ``cols[0]`` / ``cols[1]``: integer keys on a
    # label-indexed Series are deprecated as positional lookups in recent pandas,
    # while plain iteration works for Series, lists and tuples alike.
    age, pclass, *_ = cols

    if not pd.isnull(age):
        return age

    # NOTE(review): the fallback ages are presumably read off the Age-by-Pclass
    # box plot -- confirm against the data before reusing them elsewhere.
    if pclass == 1:
        return 37
    if pclass == 2:
        return 29
    return 24
# Fill the gaps in 'Age' by running impute_age over every row.
# Only the two columns the function needs are selected, in the order it expects:
# 'Age' at position 0, 'Pclass' at position 1.
# axis='columns' (same as axis=1) -> the function is called once per row
age_and_class = train[['Age', 'Pclass']]
train['Age'] = age_and_class.apply(impute_age, axis='columns')
# test['Age'] = test[['Age', 'Pclass']].apply(impute_age, axis=1)
# Redraw the missing-value heatmap to check the result
sns.heatmap(train.isna(), cmap='viridis', cbar=False, yticklabels=False)

In [None]:
# Remove column 'Cabin' from dataframe, because it contains too many null values,
# then remove every row that still contains a null value.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to run more than once: a second run finds no 'Cabin'
# column and simply leaves the frame as it is.
train = train.drop(columns='Cabin', errors='ignore').dropna()
# Draw the missing-value heatmap again to confirm nothing is left to clean
sns.heatmap(data=train.isnull(), yticklabels=False, cbar=False, cmap='viridis').set_title("Train Set")

# # Remove column 'Cabin' from dataframe, because it contains too many null values
# test.drop(columns='Cabin', inplace=True)
# # Remove lines containing null values
# test.dropna(inplace=True)
# sns.heatmap(data = test.isnull(), yticklabels=False, cbar=False, cmap='viridis').set_title("Test Set")

In [None]:
# List the distinct values found in each of the two categorical columns
for column in ('Sex', 'Embarked'):
    print(train[column].unique())

In [None]:
# Convert the categorical columns into indicator (dummy) columns with pd.get_dummies().
# drop_first=True omits the first category of each column to avoid the
# multicollinearity problem, which can undermine the statistical significance
# of an independent variable.
sex = pd.get_dummies(train['Sex'], drop_first=True)
# sex_test = pd.get_dummies(test['Sex'], drop_first=True)
embark = pd.get_dummies(train['Embarked'], drop_first=True)
# embark_test = pd.get_dummies(test['Embarked'], drop_first=True)
# Append the indicator columns to the original frame, side by side
encoded_parts = [train, sex, embark]
train = pd.concat(encoded_parts, axis='columns')
# test = pd.concat([test, sex_test, embark_test], axis=1)
train.head(n=20)
# test.head(20)

In [None]:
# Drop the raw 'Sex' and 'Embarked' columns (now represented by indicator columns)
# along with 'Ticket', 'Name' and 'PassengerId', which are not used as features
unused_columns = ['Sex', 'Embarked', 'Ticket', 'Name', 'PassengerId']
train.drop(columns=unused_columns, inplace=True)
train.head(n=20)
# test.drop(['Sex','Embarked','Ticket','Name','PassengerId'], axis=1, inplace=True)
# test.head(20)

In [None]:
# Split data into 'x' features and 'y' target label sets (if survived)
x = train[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'male', 'Q', 'S']]
y = train['Survived']
# x_test = test[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'male', 'Q', 'S']]
# Hold out 30% of the rows for evaluation. A fixed random_state makes the split,
# and therefore the fitted model and the reported metrics, reproducible on re-run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# Create a LogisticRegression instance and fit it on the training split
# max_iter=1000 sets the solver's iteration cap explicitly
# NOTE(review): presumably raised so the solver converges on these unscaled features -- confirm
logmodel = LogisticRegression(max_iter = 1000)
logmodel.fit(x_train, y_train)

In [None]:
# Predict survival for the held-out feature rows
y_predict = logmodel.predict(x_test)
# First feature column of the held-out set (not referenced again in the cells shown here)
x_test_series = x_test.iloc[:, 0]
# Put predicted and real labels next to each other; the frame takes its index from y_test
outcome = pd.DataFrame({'Y Predicted': y_predict, 'Y Real': y_test})
# Attach both label columns to the held-out features for side-by-side inspection
predict = pd.concat([x_test, outcome], axis='columns')
predict.head(n=50)

In [None]:
# Print the evaluation report comparing the held-out labels with the model's predictions
print(classification_report(y_test, y_predict))
