# Kaggle Titanic Challenge

## Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Load Data

In [None]:
# Read the labelled training file into a single DataFrame; every later cell
# works from `all_data`.
all_data = pd.read_csv(filepath_or_buffer='./data/train.csv')
# The separate test file is not loaded yet:
# test = pd.read_csv('./data/test.csv')

## Data Exploration

In [None]:
# Column legend:
#   survival - 0 = No, 1 = Yes
#   pclass   - ticket class: 1 = 1st, 2 = 2nd, 3 = 3rd
#   sibsp    - number of siblings / spouses aboard the Titanic
#   parch    - number of parents / children aboard the Titanic
#   embarked - port of embarkation: C = Cherbourg, Q = Queenstown, S = Southampton

# (rows, columns) of the loaded frame, followed by a preview of the first rows.
print(all_data.shape)
all_data.head(n=5)

#### Male vs Female

In [None]:
# Pie chart of how many passengers fall into each 'Sex' category.
(
    all_data['Sex']
    .value_counts()
    .plot.pie()
)

#### Age Distribution

In [None]:
# Histogram of the 'Age' column using the default bin count, spelled out.
all_data['Age'].hist(bins=10)

#### Average fare per ticket class

In [None]:
# Mean ticket fare within each passenger class.
(
    all_data
    .groupby('Pclass')['Fare']
    .mean()
)

## Clean the Data

#### Transform Male and Female into numeric values. Female=0, Male=1

In [None]:
# Encode Sex as an integer feature: female -> 0, male -> 1.
# Series.map turns every value that is missing from the mapping into NaN, so
# mapping an already-encoded column (e.g. when this cell is run a second time)
# would wipe it out. Only map while the column still holds the text labels.
if not pd.api.types.is_numeric_dtype(all_data['Sex']):
    all_data['Sex'] = all_data['Sex'].map({'female': 0, 'male': 1})

#### Impute Age

In [None]:
# Report how many Age values are missing, fill them with the column mean, and
# then re-check the column so the "after" figures describe the imputed data.
all_count = len(all_data)
age_na_count = int(all_data['Age'].isna().sum())

print(f"There are {age_na_count} na age records ({(age_na_count / all_count) * 100:.2f}%)")

print(f"Mean before: {all_data['Age'].mean()}")

# Mean impute.
# NOTE(review): the mean is computed over the whole dataset, before the
# train/test split further down, so the held-out rows influence the fill value.
all_data['Age'] = all_data['Age'].fillna(all_data['Age'].mean())

# Recount after the fill; reusing the pre-imputation count here would always
# report the original number of gaps regardless of whether the fill worked.
age_na_count_after = int(all_data['Age'].isna().sum())

print(f"After imputation There are {age_na_count_after} na age records ({(age_na_count_after / all_count) * 100:.2f}%)")
print(f"Mean after: {all_data['Age'].mean()}")


## TEMP: DELETE NON NUMERIC COLUMNS

In [None]:
# Temporarily discard the non-numeric columns so the remaining frame can be fed
# straight into the models below.
# Reassigning the result (rather than inplace=True) together with
# errors='ignore' keeps this cell re-runnable: a second run finds the columns
# already gone and is a no-op instead of raising KeyError.
all_data = all_data.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'], errors='ignore')

### Correlations

In [None]:
def plot_corr(df, size=11):
    """
    Plot the pairwise correlation matrix of a DataFrame as a colored grid.

    Input:
        df: pandas DataFrame; only its numeric and boolean columns are used
        size: vertical and horizontal size of the figure (passed to figsize)

    Displays:
        A square grid with one cell per pair of columns, colored by the value
        of df.corr() for that pair. The attached colorbar shows which color
        corresponds to which correlation value. The top-left to bottom-right
        diagonal is each column paired with itself.

    Returns:
        None. The figure is drawn as a side effect.
    """
    # Restrict to numeric/bool columns so a stray text column cannot make
    # corr() fail (or be silently handled differently across pandas versions).
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr = numeric_df.corr()

    fig, ax = plt.subplots(figsize=(size, size))
    image = ax.matshow(corr)        # color code the cells by correlation value
    fig.colorbar(image, ax=ax)      # legend mapping colors to correlation values

    # Label both axes with the column names, using the explicit Axes API
    # instead of the pyplot "current axes" state.
    tick_positions = list(range(len(corr.columns)))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(corr.columns, rotation=90)  # vertical so long names do not overlap
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(corr.columns)

plot_corr(all_data, size=10)

## Split Train Test

In [None]:
# Target is the 'Survived' column; features are every other column of all_data.
y = all_data['Survived']
X = all_data.loc[:, all_data.columns.difference(['Survived'])]

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Check Survival Distribution in Test and Train Data

In [None]:

# Compare the share of survivors in the full data and in each split, to check
# that the train/test split did not skew the class balance.
for split_name, labels in (
    ("All data", all_data['Survived']),
    ("Train data", y_train),
    ("Test data", y_test),
):
    passengers_count = len(labels)
    # Count by label value (1 = survived, 0 = did not survive). Unpacking
    # value_counts() instead would rely on its frequency ordering, which swaps
    # the two numbers whenever survivors are the majority and fails outright
    # when one class is absent from the split.
    survived = int((labels == 1).sum())
    drawn = int((labels == 0).sum())
    print(f"{split_name} Survived passengers: {survived} ({(survived/passengers_count)*100:.2f}%)")
    print(f"{split_name} Drawn passengers: {drawn} ({(drawn/passengers_count)*100:.2f}%)")

## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier trained on the training split.
nb_model = GaussianNB()

# Labels are handed over as a plain array (y_train.values).
nb_model.fit(X=X_train, y=y_train.values)

### Performance

#### On Train Data

In [None]:
from sklearn import metrics

# Score the Naive Bayes model on the same rows it was trained on.
nb_predict_train = nb_model.predict(X_train)

# Fraction of training rows classified correctly.
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, nb_predict_train)))

#### On Test Data

In [None]:
# Score the Naive Bayes model on the held-out rows.
nb_predict_test = nb_model.predict(X_test)

# Fraction of test rows classified correctly.
print("Accuracy: {0:.4f}".format(metrics.accuracy_score(y_test, nb_predict_test)))

## Logistic Regression

In [227]:
from sklearn.linear_model import LogisticRegression

# Logistic regression using the liblinear solver, fitted on the training split.
lr_model = LogisticRegression(
    C=0.7,
    random_state=42,
    solver='liblinear',
    max_iter=10000,
)
lr_model.fit(X_train, y_train.values)

# Predictions for both splits, kept as separate names for later inspection.
lr_predict_train = lr_model.predict(X_train)
lr_predict_test = lr_model.predict(X_test)

# Same report for each split: accuracy first, then the per-class report.
# (metrics.confusion_matrix(y_true, y_pred) can be printed here as well.)
for split_name, y_true, y_pred in (
    ("Training", y_train, lr_predict_train),
    ("Test", y_test, lr_predict_test),
):
    print(f"{split_name} Accuracy: {metrics.accuracy_score(y_true, y_pred):.4f}")
    print(f"{split_name} Classification Report")
    print(metrics.classification_report(y_true, y_pred))

Training Accuracy: 0.7987
Training Classification Report
              precision    recall  f1-score   support

           0       0.82      0.87      0.84       374
           1       0.76      0.67      0.71       222

    accuracy                           0.80       596
   macro avg       0.79      0.77      0.78       596
weighted avg       0.80      0.80      0.80       596

Test Accuracy: 0.8102
Test Classification Report
              precision    recall  f1-score   support

           0       0.81      0.88      0.85       175
           1       0.80      0.71      0.75       120

    accuracy                           0.81       295
   macro avg       0.81      0.79      0.80       295
weighted avg       0.81      0.81      0.81       295

