### Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
sns.set()

### Load and View Data

In [None]:
# Load the training set; the relative path is resolved against the working directory.
raw_data = pd.read_csv('train.csv')

In [None]:
# Preview the first 15 rows of the raw training data.
raw_data.head(15)

In [None]:
# Column dtypes and non-null counts for the raw data.
raw_data.info()  # author's note: confirmed no duplicate names. NOTE(review): info() does not report duplicates; presumably checked separately.

In [None]:
# Summary statistics for the raw training data.
raw_data.describe()

### Drop columns, replace values, and create dummies

In [None]:
# Remove the free-text Name and Ticket columns before any encoding.
# (Ticket values can be inspected with raw_data['Ticket'].unique().)
data_drop = raw_data.drop(columns=['Name', 'Ticket'])
data_drop.head(3)

In [None]:
# Encode Sex numerically: female -> 0, male -> 1.
sex_codes = {'female': 0, 'male': 1}
data_drop['Sex'] = data_drop['Sex'].replace(sex_codes)
data_drop.head()

In [None]:
# Distinct values present in the Parch column.
data_drop['Parch'].unique()

In [None]:
# Distinct values present in the SibSp column.
data_drop['SibSp'].unique()

In [None]:
# Distinct values present in the Pclass column.
data_drop['Pclass'].unique()

In [None]:
# Row count per Embarked value (value_counts skips missing values by default).
data_drop['Embarked'].value_counts()

In [None]:
# Row count per Pclass value.
data_drop['Pclass'].value_counts()

In [None]:
# Cabin is discarded entirely rather than encoded or imputed.
data_drop_cabin = data_drop.drop(columns='Cabin')
data_drop_cabin.head(3)

In [None]:
# One-hot encode Pclass. drop_first removes the first (lowest) level so it
# acts as the implicit baseline; the remaining levels 2 and 3 get readable names.
class_columns = pd.get_dummies(data_drop_cabin['Pclass'], drop_first=True).rename(
    columns={2: '2nd Class', 3: '3rd Class'}
)

# Swap the original Pclass column for its dummy columns (appended at the end).
data_class = pd.concat(
    [data_drop_cabin.drop(columns='Pclass'), class_columns],
    axis=1,
)
data_class.head()

In [None]:
# One-hot encode Embarked. drop_first removes the first level, leaving the
# 'Q' and 'S' indicator columns, which are renamed to the port names.
# NOTE(review): rows with a missing Embarked value get all-zero dummies and so
# look identical to the dropped baseline level - confirm this is acceptable.
embarked_columns = pd.get_dummies(data_class['Embarked'], drop_first=True).rename(
    columns={'Q': 'Queenstown', 'S': 'Southampton'}
)

# Swap the original Embarked column for its dummy columns (appended at the end).
data_dummy = pd.concat(
    [data_class.drop(columns='Embarked'), embarked_columns],
    axis=1,
)
data_dummy.head()

### Visualizing the Data

In [None]:
# Histogram of Fare using matplotlib's default binning.
plt.hist(data_dummy['Fare']) # Fare has a couple outliers

In [None]:
# Plotting subset: keep only rows with no missing value in any column.
data_age = data_drop_cabin.dropna()

# Age against survival outcome, coloured by passenger class.
sns.set(context='paper')
sns.stripplot(data=data_age, x='Survived', y='Age', hue='Pclass')
plt.show()

In [None]:
# Mean of Survived for each Embarked value (complete rows only).
sns.barplot(x='Embarked', y='Survived', data=data_age)

In [None]:
# Mean of Survived for each Pclass value (complete rows only).
sns.barplot(x='Pclass', y='Survived', data=data_age)

In [None]:
# Mean of Survived per Pclass (ordered 3, 2, 1), split by Embarked; ci=None turns off the error bars.
# NOTE(review): recent seaborn releases deprecate `ci` in favour of `errorbar=None` - confirm against the installed version.
sns.barplot(x='Pclass', y='Survived', ci = None, hue='Embarked', order=[3,2,1], data=data_age)

In [None]:
# Mean of Survived per Pclass (ordered 3, 2, 1), split by the encoded Sex column.
sns.barplot(x='Pclass', y='Survived', hue='Sex', order=[3,2,1], data=data_age)

### Scale Numerical Data 

In [None]:
# Re-check the encoded frame before scaling.
data_dummy.head()

In [None]:
# Collect the continuous columns to be standardised, already under the names
# they will carry after scaling. SibSp and Parch are deliberately left out.
numerical_features = data_dummy[['Age', 'Fare']].rename(
    columns={'Age': 'Scaled Age', 'Fare': 'Scaled Fare'}
)

In [None]:
# Show the first row of the columns selected for scaling.
numerical_features.head(1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training columns, then standardise those same columns.
scaler = StandardScaler()
scaler.fit(numerical_features)
num_feat_scal = scaler.transform(numerical_features)

# transform() returns a bare array, so wrap it back into a labelled frame.
# (Add 'SibSp' / 'Parch' here if those columns are ever scaled as well.)
num_feat_scal_df = pd.DataFrame(num_feat_scal, columns=['Age', 'Fare'])
num_feat_scal_df.head(1)

In [None]:
# Attach the standardised values to the encoded frame. Column assignment aligns
# on index labels, and num_feat_scal_df carries a fresh 0..n-1 index, so this
# relies on data_dummy having the same default index.
data_dummy['Scaled Age'] = num_feat_scal_df['Age']
data_dummy['Scaled Fare'] = num_feat_scal_df['Fare']

# The unscaled originals are no longer needed downstream.
data_scaled = data_dummy.drop(columns=['Age', 'Fare'])
data_scaled.head(1)

### Assign Numbers to Embarked

In [None]:
# data_scaled['Embarked'] = data_scaled['Embarked'].replace(['S', 'Q', 'C'], [1, 2, 3])
# data_scaled.head(5)

### KNN to Impute Missing Age Values

In [None]:
from sklearn.impute import KNNImputer

In [None]:
# List the column names of the scaled frame (in order) as an array.
data_scaled.columns.values

In [None]:
# Fill missing values using the 5 nearest rows. The target column is held out
# so it cannot influence the imputed values.
# NOTE(review): every remaining column of data_scaled takes part in the
# neighbour distance, including the unscaled PassengerId identifier if it is
# still present - consider excluding it (the test-set cell must then match).
imputer_input = data_scaled.drop('Survived', axis=1)
imputer = KNNImputer(n_neighbors=5)

# Take column labels and index from the frame that was actually imputed rather
# than from a hand-maintained list, so an upstream column change cannot
# silently mislabel the output.
data_impute = pd.DataFrame(
    data=imputer.fit_transform(imputer_input),
    columns=imputer_input.columns,
    index=imputer_input.index,
)

data_impute.info()

In [None]:
# Re-attach the target that was held out of imputation; assignment aligns on index labels.
data_impute['Survived'] = data_scaled['Survived']
data_impute.head(1)

### Create Booleans for Familial Columns

In [None]:
# Note: converting the familial columns (SibSp, Parch) to booleans resulted in a model with worse results.

In [None]:
def family_boolean(element):
    """Cap a family-member count at 1.

    Any value greater than 1 becomes the integer 1; every other value
    (including 0, 1 and anything negative) is returned unchanged.
    """
    return 1 if element > 1 else element

# Collapse SibSp so that any value above 1 becomes 1. apply() already returns a
# Series aligned to the frame, so it is assigned directly; the former
# DataFrame wrapper and full-range slice added nothing.
data_impute['SibSp'] = data_impute['SibSp'].apply(family_boolean)
data_impute['SibSp'].value_counts()

In [None]:
# Collapse Parch so that any value above 1 becomes 1. apply() already returns a
# Series aligned to the frame, so it is assigned directly; the former
# DataFrame wrapper and full-range slice added nothing.
data_impute['Parch'] = data_impute['Parch'].apply(family_boolean)
data_impute['Parch'].value_counts()

### Implement Random Forest Classification Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Names of the columns fed to the model. PassengerId and the target are
# intentionally absent from this list.
estimators = [
    'Sex',
    'SibSp',
    'Parch',
    '2nd Class',
    '3rd Class',
    'Queenstown',
    'Southampton',
    'Scaled Age',
    'Scaled Fare',
]

# Model inputs and the label to predict.
features = data_impute.loc[:, estimators]
target = data_impute['Survived']

In [None]:
# 100-tree random forest; fixed seed for repeatable fits, all CPU cores used.
randomforest = RandomForestClassifier(
    n_estimators=100,
    random_state=0,
    n_jobs=-1,
)
model = randomforest.fit(features, target)

### Check Important Features

In [None]:
# Rank the features from most to least important according to the fitted forest.
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
names = [estimators[position] for position in indices]

# Bar chart of the importances in ranked order, labelled by feature name.
plt.figure()
plt.title('Feature Importance')
plt.bar(range(len(names)), importances[indices])
plt.xticks(range(len(names)), names, rotation=90)
plt.show()

### Test Data

In [None]:
# Load the held-out test set; the relative path is resolved against the working directory.
test_data = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of the raw test data.
test_data.head()

In [None]:
# Column dtypes and non-null counts for the raw test data.
test_data.info()

In [None]:
# --- Column removal and Sex encoding ---------------------------------------
data_test_drop = test_data.drop(columns=['Name', 'Ticket', 'Cabin'])
data_test_drop['Sex'] = data_test_drop['Sex'].replace({'female': 0, 'male': 1})

# --- One-hot encoding -------------------------------------------------------
# NOTE(review): the resulting dummy columns depend on which category values
# occur in the test file; presumably the same Pclass/Embarked levels appear
# here as in the training data - verify, since the imputer expects the same
# columns in the same order.
test_class = pd.get_dummies(data_test_drop['Pclass'], drop_first=True).rename(
    columns={2: '2nd Class', 3: '3rd Class'}
)
data_test_class = pd.concat(
    [data_test_drop.drop(columns='Pclass'), test_class],
    axis=1,
)

embarked_test_columns = pd.get_dummies(
    data_test_class['Embarked'], drop_first=True
).rename(columns={'Q': 'Queenstown', 'S': 'Southampton'})
data_test_final = pd.concat(
    [data_test_class.drop(columns='Embarked'), embarked_test_columns],
    axis=1,
)

# --- Scaling ----------------------------------------------------------------
# Only transform() is called: the already-fitted scaler is reused, never refit.
test_features = data_test_final[['Age', 'Fare']].rename(
    columns={'Age': 'Scaled Age', 'Fare': 'Scaled Fare'}
)
test_feat_scal = scaler.transform(test_features)
test_feat_scal_df = pd.DataFrame(test_feat_scal, columns=['Age', 'Fare'])

# Attach the scaled values (aligned on index labels) and drop the raw columns.
data_test_final['Scaled Age'] = test_feat_scal_df['Age']
data_test_final['Scaled Fare'] = test_feat_scal_df['Fare']
data_test_scaled = data_test_final.drop(columns=['Age', 'Fare'])

data_test_scaled.info()

In [None]:
# Fill missing test values with the already-fitted imputer (transform only).
# Column labels and index are taken from the frame that was actually imputed
# rather than from a hand-maintained list, so an upstream column change cannot
# silently mislabel the output.
data_test_impute = pd.DataFrame(
    data=imputer.transform(data_test_scaled),
    columns=data_test_scaled.columns,
    index=data_test_scaled.index,
)
data_test_impute.info()

In [None]:
# Collapse SibSp and Parch so that any value above 1 becomes 1. apply() already
# returns a Series aligned to the frame, so it is assigned directly; the former
# DataFrame wrapper and full-range slice added nothing.
data_test_impute['SibSp'] = data_test_impute['SibSp'].apply(family_boolean)
data_test_impute['Parch'] = data_test_impute['Parch'].apply(family_boolean)

In [None]:
# Predict with the fitted model, passing only the columns named in `estimators`.
predictions = model.predict(data_test_impute[estimators])

### Export csv from formatted dataframe

In [None]:
# Wrap the raw prediction array in a one-column frame named 'Predictions'.
predictions = pd.DataFrame({'Predictions': predictions})
predictions.head()

In [None]:
# def assign_boolean(element):
#     if element > 0.5:
#         return int(1)
#     else:
#         return int(0)
    
# predictions_binary = predictions['Predictions'].apply(assign_boolean)[0::]
# predictions_binary.head()

In [None]:
# predictions_final = pd.DataFrame(predictions_binary, columns=['Predictions'])

In [None]:
# Build the submission frame. PassengerId comes back from the imputer as
# float, so it is cast to int32 before the predictions are attached.
data_export = data_test_impute[['PassengerId']].astype('int32')
data_export['Survived'] = predictions['Predictions']
data_export.head()

In [None]:
# Check dtypes and non-null counts of the submission frame before writing it.
data_export.info()

In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
data_export.to_csv('Titanic_results_v5.csv', index=False)

# Kaggle Result: 0.76315