In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Widen pandas' display limits so large frames are shown without truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [None]:
# Read both CSV files as UTF-8: df1 comes from the "train" file, df2 from the
# "test" file. Preview the first rows of df1 as the cell output.
df1, df2 = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('titanic-train.csv', 'titanic-test.csv')
)
df1.head()

### Set up a label encoder

Since these machine learning tools work with numbers rather than words like "male" and "female", we need to convert such categories to numerical codes (for example, 1 and 0).

In [None]:
# Set the Label Encoder to transform categorical data to numerals.
lb = LabelEncoder()

# https://pbpython.com/categorical-encoding.html
# LabelEncoder assigns codes from the sorted unique labels it was fitted on.
# Fitting it separately on df1 and df2 can therefore give the same label a
# different code in each frame (e.g. if only one frame has missing Embarked
# values filled with ""). To keep the mapping consistent, fit once per column
# on the labels of BOTH frames, then transform each frame with that mapping.
lb.fit(pd.concat([df1['Sex'], df2['Sex']]))
df1['sex_code'] = lb.transform(df1['Sex'])
df2['sex_code'] = lb.transform(df2['Sex'])

# Missing Embarked values become their own "" category before encoding.
embarked_train = df1['Embarked'].fillna("")
embarked_test = df2['Embarked'].fillna("")
lb.fit(pd.concat([embarked_train, embarked_test]))
df1['embarked_code'] = lb.transform(embarked_train)
df2['embarked_code'] = lb.transform(embarked_test)
df1.head(50)

### Remove N/A from the Age column

Any NaN (blank) values in `Age` will give us errors later, so remove those rows now.

In [None]:
# Keep only the rows whose Age is present, in both frames.
df1 = df1.loc[df1['Age'].notna()]
df2 = df2.loc[df2['Age'].notna()]
df1.head()

### Set up our X and Y variables

In [None]:
# Feature matrix: the columns the model learns from.
feature_columns = ['Pclass', 'sex_code', 'Age', 'SibSp', 'Fare']
X = df1[feature_columns]

# Target vector: the column the model predicts.
y = df1['Survived']

In [None]:
# Hold out 20% of the rows for evaluation, stratified on y so both parts keep
# the same class proportions. random_state pins the shuffle so the split (and
# every metric computed from it) is reproducible across kernel restarts; it
# uses the same seed value as the classifier.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)
print('Number of observations in the training data:', len(X_train))
print('Number of observations in the test data:',len(X_test))

### Set up our classifier variable

In [None]:
# Random forest of 100 trees, run with 2 parallel jobs; random_state fixes the
# estimator's seed.
clf = RandomForestClassifier(
    n_estimators=100,
    n_jobs=2,
    random_state=0,
)

### Give it our training data for both X and Y

In [None]:
# Train the classifier on the training features and labels.
clf.fit(X=X_train, y=y_train)

### Make some predictions based on the data

Let's grab that test data and see how well the model does when we try to make predictions.

In [None]:
# Predict a label for every row of the held-out features.
y_pred = clf.predict(X=X_test)

In [None]:
# Put the true labels and the predictions side by side for a quick look.
df3 = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
df3.head(15)

In [None]:
# Share of held-out rows whose predicted label matches the true label.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# One row per training column with the fitted model's importance score,
# ordered from most to least important.
feature_imp = (
    pd.DataFrame(
        {
            'variable': list(X_train.columns),
            'importance': clf.feature_importances_,
        }
    )
    .sort_values(by='importance', ascending=False)
)
feature_imp

In [None]:
# Horizontal bar chart of the importance score for each variable.
sns.catplot(
    data=feature_imp,
    x='importance',
    y='variable',
    kind='bar',
    orient='h',
)