# Final Model: Random Forest

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the passenger records from titanic.csv (path is relative to the working directory)
csv = Path("titanic.csv")
titanic_df = pd.read_csv(filepath_or_buffer=csv)
titanic_df.head(5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
# Discard the columns that are not used as model features
titanic_df = titanic_df.drop(
    columns=["name", "ticket", "cabin", "boat", "body", "home.dest"],
)

In [4]:
# Keep only the passengers that have no missing value in any remaining column
new_df = titanic_df.dropna(axis=0, how="any")

In [5]:
# One-hot encode the remaining text columns (sex, embarked) into indicator columns
dummy_df = pd.get_dummies(data=new_df)
dummy_df.head(5)

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1.0,1.0,29.0,0.0,0.0,211.3375,1,0,0,0,1
1,1.0,1.0,0.9167,1.0,2.0,151.55,0,1,0,0,1
2,1.0,0.0,2.0,1.0,2.0,151.55,1,0,0,0,1
3,1.0,0.0,30.0,1.0,2.0,151.55,0,1,0,0,1
4,1.0,0.0,25.0,1.0,2.0,151.55,1,0,0,0,1


In [7]:
# Separate the target ("survived") from the features, then hold out a test set.
# The target is kept as a single-column 2-D array; it is flattened where a 1-D
# vector is needed.
y_rf = dummy_df.loc[:, "survived"].values.reshape(-1, 1)
X_rf = dummy_df.drop(columns="survived")
X_trainRF, X_testRF, y_trainRF, y_testRF = train_test_split(
    X_rf,
    y_rf,
    random_state=1912,
)

In [8]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
X_scalerRF = scaler.fit(X_trainRF)
X_train_scaledRF, X_test_scaledRF = (
    X_scalerRF.transform(split) for split in (X_trainRF, X_testRF)
)

In [9]:
# Build the random forest classifier: 1000 trees with a fixed seed
rf_model = RandomForestClassifier(
    random_state=78,
    n_estimators=1000,
)

In [10]:
# Train the forest on the scaled training features (labels flattened to 1-D)
rf_model = rf_model.fit(
    X_train_scaledRF,
    y_trainRF.reshape(-1),
)

In [11]:
# Predict survival for the held-out passengers from their scaled features
predictionsRF = rf_model.predict(X=X_test_scaledRF)

In [12]:
# Fraction of held-out passengers whose predicted label matches the true label
accuracy_score(y_true=y_testRF, y_pred=predictionsRF)

0.7931034482758621

In [15]:
# Pair each held-out passenger's (unscaled) features with the true and the
# predicted label, then export the table to CSV.
pred_v_surv = X_testRF.assign(
    actual_survival=y_testRF,
    predicted_survival=predictionsRF,
)
pred_v_surv.to_csv("Model_Predictions.csv")

In [65]:
# Hand-recorded accuracy of every modelling attempt, saved as a summary table.
# The two lists are parallel: accuracies[i] belongs to models[i].
models = [
    "Logistic Regression",
    "Logistic w/ PCA",
    "Decision Tree",
    "Random Forest",
    "Random Forest w/ dropping features",
    "K-Nearest Neighbors",
    "K-Nearest Neighbors w/ dropping features",
    "Neural Network w/ 4 layers",
]
accuracies = [0.74, 0.67, 0.78, 0.79, 0.76, 0.78, 0.75, 0.77]

attempts_df = pd.DataFrame(
    list(zip(models, accuracies)),
    columns=["Model", "Accuracy"],
)
attempts_df.to_csv("Model_Progression.csv")

In [80]:
def _ask_number(prompt, cast, allowed=None):
    """Prompt until the reply parses with ``cast`` and is an acceptable value.

    Parameters
    ----------
    prompt : str
        Text shown to the user.
    cast : callable
        ``int`` or ``float``; must raise ``ValueError`` on unparsable text.
    allowed : collection, optional
        When given, the parsed value must be one of these values.

    Returns
    -------
    The parsed, finite, non-negative number.
    """
    while True:
        reply = input(prompt).strip()
        try:
            value = cast(reply)
        except ValueError:
            print("Please enter a valid number")
            continue
        # Written as a chained comparison so that NaN and infinity are rejected too.
        if not (0 <= value < float("inf")):
            print("Please enter a number that is zero or greater")
            continue
        if allowed is not None and value not in allowed:
            print("Please choose one of: " + ", ".join(str(option) for option in allowed))
            continue
        return value


def _ask_choice(prompt, choices, retry_message):
    """Prompt until the reply (trimmed, case-insensitive) is a key of ``choices``.

    Returns the value that ``choices`` maps the accepted reply to.
    """
    while True:
        reply = input(prompt).strip().lower()
        if reply in choices:
            return choices[reply]
        print(retry_message)


def predict_titanic():
    """Interactively collect one passenger's details and predict their fate.

    Asks seven questions on standard input, re-asking any question whose
    answer is not usable, builds a one-row feature frame, scales it with the
    module-level ``X_scalerRF`` and classifies it with the module-level
    ``rf_model``.

    Returns
    -------
    str
        "You survived the sinking of the Titanic" when the predicted label is
        greater than 0, otherwise
        "You have perished in the sinking of the Titanic".
    """
    # Numeric answers are converted up front so the scaler receives numbers
    # rather than the raw strings returned by input().
    pclass = _ask_number("What class are you travelling in> (1,2,or 3)", int, allowed=(1, 2, 3))
    age = _ask_number("How old are you?", float)
    sibsp = _ask_number("How many sibglings and/or spouses are with you?", int)
    parch = _ask_number("How many parents and/ or children are with you?", int)
    fare = _ask_number("How much did you pay for your ticket?", float)

    # Upper- and lower-case answers are both accepted; an unusable answer is
    # asked again instead of silently producing all-zero indicator columns.
    sex = _ask_choice(
        "What is your sex? (F/M)",
        {"f": "female", "m": "male"},
        "Please choose either M for male or F for female",
    )
    embarked = _ask_choice(
        "Where did you embark from? (Queenstown,Cherbourg, or Southampton)",
        {"cherbourg": "C", "queenstown": "Q", "southampton": "S"},
        "Please make sure you spelled your embarkation point correctly",
    )

    # NOTE(review): the column names and their order must match the features
    # the scaler and the model were fitted on - confirm against the training frame.
    x_data = pd.DataFrame(
        {
            "pclass": pclass,
            "age": age,
            "sibsp": sibsp,
            "parch": parch,
            "fare": fare,
            "sex_female": int(sex == "female"),
            "sex_male": int(sex == "male"),
            "embarked_C": int(embarked == "C"),
            "embarked_Q": int(embarked == "Q"),
            "embarked_S": int(embarked == "S"),
        },
        index=[0],
    )
    X_scaler = X_scalerRF.transform(x_data)
    prediction = rf_model.predict(X_scaler)
    if prediction[0] > 0:
        return "You survived the sinking of the Titanic"
    return "You have perished in the sinking of the Titanic"
    

In [None]:
# Interactive demo: asks for passenger details on standard input and displays
# the verdict string returned by predict_titanic().
predict_titanic()
    
        
        