In [65]:
# import dependencies
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")


# Clean, Process, and Split Data

In [9]:
# read in the data 
csv = Path("titanic.csv")
titanic_df = pd.read_csv(csv)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0000,0.0,0.0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1.0,2.0,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1.0,2.0,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3.0,0.0,"Zabour, Miss. Thamine",female,,1.0,0.0,2665,14.4542,,C,,,
1306,3.0,0.0,"Zakarian, Mr. Mapriededer",male,26.5000,0.0,0.0,2656,7.2250,,C,,304.0,
1307,3.0,0.0,"Zakarian, Mr. Ortin",male,27.0000,0.0,0.0,2670,7.2250,,C,,,
1308,3.0,0.0,"Zimmerman, Mr. Leo",male,29.0000,0.0,0.0,315082,7.8750,,S,,,


In [11]:
#drop unecessary columns
titanic_df = titanic_df.drop(["name","ticket","cabin","boat","body","home.dest"], axis = 1)


Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1.0,1.0,female,29.0,0.0,0.0,211.3375,S
1,1.0,1.0,male,0.9167,1.0,2.0,151.55,S
2,1.0,0.0,female,2.0,1.0,2.0,151.55,S
3,1.0,0.0,male,30.0,1.0,2.0,151.55,S
4,1.0,0.0,female,25.0,1.0,2.0,151.55,S


In [12]:
#get rid of missing values 
new_df = titanic_df.dropna()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1.0,1.0,female,29.0000,0.0,0.0,211.3375,S
1,1.0,1.0,male,0.9167,1.0,2.0,151.5500,S
2,1.0,0.0,female,2.0000,1.0,2.0,151.5500,S
3,1.0,0.0,male,30.0000,1.0,2.0,151.5500,S
4,1.0,0.0,female,25.0000,1.0,2.0,151.5500,S
...,...,...,...,...,...,...,...,...
1301,3.0,0.0,male,45.5000,0.0,0.0,7.2250,C
1304,3.0,0.0,female,14.5000,1.0,0.0,14.4542,C
1306,3.0,0.0,male,26.5000,0.0,0.0,7.2250,C
1307,3.0,0.0,male,27.0000,0.0,0.0,7.2250,C


In [15]:
new_df["pclass"] = new_df["pclass"].astype("int")
new_df["survived"] = new_df["survived"].astype("int")


Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.0000,0.0,0.0,211.3375,S
1,1,1,male,0.9167,1.0,2.0,151.5500,S
2,1,0,female,2.0000,1.0,2.0,151.5500,S
3,1,0,male,30.0000,1.0,2.0,151.5500,S
4,1,0,female,25.0000,1.0,2.0,151.5500,S
...,...,...,...,...,...,...,...,...
1301,3,0,male,45.5000,0.0,0.0,7.2250,C
1304,3,0,female,14.5000,1.0,0.0,14.4542,C
1306,3,0,male,26.5000,0.0,0.0,7.2250,C
1307,3,0,male,27.0000,0.0,0.0,7.2250,C


In [16]:
#get dummy varibles for Sex and Embarkment columns 
dummy_df = pd.get_dummies(new_df)


Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1,1,29.0000,0.0,0.0,211.3375,1,0,0,0,1
1,1,1,0.9167,1.0,2.0,151.5500,0,1,0,0,1
2,1,0,2.0000,1.0,2.0,151.5500,1,0,0,0,1
3,1,0,30.0000,1.0,2.0,151.5500,0,1,0,0,1
4,1,0,25.0000,1.0,2.0,151.5500,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
1301,3,0,45.5000,0.0,0.0,7.2250,0,1,1,0,0
1304,3,0,14.5000,1.0,0.0,14.4542,1,0,1,0,0
1306,3,0,26.5000,0.0,0.0,7.2250,0,1,1,0,0
1307,3,0,27.0000,0.0,0.0,7.2250,0,1,1,0,0


In [19]:
#split our features from our target
y = dummy_df["survived"]
X = dummy_df.drop("survived",axis = 1)


In [20]:
#split data into training and testing 
X_train, X_test, y_train, y_test = train_test_split(X, y)

# First Model Attempt : Logistic Regression

In [40]:
#create a logistic regression model
logistic_regression_model = LogisticRegression(random_state=1912)

# Fit and save the logistic regression model using the training data
logistic_regression_model.fit(X_train, y_train)

In [41]:
#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)

In [42]:
from sklearn.metrics import confusion_matrix
#create and save the confusion matrix for the testing data 
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[123  39]
 [ 29  70]]


In [25]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.81      0.76      0.78       162
           1       0.64      0.71      0.67        99

    accuracy                           0.74       261
   macro avg       0.73      0.73      0.73       261
weighted avg       0.75      0.74      0.74       261



Our first attempt at a model, plain logistic regression, yielded a 74% accuracy

## Optimization attempt 1 for Logistic Regression
### Attempting PCA

In [27]:
from sklearn.decomposition import PCA

In [33]:
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [43]:
pca_log_model = LogisticRegression(random_state=1912)

# Fit and save the logistic regression model using the training data
pca_log_model.fit(X_train_pca, y_train)

In [45]:
#Generate testing predictions
pca_predictions = pca_log_model.predict(X_test_pca)

In [46]:
pca_matrix = confusion_matrix(y_test, pca_predictions)

# Print the confusion matrix for the testing data
print(pca_matrix)


[[141  21]
 [ 79  20]]


In [47]:
pca_report = classification_report(y_test, pca_predictions)

# Print the testing classification report
print(pca_report)

              precision    recall  f1-score   support

           0       0.64      0.87      0.74       162
           1       0.49      0.20      0.29        99

    accuracy                           0.62       261
   macro avg       0.56      0.54      0.51       261
weighted avg       0.58      0.62      0.57       261



# Second Model Attempt: Decision Tree

In [51]:
y_dt = dummy_df["survived"].values.reshape(-1, 1)
X_dt = X
X_trainDT, X_testDT, y_trainDT, y_testDT = train_test_split(X_dt, y_dt, random_state=1912)

In [55]:
scaler = StandardScaler()
X_scaler = scaler.fit(X_trainDT)
X_train_scaled = X_scaler.transform(X_trainDT)
X_test_scaled = X_scaler.transform(X_testDT)

In [60]:
model = tree.DecisionTreeClassifier()
model = model.fit(X_train_scaled, y_trainDT)
predictionsDT = model.predict(X_test_scaled)

In [63]:
confusion_matrix(y_testDT, predictionsDT)

array([[123,  29],
       [ 31,  78]], dtype=int64)

In [64]:
accuracy_score(y_testDT, predictionsDT)

0.7701149425287356

This attempt was better, improving our accuracy by 2 percentage points. 

# Third Model Attempt: Random Forests

In [66]:
y_rf = dummy_df["survived"].values.reshape(-1, 1)
X_rf = X
X_trainRF, X_testRF, y_trainRF, y_testRF = train_test_split(X_rf, y_rf, random_state=1912)

In [67]:
scaler = StandardScaler()
X_scalerRF = scaler.fit(X_trainRF)
X_train_scaledRF = X_scalerRF.transform(X_trainRF)
X_test_scaledRF = X_scalerRF.transform(X_testRF)

In [83]:
rf_model = RandomForestClassifier(n_estimators=1000, random_state=78)

In [84]:
rf_model = rf_model.fit(X_train_scaledRF, y_trainRF.ravel())

In [85]:
predictionsRF = rf_model.predict(X_test_scaledRF)

In [86]:
confusion_matrix(y_testRF, predictionsRF)

array([[130,  22],
       [ 32,  77]], dtype=int64)

In [87]:
accuracy_score(y_testRF,predictionsRF)

0.7931034482758621

## Assessing Feature Importance 

In [90]:
# Get the feature importance array
importances = rf_model.feature_importances_
# List the top 10 most important features
importances_sorted = sorted(zip(rf_model.feature_importances_, X_rf.columns), reverse=True)
importances_sorted[:10]

[(0.2836685202183342, 'age'),
 (0.2498147570100119, 'fare'),
 (0.13480812509836776, 'sex_male'),
 (0.13347118133586452, 'sex_female'),
 (0.08319460262126055, 'pclass'),
 (0.04485672636902161, 'sibsp'),
 (0.03334084537327364, 'parch'),
 (0.020209775490332337, 'embarked_C'),
 (0.010237745859518865, 'embarked_S'),
 (0.0063977206240144685, 'embarked_Q')]

After determining that the data on embarkment was not essential, we will try dropping it and running the RF model again

## Optimization Attempt 1 for Random Forest

In [91]:
y = y_rf
X = dummy_df.drop(["survived","embarked_C","embarked_S","embarked_Q"],axis = 1)

In [92]:
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [93]:
scaler = StandardScaler()
X_scalerRF = scaler.fit(X_train)
X_train_scaledRF = X_scalerRF.transform(X_train)
X_test_scaledRF = X_scalerRF.transform(X_test)

In [95]:
rf_model = RandomForestClassifier(n_estimators=1000, random_state=1912)
rf_model = rf_model.fit(X_train_scaledRF, y_train.ravel())
predictionsRF = rf_model.predict(X_test_scaledRF)

In [96]:
confusion_matrix(y_test, predictionsRF)

array([[130,  28],
       [ 27,  76]], dtype=int64)

In [97]:
accuracy_score(y_test,predictionsRF)

0.789272030651341