# New model using an XGBoost classifier (XGBClassifier)


In [2]:
import numpy as np
import prep_data_2
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from xgboost import XGBClassifier
import pandas as pd

In [3]:
# Dataset locations. All input files sit in one folder, so the folder is
# written once and the individual paths are derived from it.
# NOTE(review): this is an absolute, machine-specific location; edit
# DATASETS_DIR (only) to run the notebook against a different copy of the data.
DATASETS_DIR = "C:/Users/joshc/OneDrive/Documents/01 Trying too hard/Machine Learning and AI/Kaggle/titanic/Datasets"

train_csv_path = f"{DATASETS_DIR}/train.csv"
summary_stats_path = f"{DATASETS_DIR}/summary stats.txt"
final_test_csv_path = f"{DATASETS_DIR}/test.csv"

# Build the prepared training frame with the project's prep_data_2 pipeline.
data_df = prep_data_2.main(train_csv_path=train_csv_path)

# Target column and the subset of prepared columns used as model inputs.
y_feature = "Survived"
X_features = ['Age', 'Fare', 'female', 'no friends or family', '10^class']

X = data_df[X_features]
y = data_df[y_feature]

In [4]:
# Baseline estimator: XGBClassifier constructed with no arguments.
model = XGBClassifier()

# Score the baseline with repeated stratified k-fold cross-validation
# (10 splits x 3 repeats, seeded so the splitter is configured identically on
# every run). cross_val_score returns one accuracy value per fold.
rskf = RepeatedStratifiedKFold(n_repeats=3, n_splits=10, random_state=1234)
n_scores = cross_val_score(model, X, y, cv=rskf, scoring='accuracy', n_jobs=-1)

# Report the mean accuracy and, in parentheses, its standard deviation.
accuracy_mean, accuracy_std = np.mean(n_scores), np.std(n_scores)
print('Accuracy: %.3f (%.3f)' % (accuracy_mean, accuracy_std))

Accuracy: 0.817 (0.033)


In [5]:
# Coarse grid search over tree depth and number of boosting rounds.
# Every (max_depth, n_estimators) pair is scored with the same CV splitter
# (`rskf`) and the same accuracy metric as the baseline.
# NOTE(review): the large n_estimators values make this cell slow, because each
# candidate is refit once per CV fold — consider trimming the grid.
max_depth_values = range(1,10)
n_estimators = [10, 50, 100, 500, 1000, 5000]

# One unfitted candidate per grid point; max_depth is the outer (slow) axis.
models = [
    XGBClassifier(max_depth=depth, n_estimators=n)
    for depth in max_depth_values
    for n in n_estimators
]

n_scores = []
# The loop variable is deliberately not named `model`, so the baseline
# estimator bound to that name in the earlier cell is not overwritten.
for candidate in models:
    mean_score = np.mean(cross_val_score(candidate, X, y, scoring='accuracy', cv=rskf, n_jobs=-1))
    n_scores.append({
        "mean_score": mean_score,
        "max_depth": candidate.max_depth,
        "n_estimators": candidate.n_estimators
        })
    print(f"Finished model: max_depth = {candidate.max_depth}, n_estimators = {candidate.n_estimators}")

# Explicit column names (instead of n_scores[0].keys()) keep this line from
# raising IndexError when the grid is empty; the column order is unchanged.
results_df = pd.DataFrame(n_scores, columns=["mean_score", "max_depth", "n_estimators"])
results_df

Finished model: max_depth = 1, n_estimators = 10
Finished model: max_depth = 1, n_estimators = 50
Finished model: max_depth = 1, n_estimators = 100
Finished model: max_depth = 1, n_estimators = 500
Finished model: max_depth = 1, n_estimators = 1000
Finished model: max_depth = 1, n_estimators = 5000
Finished model: max_depth = 2, n_estimators = 10
Finished model: max_depth = 2, n_estimators = 50
Finished model: max_depth = 2, n_estimators = 100
Finished model: max_depth = 2, n_estimators = 500
Finished model: max_depth = 2, n_estimators = 1000


In [None]:
import matplotlib.pyplot as plt

# Contour plot of mean CV accuracy over the (n_estimators, max_depth) grid.

# Axis values of the grid search, taken straight from the results table.
xs = np.array(results_df["max_depth"])
ys = np.array(results_df["n_estimators"])

# Coordinate matrices: n_estimators along x (columns), max_depth along y (rows).
x_cont, y_cont = np.meshgrid(np.unique(ys), np.unique(xs))

# Place every score in its grid cell by label (rows = max_depth, columns =
# n_estimators, both sorted ascending like np.unique above). Unlike reshaping
# the raw score column, this does not depend on the row order of results_df,
# and a missing grid point becomes NaN instead of a reshape error.
z_cont = results_df.pivot_table(index="max_depth", columns="n_estimators", values="mean_score").values

fig, ax = plt.subplots()
filled = ax.contourf(x_cont, y_cont, z_cont)
ax.set_xlabel("n_estimators")
ax.set_ylabel("max_depth")
# White labels and ticks — presumably chosen for a dark notebook theme.
ax.xaxis.label.set_color('white')
ax.yaxis.label.set_color('white')
ax.tick_params(axis='both', colors='white')

# Colour bar for the accuracy scale, with its tick labels in white as well.
color_bar = fig.colorbar(filled, ax=ax)
plt.setp(color_bar.ax.get_yticklabels(), color='w')

plt.show()

In [None]:
# Refined grid search: max_depth is held at 3 while n_estimators and
# learning_rate are swept. Each candidate is scored with the same CV splitter
# (`rskf`) and accuracy metric as before.
max_depth_values = [3]
n_estimators = [50, 75, 100, 125, 150]
learning_rates = [0.4, 0.3, 0.2, 0.1]

# One unfitted candidate per grid point; learning_rate is the innermost axis.
models = [
    XGBClassifier(max_depth=depth, n_estimators=n, learning_rate=learning_rate)
    for depth in max_depth_values
    for n in n_estimators
    for learning_rate in learning_rates
]

n_scores = []
# The loop variable is deliberately not named `model`, so an estimator bound
# to that name by an earlier cell is not overwritten.
for candidate in models:
    mean_score = np.mean(cross_val_score(candidate, X, y, scoring='accuracy', cv=rskf, n_jobs=-1))
    n_scores.append({
        "mean_score": mean_score,
        "max_depth": candidate.max_depth,
        "n_estimators": candidate.n_estimators,
        "learning_rate": candidate.learning_rate
        })
    print(f"Finished model: learning_rate = {candidate.learning_rate:.4f}, n_estimators = {candidate.n_estimators} --> mean_score = {mean_score:.3f}")

# Explicit column names (instead of n_scores[0].keys()) keep this line from
# raising IndexError when the grid is empty; the column order is unchanged.
results_df = pd.DataFrame(n_scores, columns=["mean_score", "max_depth", "n_estimators", "learning_rate"])

In [None]:
import matplotlib.pyplot as plt

# Contour plot of mean CV accuracy over the (learning_rate, n_estimators) grid.

# Sorted axis values: learning_rate runs along x, n_estimators along y.
X_unique = np.sort(results_df.learning_rate.unique())
Y_unique = np.sort(results_df.n_estimators.unique())
x_cont, y_cont = np.meshgrid(X_unique, Y_unique)

# Score table keyed by label; transposed so rows follow n_estimators and
# columns follow learning_rate, matching the meshgrid layout above.
score_table = results_df.pivot_table(values='mean_score', index='learning_rate', columns='n_estimators')
z_cont = score_table.T.values

fig, ax = plt.subplots()
filled = ax.contourf(x_cont, y_cont, z_cont)

# White labels and ticks — presumably chosen for a dark notebook theme.
ax.set_xlabel("learning_rate", color='white')
ax.set_ylabel("n_estimators", color='white')
ax.tick_params(axis='both', colors='white')

# Colour bar for the accuracy scale, with its tick labels in white as well.
color_bar = fig.colorbar(filled, ax=ax)
plt.setp(color_bar.ax.get_yticklabels(), color='w')

plt.show()

In [None]:
# Rank the grid points from highest to lowest mean CV accuracy.
results_df.sort_values("mean_score", ascending=False)

In [None]:
# Hyperparameters chosen for the final model, gathered in one place.
final_params = {"max_depth": 3, "n_estimators": 75, "learning_rate": 0.3}

# Train on the full training set (no hold-out): X and y are the complete
# feature matrix and target built from data_df.
final_model = XGBClassifier(**final_params)
final_model.fit(X, y)

In [None]:
# Prepare the test file with the same prep_data_2 pipeline used for training.
# (prep_data_2.main takes the path via its `train_csv_path` keyword, even for
# the test CSV.)
test_data_df = prep_data_2.main(train_csv_path=final_test_csv_path)
X_test = test_data_df[X_features]

# Predict with the final model on the same feature columns it was trained on.
y_pred = final_model.predict(X_test)

# Submission table: one row per passenger with the predicted Survived value.
y_pred_df = pd.DataFrame({
    "PassengerId": test_data_df["PassengerId"],
    "Survived": y_pred,
})

# Written relative to the notebook's working directory, without the index.
y_pred_df.to_csv("Datasets/submission 8.csv", index=False)

In [None]:
# Next attempt: use all columns of the prepared dataframe. Start by listing
# them with their non-null counts and dtypes.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 22 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   PassengerId           891 non-null    int64  
 1   Survived              891 non-null    int64  
 2   Pclass                891 non-null    int64  
 3   Age                   891 non-null    float64
 4   SibSp                 891 non-null    int64  
 5   Parch                 891 non-null    int64  
 6   Fare                  891 non-null    float64
 7   female                891 non-null    int32  
 8   male                  891 non-null    int32  
 9   child                 891 non-null    int32  
 10  15 - 30               891 non-null    int32  
 11  30 - 50               891 non-null    int32  
 12  50+                   891 non-null    int32  
 13  C                     891 non-null    int32  
 14  Q                     891 non-null    int32  
 15  S                     8