In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the labelled training data; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer="dataset/train.csv")

In [3]:
from sklearn.model_selection import StratifiedShuffleSplit

# A fixed seed keeps the train/test partition identical across
# "Restart Kernel -> Run All", so downstream scores are comparable between runs.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Stratify on the joint (Survived, Pclass, Sex) combination so both subsets
# keep similar proportions of each combination.
# n_splits=1, so the generator yields exactly one (train, test) pair.
train_indices, test_indices = next(
    split.split(data, data[["Survived", "Pclass", "Sex"]])
)

# split() returns *positional* indices, so select rows with .iloc; .loc only
# happens to work while `data` carries the default RangeIndex.
# .copy() makes the subsets independent of `data`.
strat_train_set = data.iloc[train_indices].copy()
strat_test_set = data.iloc[test_indices].copy()

In [4]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

class AgeImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in the ``Age`` column with a mean learned in ``fit``.

    The mean is learned once from the frame passed to ``fit`` and then reused
    by every ``transform`` call, so held-out data is imputed with the training
    statistic rather than its own.
    """

    def fit(self, X, y=None):
        """Learn the mean of ``X["Age"]``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing an ``Age`` column.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.imputer_ = SimpleImputer(strategy="mean").fit(X[["Age"]])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing ``Age`` values filled.

        The input frame is not modified.
        """
        # Work on a copy so the caller's frame is never mutated.
        X = X.copy()

        imputer = getattr(self, "imputer_", None)
        if imputer is None:
            # Backward compatibility: transform() used to work without a prior
            # fit() by learning the mean from the frame being transformed.
            imputer = SimpleImputer(strategy="mean").fit(X[["Age"]])

        X["Age"] = imputer.transform(X[["Age"]])
        return X

class FeatureEncoder(BaseEstimator, TransformerMixin):
    """Encode ``Sex`` as integer labels and one-hot encode ``Embarked``.

    Both encoders are learned in ``fit`` and reused in ``transform``, so the
    label mapping and the set/order of one-hot columns stay identical across
    every dataset transformed with the same fitted instance.
    """

    def fit(self, X, y=None):
        """Learn the ``Sex`` label mapping and the ``Embarked`` categories.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing ``Sex`` and ``Embarked`` columns.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        self.sex_encoder_ = LabelEncoder().fit(X["Sex"])
        # handle_unknown="ignore": an Embarked value not seen during fit is
        # encoded as all zeros instead of raising at transform time.
        self.embarked_encoder_ = OneHotEncoder(handle_unknown="ignore").fit(X[["Embarked"]])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Sex`` encoded and ``Embarked_*`` columns appended.

        The original ``Embarked`` column is kept; the input frame is not
        modified. ``LabelEncoder.transform`` raises ``ValueError`` if ``Sex``
        contains a value that was not present during ``fit``.
        """
        # Work on a copy so the caller's frame is never mutated.
        X = X.copy()

        sex_encoder = getattr(self, "sex_encoder_", None)
        embarked_encoder = getattr(self, "embarked_encoder_", None)
        if sex_encoder is None or embarked_encoder is None:
            # Backward compatibility: transform() used to work without a prior
            # fit() by learning the encodings from the frame being transformed.
            sex_encoder = LabelEncoder().fit(X["Sex"])
            embarked_encoder = OneHotEncoder(handle_unknown="ignore").fit(X[["Embarked"]])

        X["Sex"] = sex_encoder.transform(X["Sex"])

        matrix = embarked_encoder.transform(X[["Embarked"]]).toarray()
        # Reuse X's index so the concat below aligns row-for-row.
        encoded = pd.DataFrame(
            matrix,
            columns=embarked_encoder.get_feature_names_out(["Embarked"]),
            index=X.index,
        )

        return pd.concat([X, encoded], axis=1)

class FeatureDropper(BaseEstimator, TransformerMixin):
    """Remove columns that are not passed on as model features."""

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` for scikit-learn API compatibility."""
        return self

    def transform(self, X):
        """Return ``X`` without the listed columns.

        Columns from the list that are absent in ``X`` are skipped silently
        (``errors="ignore"``).
        """
        unwanted = ["PassengerId", "Embarked", "Name", "Ticket", "Cabin", "Embarked_nan"]
        return X.drop(columns=unwanted, errors="ignore")

In [None]:
from sklearn.pipeline import Pipeline

# Preprocessing chain, applied in order: impute Age, encode Sex/Embarked,
# then drop the columns that are not used as features.
pipeline = Pipeline(
    steps=[
        ("age_imputer", AgeImputer()),
        ("feature_encoder", FeatureEncoder()),
        ("feature_dropper", FeatureDropper()),
    ]
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Run the preprocessing pipeline on the training split (fits and transforms).
strat_train_set = pipeline.fit_transform(strat_train_set)

# Target vector as a plain NumPy array.
y_data_train = strat_train_set["Survived"].to_numpy()

# Feature matrix: every remaining column except the target, standardised
# with a scaler fitted on the training split.
scaler = StandardScaler()
X_data_train = scaler.fit_transform(strat_train_set.drop(columns=["Survived"]))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Seed the forest so the cross-validated scores, and therefore the selected
# hyperparameters, are reproducible across notebook runs.
rf = RandomForestClassifier(random_state=42)

# Hyperparameter grid explored exhaustively by the search below.
param_grid = [{
        "n_estimators": [10, 100, 200, 500, 750],
        "max_depth": [None, 5, 10, 12],
        "min_samples_split": [2, 3, 4]
    }]

# 3-fold cross-validated grid search, ranked by accuracy.
grid_search = GridSearchCV(rf, param_grid, cv=3, scoring="accuracy")
grid_search.fit(X_data_train, y_data_train)

In [None]:
best_rf = grid_search.best_estimator_

# Apply the preprocessing pipeline that was already fitted on the training
# split; do not refit it on the held-out data.
strat_test_set = pipeline.transform(strat_test_set)

X_data_test = strat_test_set.drop(['Survived'], axis=1)
y_data_test = strat_test_set['Survived']

# Reuse the scaler fitted on the training features. Fitting a fresh scaler on
# the test split would standardise it with different mean/variance than the
# data the model was trained on, making the score below unreliable.
X_data_test = scaler.transform(X_data_test)
y_data_test = y_data_test.to_numpy()

# Accuracy of the tuned model on the held-out split (displayed as cell output).
best_rf.score(X_data_test, y_data_test)

In [None]:
# Refit the preprocessing pipeline on the complete labelled dataset for the
# final model.
final_data = pipeline.fit_transform(data)

X_final = final_data.drop(['Survived'],axis=1)
y_final = final_data['Survived']

# This scaler is fitted on all labelled data; keep it bound to `scaler` so the
# same scaling can be applied to unseen data afterwards.
scaler = StandardScaler()

X_data_final = scaler.fit_transform(X_final)
y_data_final = y_final.to_numpy()

# Seed the forest so the search result and the final model are reproducible.
final_rf = RandomForestClassifier(random_state=42)

param_grid = [{
        "n_estimators": [10, 100, 200, 500, 750],
        "max_depth": [None, 5, 10, 12],
        "min_samples_split": [2, 3, 4]
    }]

# 3-fold cross-validated grid search, ranked by accuracy.
grid_search = GridSearchCV(final_rf, param_grid, cv=3, scoring="accuracy")
grid_search.fit(X_data_final, y_data_final)

best_final_rf = grid_search.best_estimator_

In [None]:
# NOTE(review): this path is rooted at "files/" while the training CSV is read
# from "dataset/train.csv" -- confirm both resolve from the same working directory.
test_data = pd.read_csv("files/dataset/test.csv")

# Apply the preprocessing pipeline and the scaler exactly as they were fitted
# on the full labelled data in the previous cell. Refitting either on the
# unlabeled file would feed the model features scaled differently from the
# ones it was trained on.
X_test = pipeline.transform(test_data)
X_test = scaler.transform(X_test)

In [None]:
# Predict with the model tuned on the full labelled data.
predictions = best_final_rf.predict(X_test)

# Two-column frame: the identifier from the raw test file plus the predicted label.
predictions_df = test_data[["PassengerId"]].assign(Survived=predictions)

# Write without the index column.
predictions_df.to_csv("files/dataset/predictions.csv", index=False)