In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
 

# Load the training data; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv("data/train.csv")

In [None]:
def show_accuracy_confusion(model, X_test, y_test):
    """Report how a fitted model performs on held-out data.

    Prints the accuracy of ``model.predict(X_test)`` against ``y_test`` and
    draws the matching confusion matrix.

    Parameters
    ----------
    model : object
        Fitted estimator (or search object) exposing ``predict``.
    X_test : array-like
        Features to predict on.
    y_test : array-like
        True labels for ``X_test``.
    """
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy: " + str(accuracy))

    matrix = confusion_matrix(y_test, y_pred)
    ConfusionMatrixDisplay(matrix).plot()

In [None]:
def fill_values(df):
    """Impute missing ``Age`` values in place.

    Each missing age is replaced by the median age of the passenger's
    (``Pclass``, ``Sex``) group.  If a group has no observed age at all, its
    median is NaN as well, so those rows fall back to the median of every
    observed age in ``df``.  Ages that are already present are never changed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Pclass``, ``Sex`` and ``Age`` columns.  The ``Age``
        column is overwritten on this very object; nothing is returned.
    """
    # Taken before the assignment below so it reflects observed ages only.
    overall_median = df["Age"].median()

    # Median age of each row's (Pclass, Sex) group, aligned to df's index.
    group_median = df.groupby(["Pclass", "Sex"])["Age"].transform("median")

    df["Age"] = df["Age"].fillna(group_median).fillna(overall_median)

In [None]:
def split_data(df):
    """Separate a labelled frame into model inputs and the target.

    Returns
    -------
    list
        ``[X, y]`` where ``X`` is ``df`` without the ``PassengerId``,
        ``Survived`` and ``Embarked`` columns and ``y`` is the ``Survived``
        column.
    """
    target = df["Survived"]
    features = df.drop(columns=["PassengerId", "Survived", "Embarked"])

    return [features, target]

In [None]:
# Split the training frame once and unpack features and target, then impute
# the missing ages on the feature frame (fill_values works in place).
X, y = split_data(df)

fill_values(X)

In [None]:
def get_titles_series(X):
    """Extract the honorific from each passenger name, keeping the common ones.

    The title is the text between the first comma and the following period of
    ``Name`` (``"Braund, Mr. Owen"`` -> ``" Mr"``).  The leading space left by
    the split is kept on purpose: the dummy column names derived from these
    values (e.g. ``"Title_ Master"``) are referenced by ``isminor``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with a string ``Name`` column.

    Returns
    -------
    pandas.Series
        Titles for the rows whose title is exactly Mr, Mrs, Miss or Master,
        carrying those rows' index labels.  All other rows, including names
        from which no title can be parsed, are left out.
    """
    titles = X["Name"].str.split(',').str.get(1).str.split('.').str.get(0)

    relevant_titles = ['Mr', 'Mrs', 'Miss', 'Master']

    # Exact membership on the stripped title rather than a regex substring
    # search.  isin() maps unparsable (NaN) titles to False, so the boolean
    # mask never contains NaN, which .loc would reject.
    is_relevant = titles.str.strip().isin(relevant_titles)

    return titles.loc[is_relevant]

In [None]:
def encode_data(X):
    """One-hot encode the categorical features and drop the free-text columns.

    A ``Title`` feature is derived from ``Name`` via ``get_titles_series``;
    ``Pclass``, ``Sex`` and ``Title`` are then expanded into dummy columns and
    ``Name``, ``Ticket`` and ``Cabin`` are removed.

    The input frame is not modified.  ``Title`` is encoded against a fixed
    category list, so the four ``Title_*`` columns are always produced in the
    same order, even when a title does not occur in ``X``.  Rows whose title
    is not one of the four get zeros in every ``Title_*`` column.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame with ``Name``, ``Ticket``, ``Cabin``, ``Pclass`` and ``Sex``
        columns.

    Returns
    -------
    pandas.DataFrame
        New, encoded frame.
    """
    # Alphabetical order matches the column order get_dummies yields for
    # plain string values; the leading space comes from get_titles_series.
    title_categories = [" Master", " Miss", " Mr", " Mrs"]

    # get_titles_series returns only the matching rows; realign to every row.
    titles = get_titles_series(X).reindex(X.index)
    # Blank out anything outside the fixed categories before the cast.
    titles = titles.where(titles.isin(title_categories))
    titles = titles.astype(pd.CategoricalDtype(categories=title_categories))

    # assign() builds a new frame, leaving the caller's X untouched.
    X_titled = X.assign(Title=titles)

    X_enc = pd.get_dummies(X_titled, columns=["Pclass", "Sex", "Title"])
    X_enc = X_enc.drop(["Name", "Ticket", "Cabin"], axis="columns")

    return X_enc

In [None]:
def isminor(row):
    """Flag a passenger row as a minor.

    Returns 1 when the row's ``Age`` is below 18 or its ``Title_ Master``
    indicator equals 1, and 0 otherwise.  The title indicator is only looked
    up when the age test does not already succeed.
    """
    under_18 = row["Age"] < 18
    if under_18:
        return 1

    return 1 if row["Title_ Master"] == 1 else 0

def attach_isminor(X_enc):
    """Add a 0/1 ``isminor`` column to ``X_enc`` and return the same frame.

    A passenger counts as a minor when ``Age`` is below 18 or the
    ``Title_ Master`` indicator equals 1.  The flag is computed with
    column-wise operations rather than a Python call per row, which also
    makes an empty frame work (it simply gets an empty ``isminor`` column).

    Parameters
    ----------
    X_enc : pandas.DataFrame
        Encoded frame with ``Age`` and ``Title_ Master`` columns.  It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        ``X_enc`` itself, now carrying the ``isminor`` column.
    """
    # A missing Age compares False, so such rows rely on the title alone.
    is_minor = (X_enc["Age"] < 18) | (X_enc["Title_ Master"] == 1)

    X_enc["isminor"] = is_minor.astype("int64")
    return X_enc

In [None]:
def prepare_data(X):
    """Run the full feature-preparation chain on a raw feature frame.

    ``X`` is first encoded with ``encode_data`` and the result is then
    extended with the ``isminor`` flag by ``attach_isminor``.

    Returns the prepared frame.
    """
    return attach_isminor(encode_data(X))

# Build the model-ready feature matrix from the age-imputed training features.
X_enc = prepare_data(X)

In [None]:
# Preview the first ten rows of the encoded feature matrix.
X_enc.head(10)

In [None]:
# Baseline model: standardise the features, then fit an SVC with its default
# hyper-parameters.
default_svc = Pipeline([
    ("scaler", StandardScaler()),
    ("svc", SVC())
])

# Hold out part of the data for evaluation.  random_state fixes the shuffle so
# the split is the same on every run; no test_size is passed, so the
# train_test_split default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X_enc, y, random_state=38)

In [None]:
# Fit the baseline pipeline on the training split.
default_svc.fit(X_train, y_train)
# NOTE(review): `predictions` is not read again in the visible cells;
# show_accuracy_confusion computes its own predictions from the model.
predictions = default_svc.predict(X_test)

show_accuracy_confusion(default_svc, X_test, y_test)

In [None]:
# Random forest with explicitly chosen hyper-parameters.  random_state pins the
# forest's internal randomness so that re-running the notebook reproduces the
# same scores; 38 is the seed already used for the train/test split.
forest = RandomForestClassifier(criterion="gini", max_depth=7, n_estimators=200, max_features=None, random_state=38)
# Polynomial-kernel SVC with explicitly chosen hyper-parameters.
# NOTE(review): unlike default_svc, this SVC is fitted on unscaled features;
# confirm that is intended.
svc = SVC(kernel="poly", gamma=0.001, C = 2, coef0=1, degree=3)

# Fit both models on the training split.
for model in [forest, svc]:
    model.fit(X_train, y_train)

In [None]:
# Hold-out accuracy and confusion matrix for the explicitly configured forest.
show_accuracy_confusion(forest, X_test, y_test)

In [None]:
# Hold-out accuracy and confusion matrix for the polynomial-kernel SVC.
show_accuracy_confusion(svc, X_test, y_test)

In [None]:
# Grid search over random-forest hyper-parameters.
# The forest is seeded so that cross-validation scores, and therefore the
# selected best_params_, are reproducible between runs; 38 is the seed already
# used for the train/test split.
forest_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("forest", RandomForestClassifier(random_state=38))
])

# Keys use the "<step name>__<parameter>" form so they reach the forest step.
forest_grid = {
    "forest__n_estimators": [200, 300, 320],
    "forest__max_depth": [4, 5, 6, 7],
    "forest__criterion": ["gini", "entropy"],
    "forest__max_features": [None, "sqrt", "log2"]
}

# 72 parameter combinations x 10 folds: n_jobs=-1 spreads the fits over all
# available cores without affecting the scores.
forcv = GridSearchCV(estimator=forest_pipe, param_grid=forest_grid, cv = 10, scoring="accuracy", n_jobs=-1)

forcv.fit(X_train, y_train)
forcv.best_params_

In [None]:
# Evaluate the forest grid search on the hold-out split; predict() on the
# search object goes through the estimator it selected.
show_accuracy_confusion(forcv, X_test, y_test)

In [None]:
# Grid search over linear models trained with stochastic gradient descent.
# SGDClassifier is seeded so that the cross-validation scores, and with them
# the selected best_params_, are reproducible between runs; 38 is the seed
# already used for the train/test split.
sgd_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("sgd", SGDClassifier(random_state=38))
])

# Keys use the "<step name>__<parameter>" form so they reach the sgd step.
# NOTE(review): per the scikit-learn docs eta0 is not used by the "optimal"
# schedule and l1_ratio only matters for the elasticnet penalty, so part of
# this grid evaluates equivalent models; confirm before trimming it.
sgd_params = {
    "sgd__loss": ["hinge", "log_loss", "perceptron", "modified_huber"],
    "sgd__penalty": ["l2", "elasticnet", "l1", None],
    "sgd__learning_rate": ["adaptive", "optimal"],
    "sgd__n_jobs": [-1],
    "sgd__eta0" :[1, 2.15, 2.25, 2.5],
    "sgd__alpha": [0.0001, 0.01, 0.1],
    "sgd__l1_ratio": [0.15]
    
}

sgd_opt = GridSearchCV(estimator=sgd_pipe, cv=10, scoring="accuracy", param_grid=sgd_params)
sgd_opt.fit(X_train, y_train)
sgd_opt.best_params_

In [None]:
# Evaluate the SGD grid search on the hold-out split; predict() on the search
# object goes through the estimator it selected.
show_accuracy_confusion(sgd_opt, X_test, y_test)

In [None]:
# Two fixed SGD configurations to compare side by side.  Both classifiers are
# seeded so their hold-out scores are reproducible between runs; 38 is the seed
# already used for the train/test split.
pipe1 = Pipeline([
    ("scale", StandardScaler()),
    ("sgd1", SGDClassifier(alpha=0.01, eta0=2.5, l1_ratio=0.15, learning_rate='optimal', loss='log_loss',n_jobs=-1, penalty=None, random_state=38))
])

# The step keeps the name "sgd1" here as well; it is part of the pipeline's
# parameter names, so it is left unchanged.
pipe2 = Pipeline([
    ("scale", StandardScaler()),
    ("sgd1", SGDClassifier(alpha=0.01, eta0=2.25, l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',n_jobs=-1, penalty='l1', random_state=38))
])

In [None]:
# Fit both fixed-parameter SGD pipelines on the training split.
for pipe in [pipe1, pipe2]:
    pipe.fit(X_train, y_train)

In [None]:
# Hold-out accuracy and confusion matrix for the log_loss SGD pipeline.
show_accuracy_confusion(pipe1, X_test, y_test)

In [None]:
# Hold-out accuracy and confusion matrix for the modified_huber SGD pipeline.
show_accuracy_confusion(pipe2, X_test, y_test)

In [None]:
# Load the data set to generate predictions for; the relative path is resolved
# against the kernel's current working directory.
to_predict = pd.read_csv("data/test.csv")

# Preview the first rows.
to_predict.head()

In [None]:
# Impute missing ages on the prediction frame (fill_values works in place).
fill_values(to_predict)

pclass_grp = to_predict.groupby("Pclass")
# Median fare of the Pclass == 3 group.
median_fare_third_class = pclass_grp["Fare"].median()[3]

# Restrict the assignment to the Fare column so that other missing cells in
# the affected rows (e.g. Cabin) are not overwritten with a fare value.
# NOTE(review): this assumes every passenger with a missing fare travelled
# third class; confirm against the data.
to_predict.loc[pd.isna(to_predict["Fare"]), "Fare"] = median_fare_third_class


In [None]:
# Drop the same identifier/port columns that split_data removes from the
# training frame (no Survived column is dropped here), then apply the same
# preparation as for the training features.
X_topred = to_predict.drop(['PassengerId', 'Embarked'], axis='columns')
X_topred_enc = prepare_data(X_topred)

# Preview the first ten prepared rows.
X_topred_enc.head(10)

In [None]:
# Start the output table from the passenger ids.
predicted = to_predict.loc[:,['PassengerId']]

# Align the features with the columns the search was fitted on.  One-hot
# columns depend on the values present in each file, so a dummy column that
# is absent here is added as all zeros and the training column order is
# enforced.  When the columns already match this is a no-op.
preds = forcv.predict(X_topred_enc.reindex(columns=X_train.columns, fill_value=0))

In [None]:
# Attach the predicted labels next to the passenger ids.
predicted["Survived"] = preds

In [None]:
# Write the PassengerId/Survived table to CSV without the index column.
predicted.to_csv("predictions.csv", index=False)