In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
 

# Load the training CSV; the later cells build their frames from `df`.
df = pd.read_csv(filepath_or_buffer="data/train.csv")

In [None]:
# Exploration copy of the raw frame without the PassengerId column.
dftest = df.drop(columns=["PassengerId"])

# Disabled exploratory check of the most frequent ticket values:
# dftest["Ticket"].value_counts().head(20)

dftest.head()



Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
85,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gu...",female,33.0,3,0,3101278,15.85,,S
206,0,3,"Backstrom, Mr. Karl Alfred",male,32.0,1,0,3101278,15.85,,S


In [4]:
# Constant-fill imputer: missing entries would be replaced by 10.
imp = SimpleImputer(strategy="constant", fill_value=10, missing_values=pd.NA)
# Disabled: applying `imp` to the selection below.

# Rows where SibSp is above 2 and Age is missing.
prob = dftest.loc[lambda frame: (frame["SibSp"] > 2) & frame["Age"].isna()]
prob.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
159,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S
176,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S
180,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S
201,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S
229,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S


In [None]:
def show_accuracy_confusion(model, X_test, y_test):
    """Print a model's accuracy on a test set and plot its confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features handed to ``model.predict``.
        y_test: True labels the predictions are compared against.
    """
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy!s}")

    matrix = confusion_matrix(y_test, y_pred)
    matrix_display = ConfusionMatrixDisplay(matrix)
    matrix_display.plot()

In [None]:
def fill_values(df):
    """Return a copy of ``df`` whose missing ``Age`` entries hold the median age.

    The median is taken over the non-missing ``Age`` values of ``df`` itself.
    The input frame is left unmodified.

    Args:
        df: DataFrame with a numeric ``Age`` column.

    Returns:
        A new DataFrame with the same rows and columns as ``df``; only the
        missing ``Age`` values differ. If every ``Age`` is missing, nothing
        is filled.

    Raises:
        KeyError: If ``df`` has no ``Age`` column.
    """
    # NOTE(review): an earlier draft also selected rows with SibSp > 2 and a
    # missing Age without using them -- presumably a separate fill for those
    # passengers was planned; confirm before adding one.
    filled = df.copy()
    median_age = filled["Age"].median()  # missing entries are skipped
    filled["Age"] = filled["Age"].fillna(median_age)
    return filled

In [None]:
# Target: the Survived column.
y = df["Survived"]

# Features: every column except the identifier, the target and the name.
X = df.drop(columns=["PassengerId", "Survived", "Name"])

In [None]:
# One-hot encode the categorical columns.
X_enc = pd.get_dummies(X, columns=["Pclass", "Sex", "Embarked"])

# Indicator that matches its name: 1 when a Cabin value is recorded,
# 0 when it is missing (the flag must not be 1 for missing cabins).
has_cabin = X_enc["Cabin"].notna().astype(int)
X_enc["has cabin"] = has_cabin

# Disabled experiment: deck letter (first character of Cabin) as a feature.
# X_enc["deck"] = X_enc["Cabin"].str.get(0)
# X_enc = pd.get_dummies(X_enc, columns=["deck"])
# on_c_deck = [int(deck == 'F') for deck in X_enc["deck"]]
# X_enc["on F"] = on_c_deck

# The raw Cabin and Ticket columns are not passed on to the models.
X_enc = X_enc.drop(columns=["Cabin", "Ticket"])


In [None]:
# Fill missing ages with the median age. The imputer is created in this cell
# so the name exists at notebook scope after Restart & Run All.
# NOTE(review): the median is computed over all rows, before the train/test
# split below -- move the imputer into the pipelines if that leakage matters.
imputer = SimpleImputer(strategy="median")

# fit_transform needs a 2-D input and returns an (n_rows, 1) array;
# flatten it before writing it back into the single Age column.
age_2d = X_enc["Age"].to_numpy().reshape(-1, 1)
X_enc["Age"] = imputer.fit_transform(age_2d).ravel()

In [None]:
# Baseline model: standardise the features, then fit an SVC with default settings.
pipe = Pipeline(steps=[("scaler", StandardScaler()), ("svc", SVC())])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_enc,
    y,
    test_size=0.3,
    random_state=22,
)

In [None]:
# Train the baseline pipeline and keep its predictions for the held-out rows.
predictions = pipe.fit(X_train, y_train).predict(X_test)

show_accuracy_confusion(model=pipe, X_test=X_test, y_test=y_test)

In [None]:
# Three candidate classifiers with hand-picked hyperparameters.
# random_state is pinned (same seed as the train/test split) so the
# stochastic learners give the same scores on every Restart & Run All.
forest = RandomForestClassifier(
    criterion="entropy", max_depth=5, n_estimators=300, random_state=22
)
sgd = SGDClassifier(
    eta0=1,
    learning_rate="adaptive",
    n_jobs=-1,
    penalty="l2",
    loss="log_loss",
    random_state=22,
)
svc = SVC(kernel="rbf", gamma=0.01, C=3)

# NOTE(review): these models are fitted on the unscaled features, unlike
# `pipe`, which standardises first -- confirm that is intended.
for model in [forest, sgd, svc]:
    model.fit(X_train, y_train)

In [None]:
# Held-out performance of the hand-tuned random forest.
show_accuracy_confusion(model=forest, X_test=X_test, y_test=y_test)

In [None]:
# Held-out performance of the hand-tuned SGD classifier.
show_accuracy_confusion(model=sgd, X_test=X_test, y_test=y_test)

In [None]:
# Held-out performance of the hand-tuned RBF SVC.
show_accuracy_confusion(model=svc, X_test=X_test, y_test=y_test)

In [None]:
# Grid search over random-forest hyperparameters with 10-fold cross-validation.
# The forest is seeded (same seed as the train/test split) so the selected
# parameters and scores are the same on every run.
forest_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("forest", RandomForestClassifier(random_state=22)),
])

forest_grid = {
    "forest__n_estimators": [300, 350, 400],
    "forest__max_depth": [3, 5, 6, None],
    "forest__criterion": ["gini", "entropy"],
}

# 24 parameter combinations x 10 folds: run the fits in parallel.
forcv = GridSearchCV(
    estimator=forest_pipe,
    param_grid=forest_grid,
    cv=10,
    scoring="accuracy",
    n_jobs=-1,
)

forcv.fit(X_train, y_train)
forcv.best_params_

In [None]:
# Held-out performance of the grid-searched random-forest pipeline.
show_accuracy_confusion(model=forcv, X_test=X_test, y_test=y_test)

In [None]:
# Grid search over SGD classifier settings with 10-fold cross-validation.
# The classifier is seeded (same seed as the train/test split) so the
# selected parameters and scores are the same on every run.
sgd_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("sgd", SGDClassifier(random_state=22)),
])

sgd_params = {
    "sgd__loss": ["hinge", "log_loss", "perceptron", "modified_huber"],
    "sgd__penalty": ["l2", "elasticnet"],
    "sgd__learning_rate": ["optimal", "adaptive"],
    # Single-valued entries are fixed settings rather than searched values.
    "sgd__n_jobs": [-1],
    "sgd__eta0": [1],
}

sgd_opt = GridSearchCV(
    estimator=sgd_pipe,
    param_grid=sgd_params,
    cv=10,
    scoring="accuracy",
)
sgd_opt.fit(X_train, y_train)
sgd_opt.best_params_

In [None]:
# Held-out performance of the grid-searched SGD pipeline.
show_accuracy_confusion(model=sgd_opt, X_test=X_test, y_test=y_test)