In [60]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [61]:
df_train = pd.read_csv(f"dataset/train.csv", index_col= "PassengerId")
df_test = pd.read_csv(f"dataset/test.csv", index_col= "PassengerId")
gender = pd.read_csv(f"dataset/gender_submission.csv", index_col= "PassengerId")

In [62]:
df_train.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

In [63]:
df_test.isnull().sum()

Pclass        0
Name          0
Sex           0
Age          86
SibSp         0
Parch         0
Ticket        0
Fare          1
Cabin       327
Embarked      0
dtype: int64

In [64]:
df_train

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


To visualize better the columns we will transform the Sex and Embarked columns to numeric. Sex column only has two categories Female and Male, Embarked column has tree labels S, C and Q

In [65]:
# Embarked, fill with most relevant data S
df_train.fillna('S', inplace=True)
df_test.fillna('S', inplace=True)

# Sex
change = {'female':0,'male':1}
df_train.Sex = df_train.Sex.map(change)
df_test.Sex = df_test.Sex.map(change)

# Embarked
change = {'S':0,'C':1,'Q':2}
df_train.Embarked = df_train.Embarked.map(change)
df_test.Embarked = df_test.Embarked.map(change)
df_train

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.2500,,0.0
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,1.0
3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.9250,,0.0
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1000,C123,0.0
5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.0500,,0.0
...,...,...,...,...,...,...,...,...,...,...,...
887,0,2,"Montvila, Rev. Juozas",1,27.0,0,0,211536,13.0000,,0.0
888,1,1,"Graham, Miss. Margaret Edith",0,19.0,0,0,112053,30.0000,B42,0.0
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,,1,2,W./C. 6607,23.4500,,0.0
890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,0,111369,30.0000,C148,1.0


In [66]:
df_train = df_train.reset_index()
df_test = df_test.reset_index()
handle_train = df_train.drop(['Survived', 'Cabin','Name','PassengerId','Ticket'], axis = 1)
handle_test = df_test.drop(['Cabin', 'Name','PassengerId','Ticket'], axis = 1)

In [67]:
# Critical point is that the KNN Imptuer is a distance-based imputation method and it requires to normalize data. 
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
handle_train = pd.DataFrame(scaler.fit_transform(handle_train), columns = handle_train.columns)
handle_test = pd.DataFrame(scaler.fit_transform(handle_test), columns = handle_test.columns)

In [68]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=10)
imputer.fit(handle_train)
df_train[handle_train.columns] = imputer.transform(handle_train)
df_test[handle_test.columns] = imputer.transform(handle_test)

In [69]:
feature = df_train.drop(['Survived','PassengerId','Name','Ticket', 'Cabin','Embarked'], axis = 1)
label = df_train['Survived']
test = df_test.drop([ 'Name','PassengerId','Ticket', 'Cabin','Embarked'], axis = 1)
feature

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare
0,1.0,1.0,0.271174,0.125,0.000000,0.014151
1,0.0,0.0,0.472229,0.125,0.000000,0.139136
2,1.0,0.0,0.321438,0.000,0.000000,0.015469
3,0.0,0.0,0.434531,0.125,0.000000,0.103644
4,1.0,1.0,0.434531,0.000,0.000000,0.015713
...,...,...,...,...,...,...
886,0.5,1.0,0.334004,0.000,0.000000,0.025374
887,0.0,0.0,0.233476,0.000,0.000000,0.058556
888,1.0,0.0,0.286253,0.125,0.333333,0.045771
889,0.0,1.0,0.321438,0.000,0.000000,0.058556


In [70]:
X_train, X_test, y_train, y_test = train_test_split(
      feature,label, test_size=0.10, random_state=42)

In [71]:
import optuna
from sklearn.linear_model import LogisticRegression

def objective(trial):
    logreg_c = trial.suggest_float("logreg_c", 1e-10, 1e1, log=True)
    max_iter = trial.suggest_int("max_iter", 50, 3000)
    solver = trial.suggest_categorical("solver", ["newton-cg", "lbfgs", "liblinear"])
    penalty = trial.suggest_categorical("penalty", ["l2"])

    LRC = LogisticRegression(C=logreg_c,max_iter=max_iter,solver=solver,penalty=penalty)
    LRC.fit(X_train, y_train)
    return 1.0 - accuracy_score(y_test, LRC.predict(X_test))
study = optuna.create_study()
study.optimize(objective, n_trials = 500)
print(study.best_params)
print(1.0 - study.best_value)

hyperparameters = {
'C':np.logspace(0, 10, 50), 
'penalty':['l2'],
'random_state':[42],
'max_iter':[200,500,1000],
'solver':['newton-cg', 'lbfgs', 'liblinear']
}
modellrOpt = LogisticRegression(C=study.best_params.get('logreg_c'),max_iter=study.best_params.get('max_iter'),solver=study.best_params.get('solver'),penalty=study.best_params.get('penalty'))

#rfOpt.fit(X_train, y_train)

#pred=rfOpt.predict(X_test)
#print("Accuracy for Random Forest on CV data: ",accuracy_score(y_test,pred))
modellrOpt.fit(feature, label)
predictions = modellrOpt.predict(test)

output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

[32m[I 2023-02-18 22:58:48,330][0m A new study created in memory with name: no-name-967a27d3-3e04-4237-9224-2732bbe300c8[0m
[32m[I 2023-02-18 22:58:48,342][0m Trial 0 finished with value: 0.4 and parameters: {'logreg_c': 1.2638206910467543e-05, 'max_iter': 847, 'solver': 'liblinear', 'penalty': 'l2'}. Best is trial 0 with value: 0.4.[0m
[32m[I 2023-02-18 22:58:48,356][0m Trial 1 finished with value: 0.4 and parameters: {'logreg_c': 4.175362495547279e-06, 'max_iter': 1452, 'solver': 'newton-cg', 'penalty': 'l2'}. Best is trial 0 with value: 0.4.[0m
[32m[I 2023-02-18 22:58:48,362][0m Trial 2 finished with value: 0.4 and parameters: {'logreg_c': 0.0013182769036805251, 'max_iter': 979, 'solver': 'liblinear', 'penalty': 'l2'}. Best is trial 0 with value: 0.4.[0m
[32m[I 2023-02-18 22:58:48,372][0m Trial 3 finished with value: 0.4 and parameters: {'logreg_c': 1.8158917407519873e-06, 'max_iter': 2247, 'solver': 'lbfgs', 'penalty': 'l2'}. Best is trial 0 with value: 0.4.[0m
[32m

{'logreg_c': 3.5440000863324483, 'max_iter': 2715, 'solver': 'lbfgs', 'penalty': 'l2'}
0.8666666666666667
Your submission was successfully saved!


In [72]:
from sklearn.tree import DecisionTreeClassifier 

def objective(trial):
    max_depth = trial.suggest_int("max_depth", 2, 300)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 300)
    max_leaf_nodes = int(trial.suggest_int("max_leaf_nodes", 2, 300))
    splitter = trial.suggest_categorical("splitter",["best","random"])
    criterion = trial.suggest_categorical("criterion", ["gini", "entropy"])
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 2, 300)
    DTC = DecisionTreeClassifier(min_samples_split = min_samples_split, 
                            max_leaf_nodes = max_leaf_nodes, max_depth=max_depth, random_state=42, splitter=splitter,min_samples_leaf=min_samples_leaf,
                            criterion = criterion)
    DTC.fit(X_train, y_train)
    return 1.0 - accuracy_score(y_test, DTC.predict(X_test))
study = optuna.create_study()
study.optimize(objective, n_trials = 200)
print(study.best_params)
print(1.0 - study.best_value)

dtOpt = DecisionTreeClassifier(min_samples_split = study.best_params.get('min_samples_split'), 
                                max_leaf_nodes = study.best_params.get('max_leaf_nodes'), max_depth=study.best_params.get('max_depth'), random_state=42, splitter=study.best_params.get('splitter'),min_samples_leaf=study.best_params.get('min_samples_leaf'),
                                criterion = study.best_params.get('criterion'))

dtOpt.fit(feature, label)
predictions = dtOpt.predict(test)

output = pd.DataFrame({'PassengerId': df_test.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

[32m[I 2023-02-18 22:59:38,288][0m A new study created in memory with name: no-name-41c0f2d9-3ef5-43c1-b472-eeced8fb648d[0m
[32m[I 2023-02-18 22:59:38,298][0m Trial 0 finished with value: 0.3222222222222222 and parameters: {'max_depth': 31, 'min_samples_split': 193, 'max_leaf_nodes': 292, 'splitter': 'random', 'criterion': 'gini', 'min_samples_leaf': 293}. Best is trial 0 with value: 0.3222222222222222.[0m
[32m[I 2023-02-18 22:59:38,306][0m Trial 1 finished with value: 0.18888888888888888 and parameters: {'max_depth': 68, 'min_samples_split': 66, 'max_leaf_nodes': 201, 'splitter': 'random', 'criterion': 'gini', 'min_samples_leaf': 238}. Best is trial 1 with value: 0.18888888888888888.[0m
[32m[I 2023-02-18 22:59:38,313][0m Trial 2 finished with value: 0.18888888888888888 and parameters: {'max_depth': 225, 'min_samples_split': 257, 'max_leaf_nodes': 225, 'splitter': 'random', 'criterion': 'gini', 'min_samples_leaf': 233}. Best is trial 1 with value: 0.18888888888888888.[0m
[

{'max_depth': 190, 'min_samples_split': 16, 'max_leaf_nodes': 131, 'splitter': 'best', 'criterion': 'gini', 'min_samples_leaf': 40}
0.8777777777777778
Your submission was successfully saved!
