In [10]:
%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC

from sklearn.pipeline import Pipeline

import functions as f
import classes as c

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Read the labelled training set and the unlabelled submission set.
train_df = pd.read_csv("datasets/train.csv")
test_df = pd.read_csv("datasets/test.csv")

# Separate the target column from the features; both are independent copies
# so later edits cannot write back into train_df.
y = train_df["Survived"].copy()
X = train_df.drop("Survived", axis=1).copy()

# Hold out 33% of the labelled rows with a fixed seed for repeatability.
# NOTE(review): the split is stratified on the "Sex" feature, not on the
# target y -- confirm this is intended. X_test / y_test are not used by any
# of the visible cells.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=X["Sex"],
)

In [5]:
# Display (rows, columns) for the training and submission frames, in that order.
tuple(frame.shape for frame in (train_df, test_df))

((891, 12), (418, 11))

In [6]:
# Feature columns grouped by measurement scale. The preprocessing pipeline
# uses these groups to decide which transformer each column goes through.
nominal_col = ["Pclass", "Embarked"]
binary_col = ["Sex"]
ordinal_col = ["Ticket"]  # defined for completeness; no visible cell reads it
count_col = ["Age", "SibSp", "Parch"]
interval_col = ["Fare"]

# Columns listed as having missing values; no visible cell reads this list.
col_with_miss_vals = ["Age", "Fare", "Cabin", "Embarked"]

# Category values for the encoder, taken from the full labelled frame.
# NOTE(review): f.unique_values is a project helper -- presumably it returns
# one list of distinct values per column; confirm how it treats NaN
# ("Embarked" is listed above as having missing values).
nominal_col_cats = f.unique_values(train_df[nominal_col])
binary_col_cats = f.unique_values(train_df[binary_col])

# Every column kept by the selector: categorical groups first, then numeric.
# Evaluates to ["Pclass", "Embarked", "Sex", "Age", "SibSp", "Parch", "Fare"].
all_initial_futures = nominal_col + binary_col + count_col + interval_col

In [7]:
# Stage 1 -- column selection followed by imputation.
# The selector runs with inplace=False while the imputers run with
# inplace=True; presumably the selector hands on a copy so the in-place
# imputers never touch the caller's frame -- TODO confirm in classes.py.
missing_vallue_pip = Pipeline(steps=[
    ("selector", c.DFSelector(columns=all_initial_futures, inplace=False)),
    (
        "most_frequent_imputer",
        c.DFImputer(strategy="most_frequent", columns=["Embarked"], inplace=True),
    ),
    (
        "mean_imputer",
        c.DFImputer(strategy="mean", columns=["Age", "Fare"], inplace=True),
    ),
])

# Column groups shared by the remaining stages.
categorical_cols = nominal_col + binary_col
categorical_cats = nominal_col_cats + binary_col_cats
numeric_cols = count_col + interval_col

# Stage 2 -- impute, one-hot encode the categorical columns, then log-scale
# and scale the numeric ones. Each numeric transformer gets its own list
# object, matching the separate concatenations it would otherwise receive.
full_pip = Pipeline(steps=[
    ("missing_vallue", missing_vallue_pip),
    (
        "one_hot_encoder",
        c.DFOneHotEncoder(
            columns=categorical_cols,
            handle_unknown="ignore",
            categories=categorical_cats,
            sparse=False,
        ),
    ),
    ("log_scaler", c.DFLogScaler(columns=list(numeric_cols))),
    ("scaler", c.DFScaler(columns=list(numeric_cols))),
])

In [17]:
# Fit every preprocessing step on the training split and transform that same
# split in a single pass; full_pip stays fitted for later transform() calls.
X_train_tr = full_pip.fit_transform(X_train)

In [13]:
# Search space for the SVM hyper-parameters.
# NOTE(review): the C grid jumps from 0.001 to 0.1 (no 0.01) while the gamma
# grid does include 0.01 -- confirm the gap is intentional.
C_params = [0.0001, 0.001, 0.1, 1, 10]
g_params = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]

# Two sub-grids: the linear kernel only tunes C (5 candidates); the RBF kernel
# tunes C and gamma jointly (5 x 8 = 40 candidates).
param_grid = [
    dict(C=C_params, kernel=['linear']),
    dict(C=C_params, gamma=g_params, kernel=['rbf']),
]

svc = SVC()
clf = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    cv=3,
    verbose=1,
    n_jobs=4,
)

# X_train_tr came from fitting full_pip on all of X_train, so every CV fold is
# scored on rows that already contributed to the imputation/scaling statistics.
# The fitted search object is the cell's last expression and is displayed.
clf.fit(X_train_tr, y_train)

Fitting 3 folds for each of 45 candidates, totalling 135 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 135 out of 135 | elapsed:    0.7s finished


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=4,
       param_grid=[{'C': [0.0001, 0.001, 0.1, 1, 10], 'kernel': ['linear']}, {'C': [0.0001, 0.001, 0.1, 1, 10], 'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000], 'kernel': ['rbf']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [14]:
# Hyper-parameter combination that achieved the best mean cross-validated score.
clf.best_params_

{'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}

In [15]:
# Mean cross-validated score of the best combination. The search was built
# without a `scoring` argument, so the estimator's own default scorer applies.
clf.best_score_

0.8238255033557047

In [18]:
# Apply the already-fitted preprocessing to the submission set. This is
# transform only, so the statistics learned from X_train are reused.
test_df_tr = full_pip.transform(test_df)

In [19]:
# Predict with the search object itself: with the default refit=True it
# delegates to the estimator refitted with the best hyper-parameters.
test_pr = clf.predict(test_df_tr)

In [21]:
# Sanity check: there should be one prediction per row of test_df.
test_pr.shape

(418,)

In [23]:
# Submission frame: passenger ids from the test set paired row-by-row with
# the predicted labels, in the column order PassengerId, Survived.
predict_pr = test_df[["PassengerId"]].assign(Survived=test_pr)

In [24]:
# Write the submission file to the working directory; index=False keeps the
# DataFrame index out of the CSV so only the two data columns are written.
predict_pr.to_csv("prediction.csv", index=False)