# Imports

In [1]:
import os
import pickle
import numpy as np
import pandas as pd

In [2]:
# Pipelines
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Transformers
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

## Models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## Parameter Tuning
from sklearn.model_selection import GridSearchCV

## Metrics
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

In [3]:
from data_gadgets.cleaning import Cleaner

# Reading Data

In [4]:
path = os.path.join('..', '..', 'data', 'raw', 'data_task1.csv')
data = pd.read_csv(path)

In [5]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [6]:
target = 'Survived'

In [7]:
X = data.drop(target, axis=1)
y = data[target]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [9]:
X_test

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
360,1252,3,"Sage, Master. William Henry",male,14.5,8,2,CA. 2343,69.5500,,S
170,1062,3,"Lithman, Mr. Simon",male,,0,0,S.O./P.P. 251,7.5500,,S
224,1116,1,"Candee, Mrs. Edward (Helen Churchill Hungerford)",female,53.0,0,0,PC 17606,27.4458,,C
358,1250,3,"O'Keefe, Mr. Patrick",male,,0,0,368402,7.7500,,Q
309,1201,3,"Hansen, Mrs. Claus Peter (Jennie L Howard)",female,45.0,1,0,350026,14.1083,,S
...,...,...,...,...,...,...,...,...,...,...,...
100,992,1,"Stengel, Mrs. Charles Emil Henry (Annie May Mo...",female,43.0,1,0,11778,55.4417,C116,C
7,899,2,"Caldwell, Mr. Albert Francis",male,26.0,1,1,248738,29.0000,,S
22,914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S
68,960,1,"Tucker, Mr. Gilbert Milligan Jr",male,31.0,0,0,2543,28.5375,C53,C


In [10]:
# X_train.to_csv('../../data/raw/X_train_task1.csv', index=False)
# y_train.to_csv('../../data/raw/y_train_task1.csv', index=False)
# X_test.to_csv('../../data/raw/X_test_task1.csv', index=False)
# y_test.to_csv('../../data/raw/y_test_task1.csv', index=False)

# Cleaning Pipeline

In [11]:
def cleaning_pipeline(df):
    cleaner = Cleaner()
    df = cleaner.headers(df)
    df = cleaner.categories(df)
    df['cabin'] = df['cabin'].map(lambda x: x[0] if type(x) is not float else 'N')

    df['sibsp'] = df['sibsp'].map(lambda x: 1 if x >=1 else 0)
    df['parch'] = df['parch'].map(lambda x: 1 if x >=1 else 0)
    family = []
    for value1, value2 in zip(df.sibsp, df.parch):
        if value1 == 1 or value2 == 1:
            family.append(1)
        else:
            family.append(0)
    df['family'] = family
    df = df.drop(['sibsp', 'parch'], axis=1)
    
    return df

In [12]:
X_train = cleaning_pipeline(X_train)
cols = Cleaner().separate_data(X_train, None)
cols

{'target': [None],
 'time': [],
 'category': ['sex', 'cabin', 'embarked'],
 'category+': ['name', 'ticket'],
 'continuous': ['age', 'fare'],
 'continuous+': ['passengerid'],
 'discrete': ['pclass', 'family']}

# Components

In [13]:
# steps = [
#     ("imputer", SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='N')),
#     ("scaler", FunctionTransformer(impute_first_letter, feature_names_out='one-to-one')),
#     ("encoder", OneHotEncoder(drop='first', handle_unknown='ignore'))
# ]

# impute_cabin = Pipeline(steps)

In [14]:
# Time variables pipeline
steps = [
]
time_pipe = Pipeline(steps)

# Continuous variables pipeline
steps = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='median')),
]
continuous_pipe = Pipeline(steps)

# Discrete variables pipeline
steps = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent', ))
]
discrete_pipe = Pipeline(steps)

# Category variables pipeline
steps = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ("encoder", OneHotEncoder(drop='first', handle_unknown='ignore'))
]
category_pipe = Pipeline(steps)

In [15]:
transformers = [
    # ("time", time_pipe, cols['time']),
    ("continuous", continuous_pipe, cols['continuous']), 
    ("discrete", discrete_pipe, cols['discrete']),
    ("categorical", category_pipe, cols['category']),
]

preprocessor = ColumnTransformer(transformers, remainder='drop')

In [16]:
steps = [
    ('preprocessor', preprocessor), 
    ('model', RandomForestClassifier())
]
pipe = Pipeline(steps)

# Training

In [17]:
pipe.fit(X_train, y_train)

# Finding Best Model

In [18]:
param_grid = {
    'model': [
        SVC(),
        KNeighborsClassifier(),  
        RandomForestClassifier(),
    ],
}

In [19]:
cross_validator = GridSearchCV(pipe, param_grid, cv=5, )

In [20]:
cross_validator.fit(X_train, y_train)



In [21]:
cross_validator.best_estimator_

In [22]:
cross_validator.score(X_train, y_train)

1.0

In [23]:
cross_validator.cv_results_

{'mean_fit_time': array([0.04099803, 0.0229116 , 0.2219996 ]),
 'std_fit_time': array([0.03462168, 0.00487263, 0.00797488]),
 'mean_score_time': array([0.01825609, 0.01520967, 0.01920528]),
 'std_score_time': array([0.01654885, 0.00117229, 0.00146583]),
 'param_model': masked_array(data=[SVC(), KNeighborsClassifier(),
                    RandomForestClassifier()],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'model': SVC()},
  {'model': KNeighborsClassifier()},
  {'model': RandomForestClassifier()}],
 'split0_test_score': array([0.59701493, 0.62686567, 1.        ]),
 'split1_test_score': array([0.65671642, 0.67164179, 1.        ]),
 'split2_test_score': array([0.65671642, 0.70149254, 1.        ]),
 'split3_test_score': array([0.65671642, 0.68656716, 1.        ]),
 'split4_test_score': array([0.66666667, 0.68181818, 1.        ]),
 'mean_test_score': array([0.64676617, 0.67367707, 1.        ]),
 'std_test_score': array([0.0251

In [24]:
# feature_names = []
# for i in preprocessor.named_transformers_:
#     if i == 'remainder':
#         continue
#     features = preprocessor.named_transformers_[i].get_feature_names_out().tolist()
#     for feature in features:
#         feature_names.append(feature)
# pd.DataFrame(preprocessor.fit_transform(X_train), columns=feature_names)

# Saving Model

In [25]:
# path = os.path.join('..', '..', 'models', 'model_task1.pkl')
# with open(path, 'wb') as file:
#     pickle.dump(cross_validator.best_estimator_, file)