# Imports

In [1]:
import os
import pickle
import numpy as np
import pandas as pd

In [2]:
# Pipelines
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Transformers
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

## Models
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

## Parameter Tuning
from sklearn.model_selection import GridSearchCV

## Metrics
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

In [3]:
from data_gadgets.cleaning import Cleaner

# Reading Data

In [4]:
path = os.path.join('..', '..', 'data', 'raw', 'data_task5.csv')
data = pd.read_csv(path)

In [5]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [6]:
target = 'Class'

In [7]:
X = data.drop(target, axis=1)
y = data[target]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [9]:
X_test

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
183484,125821.0,-0.323334,1.057455,-0.048341,-0.607204,1.259821,-0.091761,1.159101,-0.124335,-0.174640,...,0.186409,-0.207098,-0.433890,-0.261613,-0.046651,0.211512,0.008297,0.108494,0.161139,40.00
255448,157235.0,-0.349718,0.932619,0.142992,-0.657071,1.169784,-0.733369,1.009985,-0.071069,-0.302083,...,-0.096502,-0.271537,-0.833209,-0.030360,0.490035,-0.404816,0.134350,0.076830,0.175562,1.98
244749,152471.0,-1.614711,-2.406570,0.326194,0.665520,2.369268,-1.775367,-1.139049,0.329904,0.903813,...,0.419835,0.701399,1.134489,0.965054,0.640981,-1.801998,-1.041114,0.286285,0.437322,96.00
63919,50927.0,-2.477184,0.860613,1.441850,1.051019,-1.856621,2.078384,0.510828,-0.243399,-0.260691,...,-0.987790,0.810408,0.692245,0.150121,-0.260777,0.005183,-0.177847,-0.510060,-0.660533,308.00
11475,19899.0,1.338831,-0.547264,0.737389,-0.212383,-1.110039,-0.525744,-0.801403,-0.063672,0.997276,...,-0.126871,-0.139436,-0.074719,0.067055,0.333122,0.379087,-0.268706,-0.002769,0.003272,5.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236778,148949.0,-1.227033,0.987207,0.654979,-2.559724,0.346834,-0.634095,0.738285,0.405477,0.070475,...,-0.097164,-0.070342,-0.293542,-0.371721,0.518298,0.858450,0.749546,0.011475,0.027130,20.30
127073,78200.0,1.250596,0.159552,0.147621,0.472220,-0.023937,-0.287444,0.021041,-0.041342,0.102588,...,-0.130836,-0.210083,-0.580817,0.049119,-0.429060,0.299309,0.234803,-0.022217,0.006667,6.15
208502,137149.0,1.125402,-2.288998,-3.123785,-0.103566,-0.311680,-1.151728,1.184200,-0.635862,-1.314001,...,0.484541,0.263259,0.020985,-0.636845,-0.259110,0.272149,0.961966,-0.240867,-0.007494,583.21
263323,160893.0,2.064857,0.285198,-2.487311,0.357674,0.965436,-0.971181,0.622246,-0.313264,-0.190080,...,-0.161876,0.090425,0.422986,-0.019409,0.691878,0.354898,0.662896,-0.103162,-0.061743,2.95


In [10]:
# X_train.to_csv('../../data/raw/X_train_task5.csv', index=False)
# y_train.to_csv('../../data/raw/y_train_task5.csv', index=False)
# X_test.to_csv('../../data/raw/X_test_task5.csv', index=False)
# y_test.to_csv('../../data/raw/y_test_task5.csv', index=False)

# Cleaning Pipeline

In [11]:
def cleaning_pipeline(df):
    cleaner = Cleaner()
    df = cleaner.headers(df)
    df = cleaner.categories(df)
    
    return df

In [12]:
# X_train = cleaning_pipeline(X_train)
cols = Cleaner().separate_data(X_train, None)
cols

{'target': [None],
 'time': [],
 'category': [],
 'category+': [],
 'continuous': ['Time',
  'V1',
  'V2',
  'V3',
  'V4',
  'V5',
  'V6',
  'V7',
  'V8',
  'V9',
  'V10',
  'V11',
  'V12',
  'V13',
  'V14',
  'V15',
  'V16',
  'V17',
  'V18',
  'V19',
  'V20',
  'V21',
  'V22',
  'V23',
  'V24',
  'V25',
  'V26',
  'V27',
  'V28',
  'Amount'],
 'continuous+': [],
 'discrete': []}

# Components

In [13]:
# Time variables pipeline
steps = [
]
time_pipe = Pipeline(steps)

# Continuous variables pipeline
steps = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='median')),
]
continuous_pipe = Pipeline(steps)

# Discrete variables pipeline
steps = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent', ))
]
discrete_pipe = Pipeline(steps)

# Category variables pipeline
steps = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ("encoder", OneHotEncoder(drop='first', handle_unknown='ignore'))
]
category_pipe = Pipeline(steps)

In [14]:
transformers = [
    # ("time", time_pipe, cols['time']),
    ("continuous", continuous_pipe, cols['continuous']), 
    ("discrete", discrete_pipe, cols['discrete']),
    # ("categorical", category_pipe, cols['category']),
]

preprocessor = ColumnTransformer(transformers, remainder='drop')

In [15]:
steps = [
    ('preprocessor', preprocessor), 
    ('model', RandomForestClassifier())
]
pipe = Pipeline(steps)

# Training

In [16]:
pipe.fit(X_train, y_train)

In [17]:
pred = pipe.predict(X_test)

In [18]:
accuracy_score(y_test, pred)

0.9995084442259752

# Finding Best Model

In [19]:
param_grid = {
    'model': [
        SVC(),
        KNeighborsClassifier(),  
        RandomForestClassifier(),
    ],
}

In [20]:
cross_validator = GridSearchCV(pipe, param_grid, cv=5, )

In [21]:
cross_validator.fit(X_train, y_train)

In [22]:
cross_validator.best_estimator_

In [23]:
cross_validator.score(X_train, y_train)

0.9999956110513727

In [24]:
cross_validator.cv_results_

{'mean_fit_time': array([ 13.79499393,   1.65440311, 562.46870384]),
 'std_fit_time': array([0.71202051, 0.2886757 , 8.94113662]),
 'mean_score_time': array([ 6.17860737, 51.98862681,  0.67779846]),
 'std_score_time': array([0.81639431, 2.92155968, 0.06535718]),
 'param_model': masked_array(data=[SVC(), KNeighborsClassifier(),
                    RandomForestClassifier()],
              mask=[False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'model': SVC()},
  {'model': KNeighborsClassifier()},
  {'model': RandomForestClassifier()}],
 'split0_test_score': array([0.99828831, 0.99839803, 0.99949527]),
 'split1_test_score': array([0.99828831, 0.99835414, 0.99947333]),
 'split2_test_score': array([0.99828831, 0.9983322 , 0.99953916]),
 'split3_test_score': array([0.99828831, 0.99844192, 0.99956111]),
 'split4_test_score': array([0.99826637, 0.9983322 , 0.99951722]),
 'mean_test_score': array([0.99828392, 0.9983717 , 0.99951722]),
 'std_test_score': arra

In [25]:
# feature_names = []
# for i in preprocessor.named_transformers_:
#     if i == 'remainder':
#         continue
#     features = preprocessor.named_transformers_[i].get_feature_names_out().tolist()
#     for feature in features:
#         feature_names.append(feature)
# pd.DataFrame(preprocessor.fit_transform(X_train), columns=feature_names)

# Saving Model

In [26]:
# path = os.path.join('..', '..', 'models', 'model_task5.pkl')
# with open(path, 'wb') as file:
#     pickle.dump(cross_validator.best_estimator_, file)