# Imports

In [1]:
import os
import pickle
import numpy as np
import pandas as pd

In [2]:
# Pipelines
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Transformers
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer

## Models
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

## Parameter Tuning
from sklearn.model_selection import GridSearchCV

## Metrics
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report

In [3]:
from data_gadgets.cleaning import Cleaner

# Reading Data

In [4]:
path = os.path.join('..', '..', 'data', 'raw', 'data_task3.csv')
data = pd.read_csv(path)

In [5]:
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
target = 'species'

In [7]:
X = data.drop(target, axis=1)
y = data[target]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

In [9]:
X_test

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
114,5.8,2.8,5.1,2.4
62,6.0,2.2,4.0,1.0
33,5.5,4.2,1.4,0.2
107,7.3,2.9,6.3,1.8
7,5.0,3.4,1.5,0.2
100,6.3,3.3,6.0,2.5
40,5.0,3.5,1.3,0.3
86,6.7,3.1,4.7,1.5
76,6.8,2.8,4.8,1.4
71,6.1,2.8,4.0,1.3


In [10]:
# X_train.to_csv('../../data/raw/X_train_task3.csv', index=False)
# y_train.to_csv('../../data/raw/y_train_task3.csv', index=False)
# X_test.to_csv('../../data/raw/X_test_task3.csv', index=False)
# y_test.to_csv('../../data/raw/y_test_task3.csv', index=False)

# Cleaning Pipeline

In [11]:
def cleaning_pipeline(df):
    cleaner = Cleaner()
    df = cleaner.headers(df)
    df = cleaner.categories(df)
    df = df.drop(['petal_length', 'petal_width'], axis=1)
    
    return df

In [12]:
X_train = cleaning_pipeline(X_train)
cols = Cleaner().separate_data(X_train, None)
cols

{'target': [None],
 'time': [],
 'category': [],
 'category+': [],
 'continuous': ['sepal_length', 'sepal_width'],
 'continuous+': [],
 'discrete': []}

# Components

In [13]:
# Time variables pipeline
steps = [
]
time_pipe = Pipeline(steps)

# Continuous variables pipeline
steps = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='median')),
]
continuous_pipe = Pipeline(steps)

# Discrete variables pipeline
steps = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent', ))
]
discrete_pipe = Pipeline(steps)

# Category variables pipeline
steps = [
    ("imputer", SimpleImputer(missing_values=np.nan, strategy='most_frequent')),
    ("encoder", OneHotEncoder(drop='first', handle_unknown='ignore'))
]
category_pipe = Pipeline(steps)

In [14]:
transformers = [
    # ("time", time_pipe, cols['time']),
    ("continuous", continuous_pipe, cols['continuous']), 
    # ("discrete", discrete_pipe, cols['discrete']),
    # ("categorical", category_pipe, cols['category']),
]

preprocessor = ColumnTransformer(transformers, remainder='drop')

In [15]:
steps = [
    ('preprocessor', preprocessor), 
    ('model', RandomForestClassifier())
]
pipe = Pipeline(steps)

# Training

In [16]:
pipe.fit(X_train, y_train)

# Finding Best Model

In [17]:
param_grid = {
    'model': [
        SVC(),
        LogisticRegression(),
        KNeighborsClassifier(), 
        RandomForestClassifier(),
    ],
}

In [18]:
cross_validator = GridSearchCV(pipe, param_grid, cv=5, )

In [19]:
cross_validator.fit(X_train, y_train)

In [20]:
cross_validator.best_estimator_

In [21]:
cross_validator.score(X_train, y_train)

0.85

In [22]:
cross_validator.cv_results_

{'mean_fit_time': array([0.00879488, 0.03865328, 0.00540056, 0.1872004 ]),
 'std_fit_time': array([0.00312444, 0.04532799, 0.00048928, 0.00724475]),
 'mean_score_time': array([0.00360465, 0.00339403, 0.00640049, 0.01199923]),
 'std_score_time': array([4.93956461e-04, 4.81897907e-04, 8.02804526e-04, 1.46661344e-06]),
 'param_model': masked_array(data=[SVC(), LogisticRegression(), KNeighborsClassifier(),
                    RandomForestClassifier()],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'model': SVC()},
  {'model': LogisticRegression()},
  {'model': KNeighborsClassifier()},
  {'model': RandomForestClassifier()}],
 'split0_test_score': array([0.875     , 0.875     , 0.83333333, 0.70833333]),
 'split1_test_score': array([0.70833333, 0.75      , 0.75      , 0.70833333]),
 'split2_test_score': array([0.875     , 0.875     , 0.83333333, 0.83333333]),
 'split3_test_score': array([0.79166667, 0.79166667, 0.79166667, 0.

In [23]:
# feature_names = []
# for i in preprocessor.named_transformers_:
#     if i == 'remainder':
#         continue
#     features = preprocessor.named_transformers_[i].get_feature_names_out().tolist()
#     for feature in features:
#         feature_names.append(feature)
# pd.DataFrame(preprocessor.fit_transform(X_train), columns=feature_names)

# Saving Model

In [24]:
# path = os.path.join('..', '..', 'models', 'model_task3.pkl')
# with open(path, 'wb') as file:
#     pickle.dump(cross_validator.best_estimator_, file)