In [1]:
import sys
main_path = "../.."
sys.path.append(main_path)

# importing modules
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from src.scripts.model_and_evaluation import ModelAndEvaluation
import pickle

import warnings
warnings.simplefilter('ignore')

# ColonFlag model
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Project helper imported from src.scripts.model_and_evaluation; its cross_val
# method is what every dataset cell below calls to evaluate the pipeline.
model_and_evaluation = ModelAndEvaluation()

In [3]:
# ColonFlag pipeline: standardise the features, then fit AdaBoost over decision trees.
# The base tree is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in 1.4);
# the positional form works on either side of that change.
# NOTE(review): DecisionTreeClassifier() has no depth limit here — confirm that fully
# grown trees (rather than shallow weak learners) are the intended boosted estimator.
pipe_colonflag = Pipeline([
    ('sc', StandardScaler()),
    ('clf', AdaBoostClassifier(
        DecisionTreeClassifier(),
        n_estimators=1000,
        random_state=0,
    )),
])

In [4]:
# Folder holding the processed feature matrices; every dataset below is read from here.
df_path = f"{main_path}/data/processed/dataframes"

# Read the labels and flatten them into a 1-D array.
# NOTE(review): assumes targets.csv contains a single column — confirm; any extra
# column (e.g. a saved index) would be interleaved into the flattened array.
targets = np.ravel(pd.read_csv(f"{df_path}/targets.csv").values)

### OHE -> DI

In [13]:
# Load the ohe_di feature matrix.
ohe_di = pd.read_csv(f"{df_path}/ohe_di.csv")

# Evaluate the ColonFlag pipeline on it via the helper's cross_val, grid search disabled.
cf_metrics_score_ohe_di = model_and_evaluation.cross_val(
    pipe_colonflag,
    ohe_di.values,
    targets,
    grid_search=False,
)

### OHE -> DI -> FI

In [5]:
# Load the ohe_di_fi feature matrix.
ohe_di_fi = pd.read_csv(f"{df_path}/ohe_di_fi.csv")

# Evaluate the ColonFlag pipeline on it via the helper's cross_val, grid search disabled.
cf_metrics_score_ohe_di_fi = model_and_evaluation.cross_val(
    pipe_colonflag,
    ohe_di_fi.values,
    targets,
    grid_search=False,
)

In [14]:
# NOTE(review): this import sits mid-notebook; consider moving it to the top import cell.
from sklearn.model_selection import cross_val_score

# Extra check on the ohe_di_fi data: micro-averaged F1 of the same pipeline over 10 CV folds,
# computed with scikit-learn directly rather than through the project helper.
result = cross_val_score(
    estimator=pipe_colonflag,
    X=ohe_di_fi.values,
    y=targets,
    cv=10,
    scoring='f1_micro',
)

In [15]:
# Average of the per-fold scores held in `result`.
np.mean(result)

0.85

### CDT -> DI

In [7]:
# Load the cdt_di feature matrix.
cdt_di = pd.read_csv(f"{df_path}/cdt_di.csv")

# Evaluate the ColonFlag pipeline on it via the helper's cross_val, grid search disabled.
cf_metrics_score_cdt_di = model_and_evaluation.cross_val(
    pipe_colonflag,
    cdt_di.values,
    targets,
    grid_search=False,
)

### CDT -> DI -> FI

In [8]:
# Load the cdt_di_fi feature matrix.
cdt_di_fi = pd.read_csv(f"{df_path}/cdt_di_fi.csv")

# Evaluate the ColonFlag pipeline on it via the helper's cross_val, grid search disabled.
cf_metrics_score_cdt_di_fi = model_and_evaluation.cross_val(
    pipe_colonflag,
    cdt_di_fi.values,
    targets,
    grid_search=False,
)

### CDT -> DI -> OHE

In [9]:
# Load the cdt_di_ohe feature matrix.
cdt_di_ohe = pd.read_csv(f"{df_path}/cdt_di_ohe.csv")

# Evaluate the ColonFlag pipeline on it via the helper's cross_val, grid search disabled.
cf_metrics_score_cdt_di_ohe = model_and_evaluation.cross_val(
    pipe_colonflag,
    cdt_di_ohe.values,
    targets,
    grid_search=False,
)

### CDT -> DI -> FI -> OHE

In [10]:
# Load the cdt_di_fi_ohe feature matrix.
cdt_di_fi_ohe = pd.read_csv(f"{df_path}/cdt_di_fi_ohe.csv")

# Evaluate the ColonFlag pipeline on it via the helper's cross_val, grid search disabled.
cf_metrics_score_cdt_di_fi_ohe = model_and_evaluation.cross_val(
    pipe_colonflag,
    cdt_di_fi_ohe.values,
    targets,
    grid_search=False,
)

### Saving Results

In [11]:
# Gather the cross_val result of every feature set under its dataset name.
cf_scores = dict(
    ohe_di=cf_metrics_score_ohe_di,
    ohe_di_fi=cf_metrics_score_ohe_di_fi,
    cdt_di=cf_metrics_score_cdt_di,
    cdt_di_fi=cf_metrics_score_cdt_di_fi,
    cdt_di_ohe=cf_metrics_score_cdt_di_ohe,
    cdt_di_fi_ohe=cf_metrics_score_cdt_di_fi_ohe,
)

# Create the output folder if it is missing, then pickle the dictionary to disk.
os.makedirs(f"{main_path}/data/processed/model_scores", exist_ok=True)
with open(f"{main_path}/data/processed/model_scores/colonflag_scores.pkl", 'wb') as f:
    pickle.dump(cf_scores, f)