In [None]:
import pandas as pd
import numpy as np
import json

from utils.models import find_best_model
from utils.search_model import train_models
from utils.get_parameters import max_score_for_each, get_combinations
from utils.visual import plot_results
import os
import matplotlib.pyplot as plt


## Model Training

The first step of model training is to train every combination of algorithm, imputation technique and balancing approach.\
The algorithms are:

    - KNN
    - Decision Tree
    - Logistic regression
    - SVM
    - Naive Bayes
    - RandomForestClassifier
    - GradientBoostingClassifier
    - BaggingClassifier
    - XGBClassifier

The balancing algorithms are:

    - SMOTE (oversampling)
    - MWMOTE (oversampling)
    - ADASYN (oversampling)
    - AllKNN (undersampling)
    - None (using original data)

The imputation techniques are:

    - Simple Imputer with mode: for categorical features
    - Simple Imputer with mean: for numerical features
    - KNN Imputer with mean: for numerical features


## Project structure

- [GitHub](https://github.com/albermakaryan/Machine_learning)

- **[data/](data/)**
   - **[initial_data/](data/initial_data/)**:  Full data set for modeling
   - **[model_data/](data/model_data/)**    :  Period 3 data set
- **[src/](src/)**
   - **[models/](src/models/)**   :  The best models for each algorithm in .pkl format
   - **[results/](src/results/)** :  Performance results for each algorithm and total
   - **[utils/](src/utils/)**     :  Utility functions package
      - **[__init__.py](src/utils/__init__.py)**:  Python package initialization
      - **[data_preparation.py](src/utils/data_preparation.py)**:  Utility function for data preparation
      - **[functions.py](src/utils/functions.py)**:  Utility function for best results combination
      - **[get_parameters.py](src/utils/get_parameters.py)**:  Utility functions for collecting parameter combinations
      - **[metrics.py](src/utils/metrics.py)**:  Utility function to evaluate model performance
      - **[search_model.py](src/utils/search_model.py)**:  Utility function to train default models with several combinations
      - **[visual.py](src/utils/visual.py)**:  Utility functions to plot models' performance results
      - **[models.py](src/utils/models.py)**:  Utility functions to train 'the best' model for each algorithm
   - **[main.py](src/main.py)**:  The main file in a project
   - **[eda-1.ipynb](src/eda-1.ipynb)**:  Jupyter Notebook for EDA
   - **[end_to_end.ipynb](src/end_to_end.ipynb)**:  Jupyter Notebook for end-to-end workflow presentation
   - **[visualizations.ipynb](src/visualizations.ipynb)**:   Jupyter Notebook for visualizations

- **[requirements.txt](/requirements.txt)**: The inclusion of a requirements.txt file makes it easier to recreate the project's environment and install the necessary dependencies.

## Demo

In [None]:
# Load the full data set from the initial_data folder, print its shape and
# display the first rows as the cell output.
df = pd.read_csv("../data/initial_data/frmgham2_project_data_full.csv")
print(f"Shape: {df.shape}\n")
df.head()

In [None]:
# Run the combination search with 'CVD' as the target variable and keep the
# returned performance table.
# NOTE(review): `path_to_save` is presumably where train_models writes that
# table as CSV -- confirm in utils.search_model.
performances_df = train_models(
    df=df,
    target_var='CVD',
    path_to_save='../src/results/general/full_data_performances_9_models_5_balancers.csv',
)


In [None]:
# Preview the first rows of the performance table returned by train_models.
performances_df.head()

In [None]:
# Top scores on each split, as selected by max_score_for_each.
test_scores = max_score_for_each(performances_df, set_='Test')
train_scores = max_score_for_each(performances_df, set_='Train')

# Put the train and test scores side by side: rename each 'Score' column,
# drop the columns that are not part of the join, then match rows on
# algorithm and main metric.
differences_df = (
    train_scores
    .rename(columns={'Score': 'TrainScore'})
    .drop(columns=['Metric', 'Set', 'Imputer', 'Imbalance'])
    .merge(
        test_scores
        .rename(columns={"Score": "TestScore"})
        .drop(columns=['Metric', 'Set', 'Imputer', 'Imbalance']),
        on=['Algorithm', 'MainMetric'],
    )
)

In [None]:
# Preview the rows max_score_for_each returned for the 'Test' set.
test_scores.head()

In [None]:
# Average number of distinct (imputer, balancer) set-ups evaluated per algorithm.
# The divisor comes from the data instead of a hard-coded 9, so the figure stays
# correct when the results contain a different number of algorithms.
n_algorithms = performances_df['Algorithm'].nunique()
n_setups = performances_df[['Algorithm','Imputer','Imbalance']].drop_duplicates().shape[0]
# An empty results table has no algorithms; report 0 rather than dividing by zero.
counts = n_setups / n_algorithms if n_algorithms else 0.0
print(f"Number of models for each algorithm: {int(counts)}")

In [None]:
# Combinations to use when training the best models. Each entry bundles an
# algorithm with an imputation and a balancing technique; the selection is
# requested per Algorithm/Metric using the AUC metric on the 'Test' set.
combinations = get_combinations(
    df=performances_df,
    by_features=['Algorithm', 'Metric'],
    by_metric='AUC',
    by_set='Test',
)

In [None]:
# Report how many combinations were selected and display the first one as a sample.
print("Number of models: ",len(combinations))
combinations[0]

In [None]:
# df = pd.read_csv("../data/initial_data/frmgham2_project_data_full.csv")

In [None]:
# Train the best model for every selected combination.
for combination in combinations:
    # Each combination unpacks to (algorithm, imputer, balancer).
    algorithm, imputer, balanc = combination
    # "None (using original data)" is one of the balancing options; a balancer
    # without a __name__ (e.g. None) must not crash the progress message, so
    # fall back to its string form.
    balancer_name = getattr(balanc, '__name__', str(balanc))
    print(algorithm.__name__, balancer_name, imputer)
    # NOTE(review): `ovewrite` is spelled exactly as in the original call;
    # presumably it matches the keyword declared by find_best_model -- confirm
    # before renaming it.
    best_model, best_params, output = find_best_model(algorithm=algorithm,
                                                      balancer=balanc,
                                                      imputer=imputer,
                                                      df=df,
                                                      ovewrite=False)


In [None]:
# Load the saved best-results table, then reduce it to one row per distinct
# approach by discarding the Set/Metric/Score columns and de-duplicating.
# NOTE(review): this reads from ../results/general/, whereas the train_models
# call earlier was pointed at ../src/results/general/ -- confirm which
# directory is intended.
total_perfomance = pd.read_csv("../results/general/best_results_all_models.csv")
total_perfomance_approaches = (
    total_perfomance
    .drop(columns=['Set', 'Metric', 'Score'])
    .drop_duplicates()
)

In [None]:
# Show which approach was kept for each algorithm, ordered by algorithm name.
total_perfomance_approaches.sort_values(by="Algorithm")

In [None]:
# Mean test-set score per algorithm, rounded to two decimals and ordered from
# lowest to highest score.
test_performances = total_perfomance.loc[total_perfomance['Set'] == 'Test']
mean_performance = (
    test_performances
    .groupby("Algorithm", as_index=False)['Score']
    .mean()
    .assign(Score=lambda frame: frame['Score'].round(2))
    .sort_values("Score")
)

In [None]:
fig, ax = plt.subplots(1, 1)

# One horizontal bar per algorithm, drawn in the row order of mean_performance.
ax.barh(mean_performance['Algorithm'], mean_performance['Score'], color='orange')

# Label every bar with its score; the text is right-aligned on the bar's end.
for position, score in enumerate(mean_performance['Score']):
    ax.annotate(str(score), xy=(score, position), ha='right', va='center')

ax.set_xlabel("Score")
ax.set_title("Average test performances of algorithms")

plt.show()

In [None]:
# algs = [alg.split("_")[0] for alg in os.listdir("../models/")]

In [None]:
# Name of the algorithm passed to plot_results in the following cell.
# NOTE: this rebinds `algorithm`, which the training loop above used as its
# loop variable.
algorithm = 'XGBClassifier'

In [None]:
# Plot the 'Test' set results for the chosen algorithm and keep whatever
# plot_results returns in `model`.
model = plot_results(
    performances_df,
    algorithm,
    df,
    set_='Test',
)