# Task 3: Model optimization


Gruppe Nummer: 1
- Samuel Hempelt
- Andreas Luakat
- John Torres

In this step we will use the data create after applying the preprocessed steps on Step 3, since this generated the best model from all other preprocessed steps: 

- Step 3: Outlier Cleaning
    - Missing Values: Average for numeric, Mode for Category
    - Deletion of entries with missing values in the target colunm
    - Listwise deletion (all rows with multiple missing values, more than 2)
    - Identify outlier with IQR
    - Impute outliers with regression imputation
    - Impute categorical values with random imputation

In [22]:
import pandas as pd
import os
from pathlib import Path
import seaborn as sns  
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
import optuna


## Functions

In [45]:
def train_random_forest(train_x: pd.DataFrame, 
                        val_x: pd.DataFrame, 
                        train_y: pd.DataFrame, 
                        val_y: pd.DataFrame,
                        random_seed: int = 123,
                        n_opt_trials = 20,
                        cv: int = 5) -> tuple:

    # Train model without optimization
    model_no_opt = RandomForestClassifier(random_state=random_seed)
    model_no_opt.fit(train_x, train_y)
    y_pred = model_no_opt.predict(val_x)
    f1_no_opt = f1_score(val_y, y_pred)

    print(f"F1 Score No Optimization: {f1_no_opt}")
    
    def objective(trial):

        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        n_estimators = trial.suggest_int("n_estimators", 10, 100)
        classifier_obj = RandomForestClassifier(
            max_depth=rf_max_depth, 
            n_estimators=n_estimators,
            random_state=random_seed
        )

        score = cross_val_score(classifier_obj, train_x, train_y, n_jobs=-1, cv=cv, scoring="f1_weighted")
        return score.mean()
    
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_opt_trials)

    # train model with optimized hyperparameters
    model = RandomForestClassifier(max_depth=study.best_params["rf_max_depth"], 
                                    n_estimators=study.best_params["n_estimators"], 
                                    random_state=random_seed)

    model.fit(train_x, train_y)

    # prediction
    y_pred = model.predict(val_x)
    val_f1_score = round(f1_score(val_y, y_pred),4)
    val_accuracy = round(accuracy_score(val_y, y_pred),4)
    
    print("Optimized Model F1 Score: ", val_f1_score)
    
    # check if no optimization has better f1 score
    if f1_no_opt > val_f1_score:
        print("No Optimization has better f1 score than optimized model")
        model = model_no_opt
        val_f1_score = f1_no_opt
    
    return model, study, val_f1_score, val_accuracy

def train_gradient_boosting(train_x: pd.DataFrame, 
                        val_x: pd.DataFrame, 
                        train_y: pd.DataFrame, 
                        val_y: pd.DataFrame,
                        random_seed: int = 123,
                        n_opt_trials = 20,
                        cv: int = 5) -> tuple:

    # Train model without optimization
    model_no_opt = GradientBoostingClassifier(random_state=random_seed)
    model_no_opt.fit(train_x, train_y)
    y_pred = model_no_opt.predict(val_x)
    f1_no_opt = f1_score(val_y, y_pred)

    print(f"F1 Score No Optimization: {f1_no_opt}")
    
    def objective(trial):
        rf_max_depth = trial.suggest_int("max_depth", 2, 32, log=True)
        n_estimators = trial.suggest_int("n_estimators", 10, 100)
        learning_rate = trial.suggest_float("learning_rate", 0.01, 1.0, log=True)
        classifier_obj = GradientBoostingClassifier(
            max_depth=rf_max_depth, 
            n_estimators=n_estimators, 
            learning_rate=learning_rate,
            random_state=random_seed
        )

        score = cross_val_score(classifier_obj, train_x, train_y, n_jobs=-1, cv=cv, scoring="f1_weighted")
        return score.mean()
    
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_opt_trials)

    # train model with optimized hyperparameters
    model = GradientBoostingClassifier(max_depth=study.best_params["max_depth"], 
                                    n_estimators=study.best_params["n_estimators"], 
                                    random_state=random_seed,)

    model.fit(train_x, train_y)

    # prediction
    y_pred = model.predict(val_x)
    val_f1_score = round(f1_score(val_y, y_pred),4)
    val_accuracy = round(accuracy_score(val_y, y_pred),4)
    
    print("Optimized Model F1 Score: ", val_f1_score)
    
    # check if no optimization has better f1 score
    if f1_no_opt > val_f1_score:
        print("No Optimization has better f1 score than optimized model")
        model = model_no_opt
        val_f1_score = f1_no_opt
    
    return model, study, val_f1_score, val_accuracy


def train_deicision_tree(train_x: pd.DataFrame, 
                        val_x: pd.DataFrame, 
                        train_y: pd.DataFrame, 
                        val_y: pd.DataFrame,
                        random_seed: int = 123,
                        n_opt_trials = 20,
                        cv: int = 5) -> tuple:

    # Train model without optimization
    model_no_opt = DecisionTreeClassifier(random_state=random_seed)
    model_no_opt.fit(train_x, train_y)
    y_pred = model_no_opt.predict(val_x)
    f1_no_opt = f1_score(val_y, y_pred)

    print(f"F1 Score No Optimization: {f1_no_opt}")
    
    def objective(trial):
        max_depth = trial.suggest_int("max_depth", 2, 32, log=True)
        classifier_obj = DecisionTreeClassifier(
            max_depth=max_depth
        )

        score = cross_val_score(classifier_obj, train_x, train_y, n_jobs=-1, cv=cv, scoring="f1_weighted")
        return score.mean()
    
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_opt_trials)

    # train model with optimized hyperparameters
    model = DecisionTreeClassifier(max_depth=study.best_params["max_depth"],
                                    random_state=random_seed)

    model.fit(train_x, train_y)

    # prediction
    y_pred = model.predict(val_x)
    val_f1_score = round(f1_score(val_y, y_pred),4)
    val_accuracy = round(accuracy_score(val_y, y_pred),4)
    
    print("Optimized Model F1 Score: ", val_f1_score)
    
    # check if no optimization has better f1 score
    if f1_no_opt > val_f1_score:
        print("No Optimization has better f1 score than optimized model")
        model = model_no_opt
        val_f1_score = f1_no_opt
    
    return model, study, val_f1_score, val_accuracy

def train_adaboost(train_x: pd.DataFrame, 
                        val_x: pd.DataFrame, 
                        train_y: pd.DataFrame, 
                        val_y: pd.DataFrame,
                        random_seed: int = 123,
                        n_opt_trials = 20,
                        cv: int = 5) -> tuple:

    # Train model without optimization
    model_no_opt = AdaBoostClassifier(random_state=random_seed)
    model_no_opt.fit(train_x, train_y)
    y_pred = model_no_opt.predict(val_x)
    f1_no_opt = f1_score(val_y, y_pred)

    print(f"F1 Score No Optimization: {f1_no_opt}")
    
    def objective(trial):
        n_estimators = trial.suggest_int("n_estimators", 10, 100)
        learning_rate = trial.suggest_float("learning_rate", 0.01, 1.0, log=True)
        classifier_obj = AdaBoostClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            random_state=random_seed    
        )

        score = cross_val_score(classifier_obj, train_x, train_y, n_jobs=-1, cv=cv, scoring="f1_weighted")
        return score.mean()
    
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_opt_trials)

    # train model with optimized hyperparameters
    model = AdaBoostClassifier(n_estimators=study.best_params["n_estimators"], 
                               learning_rate=study.best_params["learning_rate"],
                               random_state=random_seed  )

    model.fit(train_x, train_y)

    # prediction
    y_pred = model.predict(val_x)
    val_f1_score = round(f1_score(val_y, y_pred),4)
    val_accuracy = round(accuracy_score(val_y, y_pred),4)
    
    print("Optimized Model F1 Score: ", val_f1_score)
    
    # check if no optimization has better f1 score
    if f1_no_opt > val_f1_score:
        print("No Optimization has better f1 score than optimized model")
        model = model_no_opt
        val_f1_score = f1_no_opt
    
    return model, study, val_f1_score, val_accuracy

def train_svc(train_x: pd.DataFrame, 
                        val_x: pd.DataFrame, 
                        train_y: pd.DataFrame, 
                        val_y: pd.DataFrame,
                        random_seed: int = 123,
                        n_opt_trials = 20,
                        cv: int = 5) -> tuple:

    # Train model without optimization
    model_no_opt = SVC(random_state=random_seed)
    model_no_opt.fit(train_x, train_y)
    y_pred = model_no_opt.predict(val_x)
    f1_no_opt = f1_score(val_y, y_pred)

    print(f"F1 Score No Optimization: {f1_no_opt}")
    
    def objective(trial):
        #C = trial.suggest_float("C", 0.1, 10, log=True)
        kernel = trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
        classifier_obj = SVC(
            #C=C,
            kernel=kernel,
            random_state=random_seed    
        )

        score = cross_val_score(classifier_obj, train_x, train_y, n_jobs=-1, cv=cv, scoring="f1_weighted")
        return score.mean()
    
    optuna.logging.set_verbosity(optuna.logging.WARNING)
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_opt_trials)

    # train model with optimized hyperparameters
    model = SVC(kernel=study.best_params["kernel"],
                               random_state=random_seed  )

    model.fit(train_x, train_y)

    # prediction
    y_pred = model.predict(val_x)
    val_f1_score = round(f1_score(val_y, y_pred),4)
    val_accuracy = round(accuracy_score(val_y, y_pred),4)
    
    print("Optimized Model F1 Score: ", val_f1_score)
    
    # check if no optimization has better f1 score
    if f1_no_opt > val_f1_score:
        print("No Optimization has better f1 score than optimized model")
        model = model_no_opt
        val_f1_score = f1_no_opt
    
    return model, study, val_f1_score, val_accuracy

In [46]:
def split_train_test_data(df_train: pd.DataFrame,
                df_test: pd.DataFrame,) -> tuple:
    """
    - Split the data into features (X) and target variable (y)
    - OneHot-Encoding for categorical columns
    - Split the data into training. validation and testing sets
    
    Args:
        df (pd.DataFrame): DataFrame train data
        df_test (pd.DataFrame): DataFrame test data

    Returns:
        tuple: train_x, test_y, val_x, val_y, X_test, y_test
    """
    
        # Define features (X) and target variable (y)
    X_train = df_train.drop(columns=['user_of_latest_model'])  # Features
    y_train = df_train['user_of_latest_model']  # Target variable
    
    X_test = df_test.drop(columns=['user_of_latest_model'])  # Features
    y_test = df_test['user_of_latest_model']  # Target variable

    # OneHot-Encoding for categorical columns
    X_train = pd.get_dummies(X_train, columns=df_train.select_dtypes(include=['object']).columns.to_list(), drop_first=True)
    X_test = pd.get_dummies(X_test, columns=df_test.select_dtypes(include=['object']).columns.to_list(), drop_first=True)
    
    train_x, val_x, train_y, val_y = train_test_split(X_train, y_train, test_size=0.2, random_state=123)
    
    return train_x, val_x, train_y, val_y, X_test, y_test


def train_test_model(df_train: pd.DataFrame,
                        df_test: pd.DataFrame, 
                        model_name: str, 
                        models_func: str,
                        n_opt_trials: int= 20) -> pd.DataFrame: 
    """
    - Adjust format of categorical columns
    - train model on full training data
    - Make predictions on the test data
    - Evaluate the model using accuracy and F1-score

    Args:
        df (pd.DataFrame): DataFrame train data
        df_test (pd.DataFrame): DataFrame test data
        model_name (str): model name

    Returns:
        pd.DataFrame: df metrics
    """
    
    train_x, val_x, train_y, val_y, X_test, y_test = split_train_test_data(df_train, df_test)
           
    # Train the model
    model, study, val_f1_score, val_accuracy = models_func[model_name](train_x, val_x, train_y, val_y, n_opt_trials=n_opt_trials)
    
    # Evaluate the model
    y_pred = model.predict(X_test)
    test_accuracy = round(accuracy_score(y_test, y_pred),4)
    test_f1_score = round(f1_score(y_test, y_pred),4)

    # Store results in DataFrame
    df_results = pd.DataFrame({'Model_Name': [model_name], 
                               'val_f1_score': [val_f1_score], 
                               'val_accuracy': [val_accuracy], 
                               'test_f1_score': [test_f1_score],
                               'test_accuracy': [test_accuracy]})

    return df_results, study

## Import Data

In [25]:
# Store the dataframe to use them in the next step
df_train = pd.read_csv("../../data/processed/task2_best_model_step3_train_data.csv")
df_test = pd.read_csv("../../data/processed/task2_best_model_step3_test_data.csv")

## Model Evaluation

In [38]:
models_func = {'RandomForest': train_random_forest, 
               'GradientBoosting': train_gradient_boosting, 
               'DecisionTree': train_deicision_tree,
               'AdaBoost': train_adaboost, 
               'SVC': train_svc}

In [27]:
df_results = pd.DataFrame()

### Random Forest

In [28]:
model_name = 'RandomForest'
n_opt_trials = 20
df_results_new, study = train_test_model(df_train=df_train, 
                                     df_test=df_test, 
                                     model_name=model_name, 
                                     models_func=models_func,
                                     n_opt_trials=n_opt_trials)

df_results = pd.concat([df_results, df_results_new], axis=0)

F1 Score No Optimization: 0.7225806451612903
Optimized Model F1 Score:  0.698
No Optimization has better f1 score than optimized model


In [29]:
df_results

Unnamed: 0,Model_Name,val_f1_score,val_accuracy,test_f1_score,test_accuracy
0,RandomForest,0.722581,0.7704,0.7097,0.766


### Gradient Boosting Classifier

In [30]:
model_name = 'GradientBoosting' 
n_opt_trials = 20
df_results_new, study = train_test_model(df_train=df_train, 
                                     df_test=df_test, 
                                     model_name=model_name, 
                                     models_func=models_func,
                                     n_opt_trials=n_opt_trials)
df_results = pd.concat([df_results, df_results_new], axis=0)

F1 Score No Optimization: 0.6951219512195121
Optimized Model F1 Score:  0.7152


In [32]:
df_results

Unnamed: 0,Model_Name,val_f1_score,val_accuracy,test_f1_score,test_accuracy
0,RandomForest,0.722581,0.7704,0.7097,0.766
0,GradientBoosting,0.7152,0.7602,0.6856,0.734


### Decision Tree

In [33]:
model_name = 'DecisionTree'
n_opt_trials = 20
df_results_new, study = train_test_model(df_train=df_train, 
                                     df_test=df_test, 
                                     model_name=model_name, 
                                     models_func=models_func,
                                     n_opt_trials=n_opt_trials)
df_results = pd.concat([df_results, df_results_new], axis=0)

F1 Score No Optimization: 0.6190476190476191
Optimized Model F1 Score:  0.686


In [34]:
df_results

Unnamed: 0,Model_Name,val_f1_score,val_accuracy,test_f1_score,test_accuracy
0,RandomForest,0.722581,0.7704,0.7097,0.766
0,GradientBoosting,0.7152,0.7602,0.6856,0.734
0,DecisionTree,0.686,0.7245,0.6465,0.696


### AdaBoost

In [35]:
model_name = 'AdaBoost'
n_opt_trials = 20
df_results_new, study = train_test_model(df_train=df_train, 
                                     df_test=df_test, 
                                     model_name=model_name, 
                                     models_func=models_func,
                                     n_opt_trials=n_opt_trials)
df_results = pd.concat([df_results, df_results_new], axis=0)

F1 Score No Optimization: 0.6666666666666666
Optimized Model F1 Score:  0.671


In [36]:
df_results

Unnamed: 0,Model_Name,val_f1_score,val_accuracy,test_f1_score,test_accuracy
0,RandomForest,0.722581,0.7704,0.7097,0.766
0,GradientBoosting,0.7152,0.7602,0.6856,0.734
0,DecisionTree,0.686,0.7245,0.6465,0.696
0,AdaBoost,0.671,0.7398,0.6717,0.738


### Support Vector Machine Classifier

In [None]:
model_name = 'SVC'
n_opt_trials = 20
df_results_new, study = train_test_model(df_train=df_train, 
                                     df_test=df_test, 
                                     model_name=model_name, 
                                     models_func=models_func,
                                     n_opt_trials=n_opt_trials)
df_results = pd.concat([df_results, df_results_new], axis=0)

F1 Score No Optimization: 0.5


In [None]:
df_results