In [None]:
# Import necessary libraries and modules
import sys
sys.path.append("ACC_PROJECT")
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from fastai.tabular.all import *
from fastbook import cluster_columns
from dtreeviz.trees import *
import torch as t
# Explicit import, placed after the wildcard imports so nothing they export can shadow it.
# (Original import order is preserved on purpose: wildcard imports make it order-sensitive.)
from sklearn.tree import plot_tree

# Import custom functions
from functions.model_evaluator import (
    evaluation,
) 

from functions.value_encoding import DataEncoder


# Read the preprocessed CSV and discard the "description" column in one expression
df = pd.read_csv("processed_data.csv", low_memory=False).drop(columns="description")


# Pick the compute device: use CUDA when torch reports it as available, otherwise the CPU
if t.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


In [None]:
# Hold out 5% of the rows as a validation set, stratified on the label column
df_wo_valid, df_valid = train_test_split(
    df, test_size=0.05, random_state=1, stratify=df.label
)
# Renumber both frames from 0. `drop=True` discards the old index instead of
# inserting it as an "index" column that then has to be dropped again.
df_wo_valid = df_wo_valid.reset_index(drop=True)
# Fix: the validation frame was previously rebuilt from `df_wo_valid`, which
# silently replaced the validation set with a copy of the training set.
df_valid = df_valid.reset_index(drop=True)


In [None]:
# Build the encoder from the full frame `df`, then encode the training and
# validation frames with that same encoder (training first, validation second)
encoder = DataEncoder(df)
encoded_training, encoded_validation = (
    encoder.transform(frame) for frame in (df_wo_valid, df_valid)
)



In [None]:
# Display the encoded training frame as the cell output
encoded_training

In [None]:
# Carve a test set out of the encoded training data via the encoder's own splitter
TEST_FRACTION = 0.2  # share of rows passed as `test_size`
SPLIT_SEED = 2  # passed as `random_state`
X_train, X_test, y_train, y_test = encoder.split_data(
    encoded_training,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)


In [None]:
# Show the shape of each split, in the order: X_train, X_test, y_train, y_test
tuple(part.shape for part in (X_train, X_test, y_train, y_test))


In [None]:
# Display the training features as the cell output
X_train

In [None]:
# Fit a small decision tree (at most 12 leaves) and draw it
m = DecisionTreeClassifier(max_leaf_nodes=12)
m.fit(X_train, y_train)

fig = plt.figure(figsize=(25, 20))
# `plot_tree` is imported explicitly at the top of the notebook instead of relying
# on a `tree` name leaking out of a wildcard import. The feature names are passed
# as a plain list because some scikit-learn releases reject a pandas Index here.
_ = plot_tree(m, feature_names=list(X_train.columns), filled=True)

# Print the fraction of test rows the tree classifies correctly
print("decision tree prediction", (m.predict(X_test) == y_test).sum() / len(y_test))


In [None]:
# Tabulate a fitted model's feature importances, most important first
def rf_feat_importance(m, df):
    """Return a DataFrame of feature importances sorted in descending order.

    Parameters
    ----------
    m : fitted estimator exposing ``feature_importances_``.
    df : DataFrame whose ``columns`` name the features, in the same order as
        ``m.feature_importances_``.

    Returns
    -------
    DataFrame with columns ``"cols"`` (feature name) and ``"imp"`` (importance),
    sorted by ``"imp"`` from highest to lowest; the original row labels are kept.
    """
    importance_table = pd.DataFrame(
        {"cols": df.columns, "imp": m.feature_importances_}
    )
    return importance_table.sort_values(by="imp", ascending=False)


# Importances of the decision tree fitted above; show the ten largest
fi = rf_feat_importance(m, X_train)
fi.head(10)


In [None]:
# Function to perform grid search for optimal hyperparameters
def optimal_param_search(model, param_grid, X=None, y=None, cv=5):
    """Run an exhaustive grid search and return the best parameter combination.

    Parameters
    ----------
    model : estimator instance to tune.
    param_grid : dict mapping parameter names to the lists of values to try.
    X, y : optional training features and labels. When both are omitted the
        notebook-level ``X_train`` / ``y_train`` are used, which is what every
        existing two-argument call relies on.
    cv : number of cross-validation folds (default 5, the previous fixed value).

    Returns
    -------
    dict : the ``best_params_`` found by the search.

    Raises
    ------
    ValueError : if only one of ``X`` / ``y`` is supplied.
    """
    # Refuse a half-specified dataset rather than silently mixing caller data
    # with the notebook globals.
    if (X is None) != (y is None):
        raise ValueError("X and y must be provided together (or both omitted).")
    if X is None:
        X, y = X_train, y_train

    grid_search = GridSearchCV(
        model,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,  # use all available cores
        verbose=2,
    )
    grid_search.fit(X, y)

    optimal_params = grid_search.best_params_
    print("Best Score = ", grid_search.best_score_)
    print("Optimal Parameters:", optimal_params)
    return optimal_params


Logistic Regression


In [None]:
# Search space for logistic regression: values tried for C and for the iteration cap
param_grid_lr = dict(
    C=[0.1, 1, 10, 100],
    max_iter=[100, 1000, 10000],
)

optimal_params_lr = optimal_param_search(LogisticRegression(), param_grid_lr)


In [None]:
# Refit logistic regression with the tuned parameters (`fit` returns the
# estimator itself) and evaluate it on the test split
lr_optimal = LogisticRegression(**optimal_params_lr).fit(X_train, y_train)
evaluation(lr_optimal, X_test, y_test)


KNN

In [None]:
# Search space for k-nearest neighbours
param_grid_knn = dict(
    n_neighbors=[3, 5, 11, 19],
    weights=["uniform", "distance"],
    metric=["euclidean", "manhattan"],
)

optimal_params_knn = optimal_param_search(KNeighborsClassifier(), param_grid_knn)


In [None]:
# Refit k-nearest neighbours with the tuned parameters (`fit` returns the
# estimator itself) and evaluate it on the test split
knn_optimal = KNeighborsClassifier(**optimal_params_knn).fit(X_train, y_train)
evaluation(knn_optimal, X_test, y_test)


SVM

In [None]:
# Search space for the support vector classifier
param_grid_svm = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=["rbf", "poly", "sigmoid"],
)
optimal_params_svm = optimal_param_search(SVC(), param_grid_svm)


In [None]:

# Refit the SVM with the tuned parameters (`fit` returns the estimator itself)
# and evaluate it on the test split
svm_optimal = SVC(**optimal_params_svm).fit(X_train, y_train)
evaluation(svm_optimal, X_test, y_test)

Decision Tree

In [None]:
# Search space for the decision tree
param_grid_dt = dict(
    max_depth=[5, 10, 20, 30, None],
    min_samples_split=[2, 5, 10, 20, 40],
    min_samples_leaf=[1, 2, 4, 8],
)
optimal_params_dt = optimal_param_search(DecisionTreeClassifier(), param_grid_dt)


In [None]:
# Refit the decision tree with the tuned parameters (`fit` returns the
# estimator itself) and evaluate it on the test split
dt_optimal = DecisionTreeClassifier(**optimal_params_dt).fit(X_train, y_train)
evaluation(dt_optimal, X_test, y_test)


Random Forest


In [None]:
# Search space for the random forest
param_grid_rf = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, 30, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    # "auto" was only an alias of "sqrt" for classifiers; it was deprecated in
    # scikit-learn 1.1 and removed in 1.3, where every candidate using it fails
    # to fit. On older versions it just duplicated the "sqrt" runs. "log2" is
    # searched instead so the grid still compares two distinct settings.
    "max_features": ["sqrt", "log2"],
}

optimal_params_rf = optimal_param_search(RandomForestClassifier(), param_grid_rf)


In [None]:

# Refit the random forest with the tuned parameters (`fit` returns the
# estimator itself) and evaluate it on the test split
rf_optimal = RandomForestClassifier(**optimal_params_rf).fit(X_train, y_train)
evaluation(rf_optimal, X_test, y_test)

GBM

In [None]:
# Search space for the gradient boosting classifier
param_grid_gbm = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
)

optimal_params_gbm = optimal_param_search(GradientBoostingClassifier(), param_grid_gbm)



In [None]:
# Refit gradient boosting with the tuned parameters (`fit` returns the
# estimator itself) and evaluate it on the test split
gbm_optimal = GradientBoostingClassifier(**optimal_params_gbm).fit(X_train, y_train)
evaluation(gbm_optimal, X_test, y_test)

In [None]:
# Feature names and the importances the tuned GBM assigns to them
features = X_train.columns
feature_importance = gbm_optimal.feature_importances_

# Tabulate the importances, largest first
feature_importance_df = pd.DataFrame(
    {"Feature": features, "Importance": feature_importance}
).sort_values(by="Importance", ascending=False)

# Bar chart with one bar per feature
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance_df, x="Importance", y="Feature")
plt.title("GBM Model Feature Importances")
plt.ylabel("Feature")
plt.xlabel("Importance")
plt.show()


In [None]:
def analyze_confidence_threshold(model, X_test, y_test, probabilities, threshold):
    """Count correct/incorrect predictions above and below a confidence threshold.

    For each of the first ``len(X_test)`` rows, the predicted class is the entry
    of ``model.classes_`` with the highest probability, and the confidence is
    that highest probability. A row counts as "above" when its confidence is
    ``>= threshold``.

    Parameters
    ----------
    model : object exposing ``classes_``, ordered like the columns of
        ``probabilities``.
    X_test : sized collection of test rows; only its length is used.
    y_test : true labels as a 1-D array-like (pandas Series, list, ndarray) or
        a single-column frame, positionally aligned with ``probabilities``.
    probabilities : 2-D array-like of shape (rows, classes).
    threshold : minimum top probability for a prediction to count as confident.

    Returns
    -------
    tuple of four ints:
        (correct_above, correct_below, incorrect_above, incorrect_below)

    Raises
    ------
    IndexError : if ``y_test`` or ``probabilities`` has fewer rows than
        ``X_test`` (same exception type the row-by-row lookup used to raise).
    """
    n_samples = len(X_test)
    if n_samples == 0:
        # Nothing to score; also avoids an axis error on an empty probability list.
        return (0, 0, 0, 0)

    proba = np.asarray(probabilities)
    # Flatten so a single-column frame of labels lines up with the predictions.
    y_true = np.asarray(y_test).ravel()
    if proba.shape[0] < n_samples or y_true.shape[0] < n_samples:
        raise IndexError(
            "y_test and probabilities must contain at least len(X_test) rows"
        )
    # Only the first len(X_test) rows are examined, as before.
    proba = proba[:n_samples]
    y_true = y_true[:n_samples]

    # Vectorised equivalent of the per-row argmax / max / comparison.
    predicted = np.asarray(model.classes_)[proba.argmax(axis=1)]
    is_correct = predicted == y_true
    is_confident = proba.max(axis=1) >= threshold

    return (
        int(np.count_nonzero(is_correct & is_confident)),
        int(np.count_nonzero(is_correct & ~is_confident)),
        int(np.count_nonzero(~is_correct & is_confident)),
        int(np.count_nonzero(~is_correct & ~is_confident)),
    )


# Class probabilities of the tuned GBM on the test split. This was never
# computed before being used below, so a fresh Restart & Run All raised NameError.
probabilities_gbm = gbm_optimal.predict_proba(X_test)

# Confidence threshold: a prediction counts as confident when its top
# probability is at least 0.8 (80%)
threshold = 0.8

# Analyze for GBM model
results_gbm = analyze_confidence_threshold(
    gbm_optimal, X_test, y_test, probabilities_gbm, threshold
)

# Print results
print(
    "GBM Results (Correct Above, Correct Below, Incorrect Above, Incorrect Below):",
    results_gbm,
)
