In [None]:
import sys , os
import pandas as pd
import numpy as np
# Make the helper modules under ../scripts importable from this notebook.
# Guarded so that re-running the cell does not append the same entry again.
SCRIPTS_DIR = os.path.abspath('../scripts')
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

In [None]:
import Modelling as mc


In [None]:
# Location of the input CSV, relative to the notebook's working directory.
path = '../data/proceced_data.csv'  
# Load it through the project helper module (imported as `mc`).
data = mc.load_data(path)
# Preview the first rows of the loaded data.
data.head()

In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# the helper module are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# NOTE(review): this rebinds `data` to its own preprocessed version, so running
# this cell twice feeds already-preprocessed data back into preprocess_data.
# The next cell checks `data is not None`, so preprocess_data presumably
# returns None on failure — confirm against Modelling.
data = mc.preprocess_data(data)

In [None]:
# Clean up infinite and missing values in the preprocessed frame.
# Skipped (with a message) when the preprocessing step produced no data.
if data is not None:
    # Replace infinite values with NaN so they are imputed together with the
    # genuinely missing entries below.
    data = data.replace([np.inf, -np.inf], np.nan)

    # A median only exists for numeric columns; calling .median() or .abs() on
    # text/object columns raises, so restrict the numeric steps to these.
    numeric_columns = data.select_dtypes(include="number").columns

    # Replace NaN values with the median of the respective column
    for column in numeric_columns:
        if data[column].isnull().any():
            # Assign the result back instead of `data[column].fillna(..., inplace=True)`:
            # the in-place call on a selected column is chained assignment, which
            # warns in pandas 2.x and does not modify `data` under Copy-on-Write.
            data[column] = data[column].fillna(data[column].median())
            print(f"Missing values in '{column}' after imputation with median:\n", data[column].isnull().sum())

    # Verify there are no more infinite or very large values
    infinite_values_post = data.isin([np.inf, -np.inf]).sum()
    large_values_post = (data[numeric_columns].abs() > np.finfo(np.float64).max).sum()

    print(f"Infinite values after cleaning: {infinite_values_post.sum()}")
    print(f"Very large values after cleaning: {large_values_post.sum()}")
else:
    print("Failed to handle infinite or very large values as preprocessing step failed")

In [None]:
# Preview the first rows again, after the infinite/missing-value cleanup cell.
data.head()


In [None]:
# Split the data into four parts via the helper module; judging by the names,
# these are train/test features and train/test targets.
# NOTE(review): the split ratio, target column and random seed are decided inside
# mc.split_the_data and are not visible here — confirm the split is reproducible.
X_train , X_test , y_train , y_test  = mc.split_the_data(data)


In [None]:
# Preview the first rows of the training features.
X_train.head()


In [None]:
# Preview the first entries of the training target.
y_train.head()


In [None]:
# Train both models through the helper module; it returns the two estimators.
# NOTE(review): `tain_the_models` looks like a typo for `train_the_models`, but
# the call must match the name defined in Modelling — rename both together.
# NOTE(review): X_test is passed in, yet only the two models come back. Later
# cells use X_train_scaled / X_test_scaled, which no visible cell defines.
logistic_model , random_forest_model = mc.tain_the_models(X_train,y_train,X_test) 


In [None]:
# Display the object returned as `logistic_model`.
logistic_model


In [None]:
# Display the object returned as `random_forest_model`.
random_forest_model


In [None]:
# Evaluate the random forest on the held-out test split via the helper module.
mc.evaluate_models(random_forest_model,X_test,y_test)


In [None]:
# NOTE(review): exact duplicate of the previous cell (same model, same inputs);
# candidate for removal.
mc.evaluate_models(random_forest_model,X_test,y_test)


In [None]:
# Evaluate the logistic model on the scaled test features.
# NOTE(review): X_test_scaled is not defined in any visible cell, so this raises
# NameError on a fresh kernel. It must be produced by the same scaler that was
# fitted when the logistic model was trained — expose it from Modelling.
mc.evaluate_models(logistic_model,X_test_scaled,y_test)


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# 5-fold cross-validation on the training split. cross_val_score clones the
# estimator for every fold, so the already-fitted models are left untouched.
scores_rf = cross_val_score(random_forest_model, X_train, y_train, cv=5)

# No visible cell defines X_train_scaled, so the scaling is done inside a
# pipeline on the raw training features instead. Each fold then fits its own
# scaler on that fold's training part only, which also avoids leaking
# validation-fold statistics into the scaling.
# NOTE(review): assumes standardisation is the scaling the logistic model is
# meant to use — confirm against Modelling.
scaled_logistic_model = make_pipeline(StandardScaler(), logistic_model)
scores_lr = cross_val_score(scaled_logistic_model, X_train, y_train, cv=5)

print("Random Forest CV Accuracy: ", scores_rf.mean())
print("Logistic Regression CV Accuracy: ", scores_lr.mean())

In [None]:
# Sanity-check the training split: report each object's shape, then its Python type.
split_report = (
    ("X_train shape:", X_train.shape),
    ("y_train shape:", y_train.shape),
    ("Data type of X_train:", type(X_train)),
    ("Data type of y_train:", type(y_train)),
)
for caption, detail in split_report:
    print(caption, detail)
 

In [None]:
from Modelling import define_hyperparameter_grids, perform_grid_search


In [None]:
# Estimators to tune, keyed by a human-readable name.
# NOTE(review): presumably these keys must match the keys of the grids returned
# by define_hyperparameter_grids — verify against Modelling.
models = {
    "Logistic Regression": logistic_model,
    "Random Forest": random_forest_model
}

In [None]:
# Fetch the hyperparameter grids; their contents are defined in Modelling.
param_grids = define_hyperparameter_grids()


In [None]:
# Run the grid search for the entries of `models` on the training split.
# NOTE(review): X_train is passed unscaled here, while other cells feed the
# logistic model scaled features — confirm perform_grid_search handles scaling.
best_models = perform_grid_search(models, param_grids, X_train, y_train)


In [None]:
from Modelling import evaluate_best_models

# Score the grid-search winners on the held-out test split.
# NOTE(review): X_test is passed unscaled for every model — confirm this matches
# how the best models were fitted.
test_results = evaluate_best_models(best_models, X_test, y_test)

# Build a DataFrame from the results and print its transpose (`.T`) as plain text.
print(pd.DataFrame(test_results).T)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the logistic model's predictions on the scaled test features.
# NOTE(review): X_test_scaled is not defined in any visible cell, so this raises
# NameError on a fresh kernel — it must come from the scaler used in training.
cm = confusion_matrix(y_test, logistic_model.predict(X_test_scaled))
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

# Confusion matrix of the random forest's predictions on the unscaled test
# features. `cm` and `disp` are rebound, so only these values remain afterwards.
cm = confusion_matrix(y_test, random_forest_model.predict(X_test))
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Order the features from most to least important according to the forest's
# feature_importances_ attribute.
importance = random_forest_model.feature_importances_
indices = np.argsort(importance)[::-1]
n_features = X_train.shape[1]
bar_positions = list(range(n_features))

# Explicit figure/axes interface rather than the pyplot state machine, with
# axis labels so the chart can be read on its own.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(bar_positions, importance[indices], align="center")
ax.set_title("Feature Importance in Random Forest")
ax.set_xlabel("Feature")
ax.set_ylabel("Importance")
ax.set_xticks(bar_positions)
ax.set_xticklabels(X_train.columns[indices], rotation=90)
# Keep the rotated tick labels inside the figure area.
fig.tight_layout()
plt.show()

In [None]:
# Re-print the mean cross-validation scores computed in the cross-validation
# cell; nothing is re-evaluated here.
# Random forest
print("Random Forest CV Accuracy: ", scores_rf.mean())

# Logistic regression
print("Logistic Regression CV Accuracy: ", scores_lr.mean())

In [None]:
# NOTE(review): exact duplicate of the previous cell — it only re-prints the
# stored cross-validation means; candidate for removal.
# Random forest
print("Random Forest CV Accuracy: ", scores_rf.mean())

# Logistic regression
print("Logistic Regression CV Accuracy: ", scores_lr.mean())