In [1]:
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier, StackingClassifier

# Validation of the models
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score, confusion_matrix, classification_report, roc_curve, auc

# Preprocessing and transformations
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import joblib

In [2]:
# Load the pre-split feature tables and label columns (train first, then test)
train_features = pd.read_csv("../data/processed/train_features.csv")
train_labels = pd.read_csv("../data/processed/train_labels.csv")
test_features = pd.read_csv("../data/processed/test_features.csv")
test_labels = pd.read_csv("../data/processed/test_labels.csv")

In [3]:
# Cast the "income" label column to a categorical dtype in both label frames,
# printing each frame's summary right after its conversion (train, then test).
for labels_frame in (train_labels, test_labels):
    labels_frame["income"] = pd.Categorical(labels_frame["income"])
    labels_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24129 entries, 0 to 24128
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   income  24129 non-null  category
dtypes: category(1)
memory usage: 23.8 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6033 entries, 0 to 6032
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   income  6033 non-null   category
dtypes: category(1)
memory usage: 6.1 KB


In [4]:
# Cast every object-typed feature column to a categorical dtype, first in the
# training frame and then in the test frame, printing a summary of each frame
# once it has been converted.
for features_frame in (train_features, test_features):
    # Columns still stored as generic Python objects (strings) in this frame
    categorical_columns = features_frame.select_dtypes(include='object').columns

    for col in categorical_columns:
        features_frame[col] = pd.Categorical(features_frame[col])

    features_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24129 entries, 0 to 24128
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   age                     24129 non-null  int64   
 1   workclass               24129 non-null  category
 2   fnlwgt                  24129 non-null  int64   
 3   education.num           24129 non-null  int64   
 4   marital.status          24129 non-null  category
 5   occupation              24129 non-null  category
 6   relationship            24129 non-null  category
 7   race                    24129 non-null  category
 8   sex                     24129 non-null  category
 9   capital.gain            24129 non-null  int64   
 10  capital.loss            24129 non-null  int64   
 11  hours.per.week          24129 non-null  int64   
 12  native.country.grouped  24129 non-null  category
 13  education.grouped       24129 non-null  category
dtypes: category(8), int64(

In [5]:
# Collapse each single-column label frame with DataFrame.squeeze()
train_labels, test_labels = (
    labels_frame.squeeze() for labels_frame in (train_labels, test_labels)
)

In [6]:
# Identify categorical and numerical columns for training models.
# The object columns were converted to the pandas `category` dtype earlier in
# this notebook, so selecting only 'object' would return an empty index;
# 'category' must be included to pick them up.
categorical_columns = train_features.select_dtypes(include=['object', 'category']).columns
numerical_columns = train_features.select_dtypes(include=['int64', 'float64']).columns

In [7]:
class CollinearityRemover(BaseEstimator, TransformerMixin):
    """Drop features that are highly correlated with an earlier feature.

    ``fit`` computes the absolute correlation matrix between the columns of
    ``X`` (via ``np.corrcoef``). For every pair of columns whose absolute
    correlation exceeds ``threshold``, the column with the higher index is
    marked for removal. ``transform`` deletes the marked columns.

    Parameters
    ----------
    threshold : float, default=0.9
        Absolute correlation above which the later column of a pair is dropped.

    Attributes
    ----------
    to_remove_ : ndarray of int or None
        Sorted, unique indices of the columns removed by ``transform``.
        ``None`` until ``fit`` has been called.
    """

    def __init__(self, threshold=0.9):
        self.threshold = threshold
        # Kept for backward compatibility: while this is None (i.e. before
        # fit), transform() returns its input unchanged.
        self.to_remove_ = None

    def fit(self, X, y=None):
        """Learn which columns of ``X`` to drop; ``y`` is ignored. Returns ``self``."""
        # np.corrcoef collapses to a 0-d value when X has a single column,
        # which np.triu cannot handle; atleast_2d keeps it a (1, 1) matrix.
        corr_matrix = np.atleast_2d(np.corrcoef(X, rowvar=False))

        # Keep only the strict upper triangle so each pair is inspected once
        # and the diagonal (self-correlation of 1) is ignored.
        upper_triangle = np.triu(np.abs(corr_matrix), k=1)

        # Column index of every pair above the threshold. A column correlated
        # with several earlier columns appears more than once, so deduplicate.
        self.to_remove_ = np.unique(np.where(upper_triangle > self.threshold)[1])
        return self

    def transform(self, X):
        """Return ``X`` without the columns selected during ``fit``."""
        if self.to_remove_ is not None:
            return np.delete(X, self.to_remove_, axis=1)
        return X

# Explicit column groups used by every ColumnTransformer below
numerical_columns = [
    'age',
    'fnlwgt',
    'education.num',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
]
categorical_columns = [
    'workclass',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country.grouped',
    'education.grouped',
]

# Numerical branch: standardise, then drop highly correlated columns
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('collinearity', CollinearityRemover(threshold=0.9)),
])

# Full preprocessing: numerical branch + one-hot encoding of the categoricals
# (first level of each categorical dropped)
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_columns),
    ('cat', OneHotEncoder(drop='first'), categorical_columns),
])

## Logistic Regression

In [9]:
model = LogisticRegression(max_iter=1000)

# Pipeline: Preprocessing -> Feature Selection -> Model
logistic_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('var_thresh', VarianceThreshold(threshold=0.01)),  # Removes low-variance features
    ('model', model)
])

# Candidate values for the logistic regression parameter C
param_grid = {
    'model__C': [0.01, 0.1, 1, 10, 100]
}

# 10-fold cross-validated grid search, scored by F1 on the positive class (label 1)
grid_search = GridSearchCV(
    logistic_pipeline,
    param_grid=param_grid,
    cv=10,
    scoring=make_scorer(f1_score, pos_label=1),
    verbose=1
)

grid_search.fit(train_features, train_labels)

print("Best Parameters:", grid_search.best_params_)

y_pred = grid_search.predict(test_features)

# Evaluation
print("Confusion Matrix:")
print(confusion_matrix(test_labels, y_pred))
print("\nClassification Report:")
print(classification_report(test_labels, y_pred))

# Persist the fitted, tuned pipeline. GridSearchCV fits clones of the estimator
# it is given, so `logistic_pipeline` itself is never fitted; dumping it would
# store a model that cannot predict after being reloaded.
logistic_regression_filename = "models/logistic_regression_model.pkl"
joblib.dump(grid_search.best_estimator_, logistic_regression_filename)
print(f"Logistic Regression model saved as {logistic_regression_filename}")


Fitting 10 folds for each of 5 candidates, totalling 50 fits
Best Parameters: {'model__C': 1}
Confusion Matrix:
[[4174  359]
 [ 601  899]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.92      0.90      4533
           1       0.71      0.60      0.65      1500

    accuracy                           0.84      6033
   macro avg       0.79      0.76      0.77      6033
weighted avg       0.83      0.84      0.84      6033

Random Forest model saved as models/logistic_regression_model.pkl


## KNN

In [12]:
# KNN
model = KNeighborsClassifier()

# Pipeline: Preprocessing -> Model
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Neighbourhood size, vote weighting and Minkowski power (1 = Manhattan, 2 = Euclidean)
param_grid = {
    'model__n_neighbors': [3, 5, 7, 9, 11],
    'model__weights': ['uniform', 'distance'],
    'model__p': [1, 2]
}

# 10-fold cross-validated grid search, scored by F1 on the positive class (label 1)
grid_search = GridSearchCV(
    knn_pipeline,
    param_grid=param_grid,
    cv=10,
    scoring=make_scorer(f1_score, pos_label=1),
    verbose=1
)

grid_search.fit(train_features, train_labels)

print("Best Parameters:", grid_search.best_params_)

y_pred = grid_search.predict(test_features)

# Evaluation
print("Confusion Matrix:")
print(confusion_matrix(test_labels, y_pred))
print("\nClassification Report:")
print(classification_report(test_labels, y_pred))

# Persist the fitted, tuned pipeline. GridSearchCV fits clones of the estimator
# it is given, so `knn_pipeline` itself is never fitted; dumping it would store
# a model that cannot predict after being reloaded.
knn_filename = "models/knn_model.pkl"
joblib.dump(grid_search.best_estimator_, knn_filename)
print(f"KNN model saved as {knn_filename}")

Fitting 10 folds for each of 20 candidates, totalling 200 fits
Best Parameters: {'model__n_neighbors': 9, 'model__p': 2, 'model__weights': 'uniform'}
Confusion Matrix:
[[4104  429]
 [ 603  897]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      4533
           1       0.68      0.60      0.63      1500

    accuracy                           0.83      6033
   macro avg       0.77      0.75      0.76      6033
weighted avg       0.82      0.83      0.83      6033

KNN model saved as /content/drive/My Drive/models/knn_model.pkl


## Naive Bayes

In [11]:
# Numerical branch: standardise, then drop highly correlated columns
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('collinearity', CollinearityRemover(threshold=0.9)),
])

# Categorical branch for Naive Bayes: one-hot encode into a dense array
# (sparse_output=False)
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(drop='first', sparse_output=False)),
])

preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_columns),
    ('cat', categorical_pipeline, categorical_columns),
])

model = GaussianNB()

# Pipeline: Preprocessing -> low-variance feature filter -> Model
naive_bayes_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('var_thresh', VarianceThreshold(threshold=0.01)),
    ('model', model),
])

# Naive Bayes hyperparameter candidates
param_grid = {'model__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]}

# 10-fold cross-validated grid search, scored by F1 on the positive class (label 1)
f1_positive_scorer = make_scorer(f1_score, pos_label=1)
grid_search = GridSearchCV(
    naive_bayes_pipeline,
    param_grid=param_grid,
    cv=10,
    scoring=f1_positive_scorer,
    verbose=1,
)
grid_search.fit(train_features, train_labels)

print("Best Parameters:", grid_search.best_params_)

# Evaluation on the held-out test set
y_pred = grid_search.predict(test_features)
print("Confusion Matrix:")
print(confusion_matrix(test_labels, y_pred))
print("\nClassification Report:")
print(classification_report(test_labels, y_pred))

# Persist the refitted best pipeline
model_filename = "models/naive_bayes_model.pkl"
joblib.dump(grid_search.best_estimator_, model_filename)
print(f"Model saved as {model_filename}")


Fitting 10 folds for each of 4 candidates, totalling 40 fits
Best Parameters: {'model__var_smoothing': 1e-06}
Confusion Matrix:
[[3294 1239]
 [ 246 1254]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.73      0.82      4533
           1       0.50      0.84      0.63      1500

    accuracy                           0.75      6033
   macro avg       0.72      0.78      0.72      6033
weighted avg       0.82      0.75      0.77      6033

Model saved as /content/drive/My Drive/models/naive_bayes_model.pkl


## Bagging

In [None]:
# Preprocessing for the tree ensemble: scale numericals, one-hot encode categoricals
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(drop='first'), categorical_columns)
    ]
)

# Base model: Decision Tree
base_model = DecisionTreeClassifier(random_state=42)

bagging_model = BaggingClassifier(
    estimator=base_model,
    random_state=42
)

# Complete Pipeline
bagging_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', bagging_model)
])

param_grid = {
    'model__n_estimators': [100, 250, 500],
    'model__max_samples': [0.5, 0.75, 1.0],
    'model__max_features': [0.5, 0.75, 1.0],
    'model__bootstrap': [True, False],
    'model__bootstrap_features': [True, False]  # Bootstrap the features
}

# 10-fold cross-validated grid search, scored by F1 on the positive class (label 1).
# 108 candidates x 10 folds = 1080 fits, so run them in parallel on all cores
# (same setting as the Random Forest search in this notebook).
grid_search = GridSearchCV(
    bagging_pipeline,
    param_grid=param_grid,
    cv=10,
    scoring=make_scorer(f1_score, pos_label=1),
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_features, train_labels)

print("Best Parameters:", grid_search.best_params_)

y_pred = grid_search.predict(test_features)

# Evaluation
print("Confusion Matrix:")
print(confusion_matrix(test_labels, y_pred))
print("\nClassification Report:")
print(classification_report(test_labels, y_pred))

# Persist the fitted, tuned pipeline. GridSearchCV fits clones of the estimator
# it is given, so `bagging_pipeline` itself is never fitted; dumping it would
# store a model that cannot predict after being reloaded.
bagging_filename = "models/bagging_model.pkl"
joblib.dump(grid_search.best_estimator_, bagging_filename)
print(f"Bagging model saved as {bagging_filename}")

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


## Gradient boosting

In [None]:
# Preprocessing: scale numericals, one-hot encode categoricals
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(drop='first'), categorical_columns)
    ]
)

# Gradient Boosting
model = GradientBoostingClassifier(random_state=42)

gradient_boosting_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__subsample': [0.8, 1.0],
    'model__min_samples_split': [2, 5, 10]
}

# 10-fold cross-validated grid search, scored by F1 on the positive class (label 1).
# 162 candidates x 10 folds = 1620 fits, so run them in parallel on all cores
# (same setting as the Random Forest search in this notebook).
grid_search = GridSearchCV(
    gradient_boosting_pipeline,
    param_grid=param_grid,
    cv=10,
    scoring=make_scorer(f1_score, pos_label=1),
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_features, train_labels)

print("Best Parameters:", grid_search.best_params_)

y_pred = grid_search.predict(test_features)

# Evaluation
print("Confusion Matrix:")
print(confusion_matrix(test_labels, y_pred))
print("\nClassification Report:")
print(classification_report(test_labels, y_pred))

# Persist the fitted, tuned pipeline. GridSearchCV fits clones of the estimator
# it is given, so `gradient_boosting_pipeline` itself is never fitted; dumping
# it would store a model that cannot predict after being reloaded.
gradient_boosting_filename = "models/gradient_boosting_model.pkl"
joblib.dump(grid_search.best_estimator_, gradient_boosting_filename)
print(f"Gradient Boosting model saved as {gradient_boosting_filename}")


Fitting 10 folds for each of 162 candidates, totalling 1620 fits


KeyboardInterrupt: 

## Random Forest

In [None]:
# Preprocessing: scale numericals, one-hot encode categoricals
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(drop='first'), categorical_columns)
    ]
)

model = RandomForestClassifier(random_state=42)

random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2']
}

# 10-fold cross-validated grid search, scored by F1 on the positive class
# (label 1), run in parallel on all cores.
grid_search = GridSearchCV(
    random_forest_pipeline,
    param_grid=param_grid,
    cv=10,
    scoring=make_scorer(f1_score, pos_label=1),
    verbose=1,
    n_jobs=-1
)

grid_search.fit(train_features, train_labels)

print("Best Parameters:", grid_search.best_params_)

y_pred = grid_search.predict(test_features)

# Evaluation
print("Confusion Matrix:")
print(confusion_matrix(test_labels, y_pred))
print("\nClassification Report:")
print(classification_report(test_labels, y_pred))

# Persist the fitted, tuned pipeline. GridSearchCV fits clones of the estimator
# it is given, so `random_forest_pipeline` itself is never fitted; dumping it
# would store a model that cannot predict after being reloaded.
random_forest_filename = "models/random_forest_model.pkl"
joblib.dump(grid_search.best_estimator_, random_forest_filename)
print(f"Random Forest model saved as {random_forest_filename}")


Fitting 10 folds for each of 216 candidates, totalling 2160 fits


## Stacking

In [None]:
# Base learners for the stack.
# NOTE(review): these are the pipeline objects as originally constructed, not
# the tuned `best_estimator_` of each grid search, so the stack is trained with
# each model's constructor hyperparameters — confirm this is intended.
base_estimators = [
    ('knn', knn_pipeline),
    ('naive_bayes', naive_bayes_pipeline),
    ('gradient_boosting', gradient_boosting_pipeline),
    ('random_forest', random_forest_pipeline),
    ('bagging', bagging_pipeline)
]

# Meta-learner that combines the base learners' predictions
final_model = LogisticRegression(max_iter=1000)

# Stacking classifier
stacking_pipeline = StackingClassifier(
    estimators=base_estimators,
    final_estimator=final_model,
    cv=10
)

stacking_pipeline.fit(train_features, train_labels)

y_pred_stacking = stacking_pipeline.predict(test_features)

# Evaluation (classification_report / confusion_matrix come from the import
# cell at the top of the notebook)
print("Confusion Matrix:")
print(confusion_matrix(test_labels, y_pred_stacking))
print("\nClassification Report:")
print(classification_report(test_labels, y_pred_stacking))

# `stacking_pipeline` is fitted directly above, so it can be dumped as-is
stacking_filename = "models/stacking_model.pkl"
joblib.dump(stacking_pipeline, stacking_filename)
print(f"Stacking model saved as {stacking_filename}")

## Assessing with ROC curve and Lift Curve

In [None]:
# ROC curves
def plot_roc_curve(y_true, y_probs, model_names):
    """Draw one ROC curve per model on a shared figure and show it.

    Parameters
    ----------
    y_true : true binary labels, shared by all models.
    y_probs : iterable of score arrays, one per model.
    model_names : iterable of legend names, paired positionally with ``y_probs``.
    """
    _, ax = plt.subplots(figsize=(10, 8))

    for scores, label in zip(y_probs, model_names):
        false_pos_rate, true_pos_rate, _ = roc_curve(y_true, scores)
        area = auc(false_pos_rate, true_pos_rate)
        ax.plot(false_pos_rate, true_pos_rate, lw=2, label=f'{label} (AUC = {area:.2f})')

    # Diagonal reference line: a classifier with no discriminative power
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--', label='Random Guess')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Receiver Operating Characteristic (ROC)')
    ax.legend(loc="lower right")
    ax.grid()
    plt.show()

# Lift Curves
def compute_lift_curve(y_true, y_prob):
    """Return the points of a cumulative lift curve.

    Samples are ranked by descending score. For the top ``k`` samples the lift
    is the share of all positives captured so far divided by the share of the
    data inspected (``k / n``); a random ranking therefore has lift 1, and the
    last point of every curve is exactly 1.

    Parameters
    ----------
    y_true : array-like of 0/1 labels (a categorical Series of 0/1 is accepted).
    y_prob : array-like of scores, same length as ``y_true``.

    Returns
    -------
    (cumulative_rate, lift) : tuple of 1-D float arrays of length ``n``.

    Raises
    ------
    ValueError
        If ``y_true`` contains no positive sample (lift is undefined).
    """
    # np.asarray turns a categorical Series into its plain values; a
    # categorical column itself does not support cumsum.
    labels = np.asarray(y_true).astype(float)
    scores = np.asarray(y_prob, dtype=float)

    total_positives = labels.sum()
    if total_positives == 0:
        raise ValueError("Lift is undefined when y_true contains no positive samples.")

    # Highest scores first; stable sort keeps ties in their original order
    order = np.argsort(-scores, kind="stable")
    cumulative_positive = np.cumsum(labels[order])
    cumulative_rate = np.arange(1, len(labels) + 1) / len(labels)

    # Share of positives captured, relative to the share of data inspected
    lift = (cumulative_positive / total_positives) / cumulative_rate
    return cumulative_rate, lift


def plot_lift_curve(y_true, y_probs, model_names):
    """Draw one cumulative lift curve per model on a shared figure and show it.

    Parameters
    ----------
    y_true : true 0/1 labels, shared by all models.
    y_probs : iterable of score arrays, one per model.
    model_names : iterable of legend names, paired positionally with ``y_probs``.
    """
    plt.figure(figsize=(10, 8))
    for y_prob, model_name in zip(y_probs, model_names):
        cumulative_rate, lift = compute_lift_curve(y_true, y_prob)
        plt.plot(cumulative_rate, lift, lw=2, label=f'{model_name}')

    # Lift of 1 = no better than ranking the samples at random
    plt.axhline(y=1, color='red', linestyle='--', label='Random Model')
    plt.xlabel('Cumulative Percentage of Data')
    plt.ylabel('Lift')
    plt.title('Lift Curve')
    plt.legend(loc="upper right")
    plt.grid()
    plt.show()

# Load saved models from the "models/" directory the training cells write to.
# NOTE: joblib.load unpickles arbitrary Python objects; only load files that
# were produced by this notebook.
models = {
    'Logistic Regression': joblib.load('models/logistic_regression_model.pkl'),
    'KNN': joblib.load('models/knn_model.pkl'),
    'Naive Bayes': joblib.load('models/naive_bayes_model.pkl'),
    'Bagging': joblib.load('models/bagging_model.pkl'),
    'Random Forest': joblib.load('models/random_forest_model.pkl'),
    'Gradient Boosting': joblib.load('models/gradient_boosting_model.pkl'),
    'Stacking': joblib.load('models/stacking_model.pkl'),
}

y_probs = []
model_names = []

for name, fitted_model in models.items():
    # Prefer positive-class probabilities; fall back to the decision score
    if hasattr(fitted_model, 'predict_proba'):
        scores = fitted_model.predict_proba(test_features)[:, 1]
    elif hasattr(fitted_model, 'decision_function'):
        scores = fitted_model.decision_function(test_features)
    else:
        # Without a score there is nothing to plot; skip the name as well so
        # y_probs and model_names stay aligned.
        print(f"Skipping {name}: no predict_proba or decision_function")
        continue
    y_probs.append(scores)
    model_names.append(name)

# Compute and visualize ROC curves
plot_roc_curve(test_labels, y_probs, model_names)

# Compute and visualize Lift curves
plot_lift_curve(test_labels, y_probs, model_names)


## Choosing the right threshold

In [None]:
# Predicted positive-class probabilities from the most recently fitted grid search.
# NOTE(review): `grid_search` is rebound by every model cell above, so this uses
# whichever search ran last in the kernel (presumably Random Forest) — confirm.
y_true = test_labels  # True labels
y_scores = grid_search.predict_proba(test_features)[:, 1]  # Positive-class probabilities

# Compute the ROC curve
fpr, tpr, thresholds = roc_curve(y_true, y_scores)

# Specificity (1 - FPR) and Sensitivity (TPR)
specificity = 1 - fpr
sensitivity = tpr

# Hard-coded cut-off highlighted on the plot; it is not computed from the data.
# NOTE(review): presumably chosen by eye near the curves' crossing — confirm.
optimal_threshold = 0.4

# Plot Specificity and Sensitivity vs Threshold.
# The first entry returned by roc_curve is a sentinel threshold above every
# score ("predict nothing as positive"), not a real probability; skip it so the
# x-axis stays within the range of the predicted probabilities.
plt.figure(figsize=(10, 6))
plt.plot(thresholds[1:], specificity[1:], label='Specificity', color='dodgerblue')
plt.plot(thresholds[1:], sensitivity[1:], label='Sensitivity', color='green')
plt.axvline(x=optimal_threshold, color='red', linestyle='--', linewidth=2, label='Optimal Threshold')
plt.text(optimal_threshold + 0.02, 0.5, 'Optimal Threshold', color='red', fontsize=12)

# Add titles and legend
plt.title("Performance Metrics vs Threshold", fontsize=14)
plt.xlabel("Threshold", fontsize=12)
plt.ylabel("Metrics", fontsize=12)
plt.legend(loc='lower right', fontsize=10)
plt.grid(True)
plt.show()
