In [28]:
# Modelli
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier, StackingClassifier

# Selezione e valutazione del modello
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score, confusion_matrix, classification_report

# Preprocessing e trasformazioni
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

# Librerie di supporto
import numpy as np
import pandas as pd

In [2]:
# Load the pre-split feature and label tables from the processed-data folder
train_features, train_labels = (
    pd.read_csv("../data/processed/train_features.csv"),
    pd.read_csv("../data/processed/train_labels.csv"),
)
test_features, test_labels = (
    pd.read_csv("../data/processed/test_features.csv"),
    pd.read_csv("../data/processed/test_labels.csv"),
)

In [3]:
# Cast the "income" target column to the pandas categorical dtype in both label
# tables (train first, then test), printing each table's summary right after
# its own conversion.
for labels_frame in (train_labels, test_labels):
    labels_frame["income"] = pd.Categorical(labels_frame["income"])
    labels_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24129 entries, 0 to 24128
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   income  24129 non-null  category
dtypes: category(1)
memory usage: 23.8 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6033 entries, 0 to 6032
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   income  6033 non-null   category
dtypes: category(1)
memory usage: 6.1 KB


In [4]:
# Cast every object-dtype (string) feature column to the pandas categorical
# dtype, first in the training table and then in the test table. Each table's
# summary is printed as soon as its own columns have been converted.
for features_frame in (train_features, test_features):
    # Columns are detected per table, so `categorical_columns` ends up holding
    # the object columns found in the test table.
    categorical_columns = features_frame.select_dtypes(include='object').columns

    for column_name in categorical_columns:
        features_frame[column_name] = pd.Categorical(features_frame[column_name])

    features_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24129 entries, 0 to 24128
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   age                     24129 non-null  int64   
 1   workclass               24129 non-null  category
 2   fnlwgt                  24129 non-null  int64   
 3   education.num           24129 non-null  int64   
 4   marital.status          24129 non-null  category
 5   occupation              24129 non-null  category
 6   relationship            24129 non-null  category
 7   race                    24129 non-null  category
 8   sex                     24129 non-null  category
 9   capital.gain            24129 non-null  int64   
 10  capital.loss            24129 non-null  int64   
 11  hours.per.week          24129 non-null  int64   
 12  native.country.grouped  24129 non-null  category
 13  education.grouped       24129 non-null  category
dtypes: category(8), int64(

In [5]:
# Collapse the one-column label tables into one-dimensional objects
train_labels, test_labels = train_labels.squeeze(), test_labels.squeeze()

In [19]:
# Identify categorical and numerical columns for training models.
# The string columns were cast to the pandas 'category' dtype in an earlier
# cell, so 'category' has to be selected as well: asking for 'object' alone
# would match nothing and leave `categorical_columns` empty.
categorical_columns = train_features.select_dtypes(include=['object', 'category']).columns
numerical_columns = train_features.select_dtypes(include=['int64', 'float64']).columns

In [12]:
# Custom filter that removes collinear variables
class CollinearityRemover(BaseEstimator, TransformerMixin):
    """Drop numeric columns that are highly correlated with an earlier column.

    During ``fit`` the absolute Pearson correlation between every pair of
    columns is computed. For each pair whose absolute correlation exceeds
    ``threshold``, the column with the larger index is marked for removal,
    so the first column of a correlated group is always kept.

    Parameters
    ----------
    threshold : float, default=0.9
        Absolute-correlation value above which the later column of a pair
        is removed.

    Attributes
    ----------
    to_remove_ : ndarray of int
        Sorted, de-duplicated indices of the columns dropped by ``transform``.
        Set by ``fit``.
    """

    def __init__(self, threshold=0.9):
        # Only hyper-parameters are stored here; the fitted attribute
        # `to_remove_` is created by `fit`, following the scikit-learn
        # estimator convention.
        self.threshold = threshold

    def fit(self, X, y=None):
        """Find the columns to drop.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric data; columns are treated as variables.
        y : ignored
            Present for pipeline compatibility.

        Returns
        -------
        self
        """
        # Constant columns produce NaN correlations. NaN never compares
        # greater than the threshold, so such columns are simply never
        # flagged; the floating-point warnings are silenced as noise.
        with np.errstate(divide='ignore', invalid='ignore'):
            # atleast_2d: with a single column corrcoef returns a 0-d value,
            # which np.triu cannot handle.
            corr_matrix = np.atleast_2d(np.corrcoef(np.asarray(X), rowvar=False))

        # Keep only the strict upper triangle so each pair is examined once
        # and the column with the higher index is the one reported.
        upper_triangle = np.triu(np.abs(corr_matrix), k=1)

        # A column can exceed the threshold against several earlier columns;
        # np.unique collapses those repeats.
        self.to_remove_ = np.unique(np.where(upper_triangle > self.threshold)[1])
        return self

    def transform(self, X):
        """Return ``X`` without the columns selected during ``fit``.

        If the transformer has not been fitted, ``X`` is returned unchanged.
        """
        to_remove = getattr(self, 'to_remove_', None)
        if to_remove is None:
            return X
        return np.delete(X, to_remove, axis=1)

# Names of the numeric and of the categorical feature columns
numerical_columns = [
    'age', 'fnlwgt', 'education.num',
    'capital.gain', 'capital.loss', 'hours.per.week',
]
categorical_columns = [
    'workclass', 'marital.status', 'occupation', 'relationship',
    'race', 'sex', 'native.country.grouped', 'education.grouped',
]

# Numeric branch: standardise, then drop columns flagged as collinear
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('collinearity', CollinearityRemover(threshold=0.9)),
])

# Full preprocessing: the numeric branch above for the numeric columns and
# one-hot encoding (first level dropped) for the categorical columns
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_columns),
    ('cat', OneHotEncoder(drop='first'), categorical_columns),
])

## Logistic Regression

In [13]:
# Logistic regression classifier with an iteration cap of 1000
model = LogisticRegression(max_iter=1000)

# Pipeline: preprocessing -> low-variance feature filter -> classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('var_thresh', VarianceThreshold(threshold=0.01)),
    ('model', model),
])

# Candidate values for the regularisation parameter C
param_grid = {'model__C': [0.01, 0.1, 1, 10, 100]}

# 10-fold cross-validated grid search, scored on the F1 of the positive class (label 1)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=make_scorer(f1_score, pos_label=1),
    cv=10,
    verbose=1,
)
grid_search.fit(train_features, train_labels)

# Report the winning parameter combination
print("Best Parameters:", grid_search.best_params_)

# Predict on the test set with the best estimator found by the search
y_pred = grid_search.predict(test_features)

# Confusion matrix and classification report on the test set
print("Confusion Matrix:", confusion_matrix(test_labels, y_pred), sep="\n")
print("\nClassification Report:", classification_report(test_labels, y_pred), sep="\n")


Fitting 10 folds for each of 5 candidates, totalling 50 fits
Best Parameters: {'model__C': 1}
Confusion Matrix:
[[4174  359]
 [ 601  899]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.92      0.90      4533
           1       0.71      0.60      0.65      1500

    accuracy                           0.84      6033
   macro avg       0.79      0.76      0.77      6033
weighted avg       0.83      0.84      0.84      6033



## KNN

In [14]:
# k-nearest-neighbours classifier
model = KNeighborsClassifier()

# Pipeline: preprocessing -> classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Search space: neighbourhood size, neighbour weighting scheme and
# Minkowski power (1 = Manhattan, 2 = Euclidean)
param_grid = {
    'model__n_neighbors': [3, 5, 7, 9, 11],
    'model__weights': ['uniform', 'distance'],
    'model__p': [1, 2],
}

# 10-fold cross-validated grid search, scored on the F1 of the positive class (label 1)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=make_scorer(f1_score, pos_label=1),
    cv=10,
    verbose=1,
)
grid_search.fit(train_features, train_labels)

# Report the winning parameter combination
print("Best Parameters:", grid_search.best_params_)

# Predict on the test set with the best estimator found by the search
y_pred = grid_search.predict(test_features)

# Confusion matrix and classification report on the test set
print("Confusion Matrix:", confusion_matrix(test_labels, y_pred), sep="\n")
print("\nClassification Report:", classification_report(test_labels, y_pred), sep="\n")


Fitting 10 folds for each of 20 candidates, totalling 200 fits
Best Parameters: {'model__n_neighbors': 9, 'model__p': 2, 'model__weights': 'uniform'}
Confusion Matrix:
[[4103  430]
 [ 603  897]]

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.91      0.89      4533
           1       0.68      0.60      0.63      1500

    accuracy                           0.83      6033
   macro avg       0.77      0.75      0.76      6033
weighted avg       0.82      0.83      0.83      6033



## Naive Bayes

In [17]:
# Numeric branch: standardise, then drop columns flagged as collinear
numerical_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('collinearity', CollinearityRemover(threshold=0.9)),
])

# Categorical branch: one-hot encode (first level dropped) and emit a dense
# array instead of a sparse matrix
categorical_pipeline = Pipeline([
    ('encoder', OneHotEncoder(drop='first', sparse_output=False)),
])

# Combine both branches column-wise
preprocessor = ColumnTransformer([
    ('num', numerical_pipeline, numerical_columns),
    ('cat', categorical_pipeline, categorical_columns),
])

# Gaussian Naive Bayes classifier
model = GaussianNB()

# Pipeline: preprocessing -> low-variance feature filter -> classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('var_thresh', VarianceThreshold(threshold=0.01)),
    ('model', model),
])

# Candidate values for the variance-smoothing term (the original note likens
# this parameter to Laplace smoothing)
param_grid = {'model__var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]}

# 10-fold cross-validated grid search, scored on the F1 of the positive class (label 1)
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=make_scorer(f1_score, pos_label=1),
    cv=10,
    verbose=1,
)
grid_search.fit(train_features, train_labels)

# Report the winning parameter combination
print("Best Parameters:", grid_search.best_params_)

# Predict on the test set with the best estimator found by the search
y_pred = grid_search.predict(test_features)

# Confusion matrix and classification report on the test set
print("Confusion Matrix:", confusion_matrix(test_labels, y_pred), sep="\n")
print("\nClassification Report:", classification_report(test_labels, y_pred), sep="\n")


Fitting 10 folds for each of 4 candidates, totalling 40 fits
Best Parameters: {'model__var_smoothing': 1e-06}
Confusion Matrix:
[[3294 1239]
 [ 246 1254]]

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.73      0.82      4533
           1       0.50      0.84      0.63      1500

    accuracy                           0.75      6033
   macro avg       0.72      0.78      0.72      6033
weighted avg       0.82      0.75      0.77      6033



## Bagging

In [21]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (first level dropped)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(drop='first'), categorical_columns)
    ]
)

# Base learner: a single decision tree
base_model = DecisionTreeClassifier(random_state=42)

# Bagging ensemble built on top of the base learner
bagging_model = BaggingClassifier(
    estimator=base_model,
    random_state=42
)

# Full pipeline: preprocessing -> bagging classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', bagging_model)
])

# Hyper-parameter grid
param_grid = {
    'model__n_estimators': [100, 250, 500],  # number of trees
    'model__max_samples': [0.5, 0.75, 1.0],  # fraction of samples drawn per tree
    'model__max_features': [0.5, 0.75, 1.0],  # fraction of features drawn per tree
    'model__bootstrap': [True, False],  # draw samples with replacement
    'model__bootstrap_features': [True, False]  # draw features with replacement
}

# 10-fold cross-validated grid search, scored on the F1 of the positive class.
# The grid yields 108 candidates x 10 folds = 1080 fits of ensembles with up to
# 500 trees each, so the fits are spread over all available cores (as in the
# Random Forest cell) instead of running one after another.
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=10,
    scoring=make_scorer(f1_score, pos_label=1),
    verbose=1,
    n_jobs=-1
)

# Fit the search on the training data
grid_search.fit(train_features, train_labels)

# Report the winning parameter combination
print("Best Parameters:", grid_search.best_params_)

# Predict on the test set with the best estimator found by the search
y_pred = grid_search.predict(test_features)

# Confusion matrix and classification report on the test set
print("Confusion Matrix:")
print(confusion_matrix(test_labels, y_pred))
print("\nClassification Report:")
print(classification_report(test_labels, y_pred))

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


KeyboardInterrupt: 

## Gradient boosting

In [24]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (first level dropped)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(drop='first'), categorical_columns)
    ]
)

# Gradient boosting classifier
model = GradientBoostingClassifier(random_state=42)

# Full pipeline: preprocessing -> classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# Hyper-parameter grid
param_grid = {
    'model__n_estimators': [100, 200, 300],  # number of boosting stages
    'model__learning_rate': [0.01, 0.1, 0.2],  # learning rate
    'model__max_depth': [3, 5, 7],  # maximum depth of each tree
    'model__subsample': [0.8, 1.0],  # fraction of samples used per tree
    'model__min_samples_split': [2, 5, 10]  # minimum samples needed to split a node
}

# 10-fold cross-validated grid search, scored on the F1 of the positive class.
# The grid yields 162 candidates x 10 folds = 1620 fits, so the fits are spread
# over all available cores (as in the Random Forest cell) instead of running
# one after another.
grid_search = GridSearchCV(
    pipeline,
    param_grid=param_grid,
    cv=10,
    scoring=make_scorer(f1_score, pos_label=1),
    verbose=1,
    n_jobs=-1
)

# Fit the search on the training data
grid_search.fit(train_features, train_labels)

# Report the winning parameter combination
print("Best Parameters:", grid_search.best_params_)

# Predict on the test set with the best estimator found by the search
y_pred = grid_search.predict(test_features)

# Confusion matrix and classification report on the test set
print("Confusion Matrix:")
print(confusion_matrix(test_labels, y_pred))
print("\nClassification Report:")
print(classification_report(test_labels, y_pred))


Fitting 10 folds for each of 162 candidates, totalling 1620 fits


KeyboardInterrupt: 

## Random Forest

In [None]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones (first level dropped)
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_columns),
    ('cat', OneHotEncoder(drop='first'), categorical_columns),
])

# Random forest classifier
model = RandomForestClassifier(random_state=42)

# Pipeline: preprocessing -> classifier
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Search space: forest size, tree depth limit, minimum samples to split a
# node, minimum samples per leaf, and features considered at each split
param_grid = {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [None, 10, 20, 30],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 4],
    'model__max_features': ['sqrt', 'log2'],
}

# 10-fold cross-validated grid search, scored on the F1 of the positive class
# (label 1) and run on all available cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=make_scorer(f1_score, pos_label=1),
    cv=10,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(train_features, train_labels)

# Report the winning parameter combination
print("Best Parameters:", grid_search.best_params_)

# Predict on the test set with the best estimator found by the search
y_pred = grid_search.predict(test_features)

# Confusion matrix and classification report on the test set
print("Confusion Matrix:", confusion_matrix(test_labels, y_pred), sep="\n")
print("\nClassification Report:", classification_report(test_labels, y_pred), sep="\n")

## Stacking

In [None]:
# The base learners are built here as self-contained pipelines. The model
# cells above all reuse the names `model` / `pipeline`, so no per-model
# pipeline object exists that this cell could refer to.

def _make_preprocessor(remove_collinear=False, dense=False):
    """Return a fresh, unfitted ColumnTransformer for one base learner.

    Parameters
    ----------
    remove_collinear : bool, default=False
        Append a CollinearityRemover(threshold=0.9) after the scaler in the
        numeric branch.
    dense : bool, default=False
        Make the one-hot encoder emit a dense array instead of a sparse matrix.
    """
    numeric_steps = [('scaler', StandardScaler())]
    if remove_collinear:
        numeric_steps.append(('collinearity', CollinearityRemover(threshold=0.9)))
    return ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=numeric_steps), numerical_columns),
            ('cat', OneHotEncoder(drop='first', sparse_output=not dense), categorical_columns)
        ]
    )

# KNN and Naive Bayes use the best parameters reported by their grid searches
# earlier in the notebook. The bagging and gradient boosting searches were
# interrupted and the random forest search has no recorded result, so those
# three keep the library defaults with a fixed seed.
knn_pipeline = Pipeline(steps=[
    ('preprocessor', _make_preprocessor(remove_collinear=True)),
    ('model', KNeighborsClassifier(n_neighbors=9, p=2, weights='uniform'))
])

naive_bayes_pipeline = Pipeline(steps=[
    # GaussianNB is fed a dense matrix, as in the Naive Bayes cell
    ('preprocessor', _make_preprocessor(remove_collinear=True, dense=True)),
    ('var_thresh', VarianceThreshold(threshold=0.01)),
    ('model', GaussianNB(var_smoothing=1e-6))
])

gradient_boosting_pipeline = Pipeline(steps=[
    ('preprocessor', _make_preprocessor()),
    ('model', GradientBoostingClassifier(random_state=42))
])

random_forest_pipeline = Pipeline(steps=[
    ('preprocessor', _make_preprocessor()),
    ('model', RandomForestClassifier(random_state=42))
])

bagging_pipeline = Pipeline(steps=[
    ('preprocessor', _make_preprocessor()),
    ('model', BaggingClassifier(
        estimator=DecisionTreeClassifier(random_state=42),
        random_state=42
    ))
])

base_estimators = [
    ('knn', knn_pipeline),
    ('naive_bayes', naive_bayes_pipeline),
    ('gradient_boosting', gradient_boosting_pipeline),
    ('random_forest', random_forest_pipeline),
    ('bagging', bagging_pipeline)
]

# Final model (meta-learner) for stacking
final_model = LogisticRegression(max_iter=1000)

# Stacking classifier
stacking_pipeline = StackingClassifier(
    estimators=base_estimators,
    final_estimator=final_model,
    cv=10  # Cross-validation folds for blending
)

# Fit the stacking classifier on training data
stacking_pipeline.fit(train_features, train_labels)

# Make predictions on test data
y_pred_stacking = stacking_pipeline.predict(test_features)

# Evaluate the performance of the stacking model
# (confusion_matrix and classification_report come from the notebook's import cell)
print("Confusion Matrix:")
print(confusion_matrix(test_labels, y_pred_stacking))
print("\nClassification Report:")
print(classification_report(test_labels, y_pred_stacking))