In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
import os
notebook_path = os.path.abspath("data-training.ipynb")
features_path = os.path.join(os.path.dirname(notebook_path), "alt_acsincome_ca_features_85(1).csv")
labels_path = os.path.join(os.path.dirname(notebook_path), "alt_acsincome_ca_labels_85.csv")
dataset_path

'/home/luz/Documents/supervised-learning/alt_acsincome_ca_features_85(1).csv'

In [6]:
try:
    # Read dataset
    df = pd.read_csv(features_path)
    dl = pd.read_csv(labels_path)
except FileNotFoundError as e:
    print(f"Error : {e}")

In [7]:
# Affichage des aperçus des données
print("\nAperçu des données (features) :")
print(df.head())
print("\nAperçu des données (labels) :")
print(dl.head())


Aperçu des données (features) :
   AGEP  COW  SCHL  MAR    OCCP   POBP  RELP  WKHP  SEX  RAC1P
0  41.0  4.0  24.0  1.0  2555.0    6.0   1.0  60.0  2.0    1.0
1  77.0  7.0  22.0  1.0  4920.0   39.0   0.0  35.0  1.0    1.0
2  38.0  1.0  18.0  1.0   440.0    6.0   1.0  50.0  1.0    1.0
3  30.0  1.0  22.0  5.0  1555.0    6.0   2.0  80.0  1.0    6.0
4  36.0  1.0  16.0  1.0  4030.0  314.0   1.0  70.0  2.0    1.0

Aperçu des données (labels) :
   PINCP
0   True
1   True
2  False
3   True
4  False


In [8]:
# Informations générales
print("\nInformations générales sur les données (features) :")
print(df.info())
print("\nInformations générales sur les données (labels) :")
print(dl.info())


Informations générales sur les données (features) :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166315 entries, 0 to 166314
Data columns (total 10 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   AGEP    166315 non-null  float64
 1   COW     166315 non-null  float64
 2   SCHL    166315 non-null  float64
 3   MAR     166315 non-null  float64
 4   OCCP    166315 non-null  float64
 5   POBP    166315 non-null  float64
 6   RELP    166315 non-null  float64
 7   WKHP    166315 non-null  float64
 8   SEX     166315 non-null  float64
 9   RAC1P   166315 non-null  float64
dtypes: float64(10)
memory usage: 12.7 MB
None

Informations générales sur les données (labels) :
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166315 entries, 0 to 166314
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   PINCP   166315 non-null  bool 
dtypes: bool(1)
memory usage: 162.5 KB
None


In [9]:
# Statistiques descriptives
print("\nStatistiques descriptives (features) :")
print(df.describe())


Statistiques descriptives (features) :
                AGEP            COW           SCHL            MAR  \
count  166315.000000  166315.000000  166315.000000  166315.000000   
mean       42.736235       2.144551      18.470054       2.653633   
std        14.882790       1.888220       3.938362       1.846417   
min        17.000000       1.000000       1.000000       1.000000   
25%        30.000000       1.000000      16.000000       1.000000   
50%        42.000000       1.000000      19.000000       1.000000   
75%        55.000000       3.000000      21.000000       5.000000   
max        94.000000       8.000000      24.000000       5.000000   

                OCCP           POBP           RELP           WKHP  \
count  166315.000000  166315.000000  166315.000000  166315.000000   
mean     4019.729279      94.364718       2.506617      37.859255   
std      2638.167883     123.472067       4.443905      13.014087   
min        10.000000       1.000000       0.000000       1.000

In [10]:
# Vérification des valeurs manquantes
print("\nValeurs manquantes dans les features :")
print(df.isnull().sum())
print("\nValeurs manquantes dans les labels :")
print(dl.isnull().sum())


Valeurs manquantes dans les features :
AGEP     0
COW      0
SCHL     0
MAR      0
OCCP     0
POBP     0
RELP     0
WKHP     0
SEX      0
RAC1P    0
dtype: int64


In [21]:
# Split dataset in train and test
X = df.iloc[:,1:31]  # Features
y = df.iloc[:, 31]  # Cible (labels)

# Diviser en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [22]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
models = {
    'RandomForest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(algorithm="SAMME"),
    'GradientBoosting': GradientBoostingClassifier()
}

In [23]:
from sklearn.model_selection import cross_val_score, GridSearchCV
import numpy as np

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f'{name} - Accuracy: {np.mean(scores):.4f}')


RandomForest - Accuracy: 0.9828
AdaBoost - Accuracy: 0.9313
GradientBoosting - Accuracy: 0.9504


In [24]:
from sklearn.metrics import classification_report, confusion_matrix

# Generating classification_report, confusion_matrix
for name, model in models.items():
    # Training Model
    model.fit(X_train, y_train)
    
    # Predict Test Set
    y_pred = model.predict(X_test)
    
    # Printing classification_report
    print(f'\n{name} - Classification Report:')
    print(classification_report(y_test, y_pred))
    
    # Printing confusion_matrix
    print(f'{name} - Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))



RandomForest - Classification Report:
              precision    recall  f1-score   support

          -1       0.98      0.98      0.98      1964
           1       0.98      0.99      0.99      2458

    accuracy                           0.98      4422
   macro avg       0.98      0.98      0.98      4422
weighted avg       0.98      0.98      0.98      4422

RandomForest - Confusion Matrix:
[[1925   39]
 [  32 2426]]

AdaBoost - Classification Report:
              precision    recall  f1-score   support

          -1       0.93      0.91      0.92      1964
           1       0.93      0.94      0.94      2458

    accuracy                           0.93      4422
   macro avg       0.93      0.93      0.93      4422
weighted avg       0.93      0.93      0.93      4422

AdaBoost - Confusion Matrix:
[[1784  180]
 [ 140 2318]]

GradientBoosting - Classification Report:
              precision    recall  f1-score   support

          -1       0.95      0.93      0.94      1964
    

## Training model with just Address Bar based features

The paper that comes with the dataset being used specifies 4 different types of features. The first type is Address Bar based features. Since those are the features more easily obtained from a website, we'll train the model just with them to see if those are sufficient. Then we'll do the same with the other type of features in order to optimize our future extension, less features implies less time of data collecting and processing.

In [25]:
# Split dataset in train and test
X = df.iloc[:,1:13]  # Address Bar based features
y = df.iloc[:, 31]  # labels

# train and test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [26]:
from sklearn.model_selection import cross_val_score, GridSearchCV
import numpy as np

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f'{name} - Accuracy: {np.mean(scores):.4f}')

RandomForest - Accuracy: 0.9123
AdaBoost - Accuracy: 0.8989
GradientBoosting - Accuracy: 0.9020


## Training model with Abnormal based features and HTML/Javascript based features

In [34]:
# Split dataset in train and test
X = df.iloc[:,13:24]  # Address Bar based features
y = df.iloc[:, 31]  # labels

# train and test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [28]:
from sklearn.model_selection import cross_val_score, GridSearchCV
import numpy as np

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f'{name} - Accuracy: {np.mean(scores):.4f}')

RandomForest - Accuracy: 0.8813
AdaBoost - Accuracy: 0.8696
GradientBoosting - Accuracy: 0.8750


## Training model with Domain based features

In [54]:
X = df.iloc[:,24:31]  # Address Bar based features
y = df.iloc[:, 31]  # labels

# train and test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [55]:
from sklearn.model_selection import cross_val_score, GridSearchCV
import numpy as np

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f'{name} - Accuracy: {np.mean(scores):.4f}')

RandomForest - Accuracy: 0.7478
AdaBoost - Accuracy: 0.7071
GradientBoosting - Accuracy: 0.7358


In [59]:
columns_1_to_12 = df.iloc[:, 1:13]

# Selecciona las columnas de la 22 a la 30
columns_22_to_30 = df.iloc[:, 22:31]

# Combina ambas selecciones en un único DataFrame
selected_columns = pd.concat([columns_1_to_12, columns_22_to_30], axis=1)

X = selected_columns  # Address Bar based features
y = df.iloc[:, 31]  # labels

# train and test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)


In [60]:
from sklearn.model_selection import cross_val_score, GridSearchCV
import numpy as np

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f'{name} - Accuracy: {np.mean(scores):.4f}')

RandomForest - Accuracy: 0.9581
AdaBoost - Accuracy: 0.9051
GradientBoosting - Accuracy: 0.9220


In [63]:
columns_1_to_12 = df.iloc[:, 1:13]

# Selecciona las columnas de la 13 a la 22
columns_13_to_22 = df.iloc[:, 13:23]

# Combina ambas selecciones en un único DataFrame
selected_columns = pd.concat([columns_1_to_12, columns_13_to_22], axis=1)

X = selected_columns  # Address Bar based features
y = df.iloc[:, 31]  # labels

# train and test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [64]:
from sklearn.model_selection import cross_val_score, GridSearchCV
import numpy as np

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f'{name} - Accuracy: {np.mean(scores):.4f}')

RandomForest - Accuracy: 0.9580
AdaBoost - Accuracy: 0.9258
GradientBoosting - Accuracy: 0.9374


In [65]:
columns_13_to_22 = df.iloc[:, 13:23]

# Selecciona las columnas de la 13 a la 22
columns_23_to_30 = df.iloc[:, 23:31]

# Combina ambas selecciones en un único DataFrame
selected_columns = pd.concat([columns_13_to_22, columns_23_to_30], axis=1)

X = selected_columns  # Address Bar based features
y = df.iloc[:, 31]  # labels

# train and test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [66]:
from sklearn.model_selection import cross_val_score, GridSearchCV
import numpy as np

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(f'{name} - Accuracy: {np.mean(scores):.4f}')

RandomForest - Accuracy: 0.9388
AdaBoost - Accuracy: 0.8802
GradientBoosting - Accuracy: 0.9004
