# MODEL III : Decision Trees and Random forests

## Data

In [None]:
# Import libraries and modules

# Import the necessary libraries
import sys
import os

# Add the project directory to the sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

# Import everything from lib_import.py, evaluate.py, data_preprocessing.py 
from lib.lib_import import *
from src.data_preprocessing import *
from src.evaluate import *

# Import the data
from data.data_extract import load_data

In [None]:
# Load the dataset through the project's `load_data` helper

df_data = load_data()
# Preview the first 10 rows
df_data.head(10)

## Preprocessing

### Basic preprocessing

In [None]:
# Work on a copy so that the loaded `df_data` frame is left untouched
data = df_data.copy()
data.head(10)

In [None]:
# Print the per-column summary produced by DataFrame.info()
data.info()

In [None]:
# (rows, columns) before preprocessing
data.shape

In [None]:
# Basic preprocessing through the project's `preprocess` helper. Per the
# original note it fixes the target, removes unneeded columns and drops
# outliers (the helper's implementation is not visible in this notebook).
data = preprocess(data)
data.head(10)

In [None]:
# (rows, columns) after preprocessing
data.shape

### Separating dataset - train and test

In [None]:
# Separate the data into train and test frames.
# random_state=42 is forwarded to the helper, presumably as the split seed (TODO confirm).
df_train, df_test = seperate_train_test(data, random_state=42)

In [None]:
# Shapes of the train and test frames
df_train.shape, df_test.shape

In [None]:
# Share of each target class in the training data
train_class_counts = df_train['>50K'].value_counts()

plt.figure(figsize=(8, 4))
plt.pie(train_class_counts, autopct='%1.1f%%')
# NOTE(review): value_counts() orders classes by frequency, so these fixed
# labels assume '<=50K' is the majority class - confirm.
plt.legend(['<=50K', '>50K'], loc='upper right')
plt.title('Distribution of target variable in training data')
plt.show()

In [None]:
# Check the distribution of target variable in test data
plt.figure(figsize=(8, 4))
plt.pie(df_test['>50K'].value_counts(), autopct='%1.1f%%')
# NOTE(review): value_counts() orders classes by frequency, so these fixed
# labels assume '<=50K' is the majority class - confirm.
plt.legend(['<=50K', '>50K'], loc='upper right')
# The title previously said "training data" although this plot shows the test set.
plt.title('Distribution of target variable in test data')
plt.show()

### Impute the missing values

In [None]:
# Separate the categorical and numerical variables using the project helpers
# (both are computed from `data`, i.e. before the train/test split)
cat_features = get_cat_features(data)
cont_features = get_cont_features(data)

In [None]:
# missingno bar chart of the training frame, before imputation
ax = msno.bar(df_train)

In [None]:
# Imputers: mean for numerical features, most frequent value for categorical ones
imput_cont = SimpleImputer(missing_values=np.nan, strategy='mean')
imput_cat = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Numerical features are not imputed: per the original note none of them has
# missing values, so `imput_cont` is left unused in this cell.

# Fit on the training column only, then reuse the learned fill value on the test column
for col in cat_features:
    train_values = df_train[col].to_numpy().reshape(-1, 1)
    test_values = df_test[col].to_numpy().reshape(-1, 1)
    df_train[col] = imput_cat.fit_transform(train_values).ravel()
    df_test[col] = imput_cat.transform(test_values).ravel()

In [None]:
# Same missingno bar chart, after imputing the categorical features
ax = msno.bar(df_train)

## Regrouping features

### Values not referenced

In [None]:
# Relabel the '?' placeholder as 'Not referenced' in the three columns below,
# for both the train and the test frame
placeholder_map = {'?': 'Not referenced'}
for column in ('workclass', 'native-country', 'occupation'):
    df_train[column] = df_train[column].replace(placeholder_map)
    df_test[column] = df_test[column].replace(placeholder_map)

### Workclass

In [None]:
# Regroup workclass categories:
#   'Without-pay', 'Never-worked'       -> 'No revenu'
#   'Self-emp-not-inc', 'Self-emp-inc'  -> 'Self-emp'
workclass_groups = {
    'Without-pay': 'No revenu',
    'Never-worked': 'No revenu',
    'Self-emp-not-inc': 'Self-emp',
    'Self-emp-inc': 'Self-emp',
}
for frame in (df_train, df_test):
    frame['workclass'] = frame['workclass'].replace(workclass_groups)

In [None]:
# Spot-check: first 10 training rows now labelled 'Self-emp'
df_train[df_train['workclass'] == 'Self-emp'].head(10)

### Marital-status

In [None]:
# Regroup marital-status categories:
#   'Divorced', 'Separated', 'Widowed'          -> 'Now Single'
#   'Married-civ-spouse', 'Married-AF-spouse'   -> 'Married'
marital_groups = {
    'Divorced': 'Now Single',
    'Separated': 'Now Single',
    'Widowed': 'Now Single',
    'Married-civ-spouse': 'Married',
    'Married-AF-spouse': 'Married',
}
for frame in (df_train, df_test):
    frame['marital-status'] = frame['marital-status'].replace(marital_groups)

### Relationship

In [None]:
# 'Husband' and 'Wife' are merged into a single 'Married' category
spouse_roles = dict.fromkeys(['Husband', 'Wife'], 'Married')
for frame in (df_train, df_test):
    frame['relationship'] = frame['relationship'].replace(spouse_roles)

### Race

In [None]:
# Fold 'Amer-Indian-Eskimo' into the existing 'Other' label
race_groups = {'Amer-Indian-Eskimo': 'Other'}
for frame in (df_train, df_test):
    frame['race'] = frame['race'].replace(race_groups)

### Native-country

In [None]:
# Number of training rows per native country, leaving out 'United-States'
non_us_rows = df_train['native-country'] != 'United-States'
filtered = df_train[non_us_rows]
country_counts = filtered['native-country'].value_counts()

plt.figure(figsize=(20, 10))
country_counts.plot(kind='bar')
plt.title('Histogram of the native-country variable')
plt.show()


Every country other than the United States with fewer than 200 observations in the training set is grouped into the "Other" category.

In [None]:
# Group rare native countries under 'Other'.
#
# A country is kept only if it has at least MIN_COUNTRY_SAMPLES rows in the
# training data; 'United-States' is always kept. The kept set is computed from
# the training data only and then applied to both frames in a single pass.
#
# Unlike a per-country replace loop driven by the training values, this also
# maps countries that occur only in the test set to 'Other'. Otherwise those
# values would be labels unseen during fitting and make
# LabelEncoder.transform raise later on.
MIN_COUNTRY_SAMPLES = 200

country_counts = df_train['native-country'].value_counts()
kept_countries = set(country_counts[country_counts >= MIN_COUNTRY_SAMPLES].index)
kept_countries.add('United-States')

for frame in (df_train, df_test):
    is_kept = frame['native-country'].isin(kept_countries)
    # Series.where keeps the value where the condition holds, else uses 'Other'
    frame['native-country'] = frame['native-country'].where(is_kept, 'Other')
        

### New features

In [None]:
# Plot the distribution of the first eight categorical (object-typed) variables
df_train_categ = df_train.select_dtypes(include='object')


def _label_bar(axis, patch):
    """Write the height of `patch` just above it on `axis`."""
    bar_height = patch.get_height()
    axis.annotate(f'{bar_height:.0f}',
                  xy=(patch.get_x() + patch.get_width() / 2, bar_height),
                  xytext=(0, 5),  # vertical offset of 5 points
                  textcoords="offset points",
                  ha='center', va='bottom', fontsize=10, color='black')


plt.figure(figsize=(18, 18))
for i, feature in enumerate(df_train_categ.columns[:8]):
    ax = plt.subplot(4, 2, i + 1)
    hist = sns.histplot(df_train[feature], ax=ax)

    if i == 7:
        # Last subplot: label only the tallest bar, otherwise the plot is unreadable
        tallest = max(hist.patches, key=lambda bar: bar.get_height(), default=None)
        if tallest is not None and tallest.get_height() > 0:
            _label_bar(ax, tallest)
    else:
        # Other subplots: label every bar
        for bar in hist.patches:
            _label_bar(ax, bar)

    ax.set_xlabel(feature, fontsize=20)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment='right', fontsize=12)

plt.tight_layout()
plt.show()

## Standardization

### Df_train, Df_test

In [None]:
# First 10 training rows after regrouping
df_train.head(10)

In [None]:
# First 10 test rows after regrouping
df_test.head(10)

In [None]:
# Split each frame into the feature matrix X and the target vector y ('>50K')
target_col = '>50K'

X_train, y_train = df_train.drop(columns=target_col), df_train[target_col]
X_test, y_test = df_test.drop(columns=target_col), df_test[target_col]

In [None]:
# Shapes of the feature matrices and target vectors
X_train.shape, y_train.shape, X_test.shape, y_test.shape

### Separate Categorical and numerical variables

In [None]:
# Categorical features: every object-typed column plus 'education-num', which
# is integer-coded but handled as categorical (it is removed from the
# continuous features below).
# Index.append returns a new Index instead of modifying in place, so the
# result must be assigned back; otherwise 'education-num' is silently dropped.
cat_features = df_train.select_dtypes('object').columns
cat_features = cat_features.append(pd.Index(['education-num']))

In [None]:
# Continuous features: the int64 columns, minus 'education-num' (handled as
# categorical) and the '>50K' target
int_columns = df_train.select_dtypes('int64').columns
cont_features = int_columns.drop(['education-num', '>50K'])
cont_features

### Numerical variables

In [None]:
# Training features before scaling
X_train.head(10)

In [None]:
# Standard Scaler

scale_standard = StandardScaler()

# The label previously read "Categorical features" although the list printed
# here is the continuous one.
print('Continuous features : ', cont_features)
for feature in cont_features:

    # Fit the scaler on the training column only and standardize it.
    # The scaler output is flattened directly, which avoids the deprecated
    # Series.ravel() call.
    train_column = X_train[feature].to_numpy().reshape(-1, 1)
    X_train[feature] = scale_standard.fit_transform(train_column).ravel()

    # Apply the training statistics to the test column (no refit)
    test_column = X_test[feature].to_numpy().reshape(-1, 1)
    X_test[feature] = scale_standard.transform(test_column).ravel()


In [None]:
# Training features after scaling the continuous columns
X_train.head(10)

### Categorical variables

In [None]:
# Training features before encoding the categorical columns
X_train.head(10)

In [None]:
# Label Encoding

label_encoder = LabelEncoder()

X_train_LEncoder, X_test_LEncoder = X_train.copy(), X_test.copy()

for col in cat_features:
    # Learn the label -> integer mapping from the training column only
    label_encoder.fit(X_train_LEncoder[col])
    X_train_LEncoder[col] = label_encoder.transform(X_train_LEncoder[col])
    # NOTE: transform raises ValueError for a test label absent from training
    X_test_LEncoder[col] = label_encoder.transform(X_test_LEncoder[col])

In [None]:
# Label-encoded training features
X_train_LEncoder.head(10)

In [None]:
# One Hot Encoding

one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Encode each categorical feature separately (encoder refit per feature) and
# collect the resulting indicator frames; original row indexes are preserved.
train_blocks = []
test_blocks = []
for feature in cat_features:
    train_dummies = one_hot_encoder.fit_transform(X_train[[feature]])
    test_dummies = one_hot_encoder.transform(X_test[[feature]])

    # Names of the generated indicator columns for this feature
    dummy_names = one_hot_encoder.get_feature_names_out([feature])

    train_blocks.append(pd.DataFrame(train_dummies, columns=dummy_names, index=X_train.index))
    test_blocks.append(pd.DataFrame(test_dummies, columns=dummy_names, index=X_test.index))

# Remaining (non-categorical) columns first, then the indicator columns in feature order
categorical_columns = list(cat_features)
X_train_OHEncoder = pd.concat([X_train.drop(columns=categorical_columns), *train_blocks], axis=1)
X_test_OHEncoder = pd.concat([X_test.drop(columns=categorical_columns), *test_blocks], axis=1)

In [None]:
# One-hot encoded training features
X_train_OHEncoder.head(10)

# Modeling - Decision Trees

In the rest of this notebook we use X_train_OHEncoder, y_train and X_test_OHEncoder, y_test.

In [None]:
# Shapes of the model inputs and targets
X_train_OHEncoder.shape, y_train.shape, X_test_OHEncoder.shape, y_test.shape

In [None]:
# Decision Tree: fit on the one-hot encoded training set, then predict the test set
# (fit returns the fitted estimator, so construction and fitting are chained)
decision_tree = DecisionTreeClassifier(random_state=42).fit(X_train_OHEncoder, y_train)
y_pred = decision_tree.predict(X_test_OHEncoder)

# NOTE(review): `from src.evaluate import *` does not by itself bind the name
# `evaluate`; confirm it is provided elsewhere (e.g. by lib_import), otherwise
# call plot_confusion_matrix_sns directly.
evaluate.plot_confusion_matrix_sns(y_test, y_pred, "Decision Tree")


In [None]:
# Text report for the decision tree, followed by the confusion-matrix plot
c_matrix = confusion_matrix(y_test, y_pred)  # computed once, reused for print and plot
banner = '='*20

print(banner)
print('Decision Tree')
print(banner, '\n')

print("Matrice de confusion:")
print(c_matrix, '\n')  # print the confusion matrix
print("Rapport de classification:")
print(classification_report(y_test, y_pred), '\n')

accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Exactitude: %f' % accuracy_pct, '\n')

disp = ConfusionMatrixDisplay(confusion_matrix=c_matrix)
disp.plot()
plt.show()