In [None]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
import scipy.stats as stats 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the project dataset from a CSV file (path relative to the working
# directory) and display its first five rows as the cell output.
df = pd.read_csv('projet_finale_vf2.csv')
df.head(5)

In [None]:
# List the column labels of the freshly loaded dataframe.
print(df.columns)

In [None]:
# Remove the 'Unnamed: 0' column (presumably the row index written by an
# earlier DataFrame.to_csv call -- TODO confirm against the CSV producer).
# `df` is rebound from itself, so without errors='ignore' a second execution
# of this cell would raise KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

In [None]:
# Textual overview of the dataframe: shape, describe() over all columns,
# total count of missing values (with a per-column breakdown only when at
# least one value is missing), and the first five rows.
print('nombre de lignes & de colonnes', df.shape, sep='\n')
print('***********************************************')
print('descriptif du dataframe', df.describe(include='all'), sep='\n')
print('***********************************************')
print('nombre de valeurs null', df.isna().sum().sum(), sep='\n')
if df.isna().to_numpy().any():
    print('nombre de valeur null par colonne', df.isna().sum(), sep='\n')
print('***********************************************')
print('aperçu des 5 premières lignes', df.head(), sep='\n')

In [None]:
# Drop the two columns that none of the models below use.
# - `columns=` already selects the axis, so the redundant `axis=1` is removed.
# - errors='ignore' keeps the cell re-runnable: `df` is rebound from itself,
#   so a second execution would otherwise raise KeyError on the columns that
#   the first execution already removed.
df = df.drop(columns=['departement', 'revenu_moyen'], errors='ignore')
print(df.head())

In [None]:
# List the column labels that remain after the drops above.
print(df.columns)

### caractéristiques TreeRegressor modèle V4
#max_depth=5,          # profondeur maximale de l’arbre
#min_samples_split=5, # min d’échantillons pour un split
#min_samples_leaf=1 

##PREPROCESSING - Modèle Socio-eco (mse1)

In [None]:
# --- MSE1: feature / target selection ---
features_list_mse1 = [
    'tx_pauvrete',
    'revenu_median',
    'tx_chomage',
    'tx_urbanisation',
    'densite_2018_(hab/km²)',
    'tx_scolarisation_pop',
]

print('Splitting dataset into X and Y...')
X_mse1 = df[features_list_mse1]
y_mse1 = df['tx_crim_pour_100 M_hab']
print('...Done.')

# --- 80/20 train/test split with a fixed seed ---
print('Splitting dataset into train test and test...')
(X_mse1_train, X_mse1_test,
 y_mse1_train, y_mse1_test) = train_test_split(
    X_mse1, y_mse1, test_size=0.2, random_state=0
)
print('...Done.')

# --- Preprocessing ---
print('---Training pipeline---')
print("#### X_train BEFORE preprocessing ####")
print(X_mse1_train.head())
print()
print("Encoding categorical features and standardizing numerical features...")

# Every selected column goes through a StandardScaler (no categorical encoder
# is configured). The scaler is fitted on the training split only and then
# applied unchanged to the test split.
numeric_transformer = StandardScaler()
feature_encoder = ColumnTransformer(
    [('num', numeric_transformer, features_list_mse1)]
)
X_mse1_train = feature_encoder.fit_transform(X_mse1_train)
X_mse1_test = feature_encoder.transform(X_mse1_test)

# Sanity check: first five transformed rows
print('...Done.')
print("#### X_mse_train AFTER preprocessing ####")
print(X_mse1_train[:5])



#BUILD MODEL(mse1)

In [None]:
# Fit a depth-limited regression tree on the scaled MSE1 training features.
print("Training model...")
regressor = DecisionTreeRegressor(
    max_depth=5,          # maximum depth of the tree
    min_samples_split=5,  # minimum number of samples needed to split a node
    min_samples_leaf=1,   # minimum number of samples in a leaf
    random_state=0,
)
regressor.fit(X_mse1_train, y_mse1_train)
print("...Done.")

# Training-set predictions, then a preview of the first five
# (actual, predicted) pairs.
print("Predictions on train set...")
y_mse1_train_pred = regressor.predict(X_mse1_train)
print("...Done.")
print()
for vrai, pred in zip(y_mse1_train.iloc[:5], y_mse1_train_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")

# Test-set predictions with the same five-row preview.
print("Predictions on test set...")
y_mse1_test_pred = regressor.predict(X_mse1_test)
print("...Done.")
for vrai, pred in zip(y_mse1_test.iloc[:5], y_mse1_test_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")



#EVALUATE MODEL(mse1)

In [None]:
# Performance assessment (MSE1).
# R² is computed from the predictions stored when the MSE1 tree was fitted,
# not through the shared `regressor` variable: every later model section
# rebinds `regressor`, so re-running this cell after another model has been
# trained would score the wrong tree (silently, when the feature counts match).
print("--- Assessing the performances of the model ---")

r2_train_mse1 = r2_score(y_mse1_train, y_mse1_train_pred)
r2_test_mse1 = r2_score(y_mse1_test, y_mse1_test_pred)

# Print R^2 scores
print("R2 score on training set : ", r2_train_mse1)
print("R2 score on test set : ", r2_test_mse1)


#Features Importance(mse1)

In [None]:
# Feature importances of the tree currently bound to `regressor`, labelled
# with the MSE1 feature names and sorted from most to least important.
importance_mse1 = pd.Series(
    regressor.feature_importances_, index=features_list_mse1
).sort_values(ascending=False)
print("Importance des features :")
print(importance_mse1)

# Long-format copy (Feature, Importance, Model) used by the cross-model
# comparison at the end of the notebook.
df_imp_mse1 = (
    importance_mse1
    .rename_axis('Feature')
    .reset_index(name='Importance')
    .assign(Model='MSE1')
)


In [None]:
# Horizontal bar chart of the MSE1 feature importances.
importance_mse1.plot.barh(title="Importance des features")
plt.show()


##PREPROCESSING - Modèle Socio-eco (mse2) 

In [None]:
# --- MSE2: feature / target selection ---
features_list_mse2 = [
    'tx_pauvrete',
    'revenu_median',
    'tx_chomage',
    'tx_urbanisation',
    'densite_2018_(hab/km²)',
    'tx_pop_sans_dipl',
]

print('Splitting dataset into X and Y...')
X_mse2 = df[features_list_mse2]
y_mse2 = df['tx_crim_pour_100 M_hab']
print('...Done.')

# --- 80/20 train/test split with a fixed seed ---
print('Splitting dataset into train test and test...')
(X_mse2_train, X_mse2_test,
 y_mse2_train, y_mse2_test) = train_test_split(
    X_mse2, y_mse2, test_size=0.2, random_state=0
)
print('...Done.')

# --- Preprocessing ---
print('---Training pipeline---')
print("#### X_train BEFORE preprocessing ####")
print(X_mse2_train.head())
print()
print("Encoding categorical features and standardizing numerical features...")

# Every selected column goes through a StandardScaler (no categorical encoder
# is configured). The scaler is fitted on the training split only and then
# applied unchanged to the test split.
numeric_transformer = StandardScaler()
feature_encoder = ColumnTransformer(
    [('num', numeric_transformer, features_list_mse2)]
)
X_mse2_train = feature_encoder.fit_transform(X_mse2_train)
X_mse2_test = feature_encoder.transform(X_mse2_test)

# Sanity check: first five transformed rows
print('...Done.')
print("#### X_mse_train AFTER preprocessing ####")
print(X_mse2_train[:5])



#BUILD MODEL(mse2)

In [None]:
# Fit a depth-limited regression tree on the scaled MSE2 training features.
print("Training model...")
regressor = DecisionTreeRegressor(
    max_depth=5,          # maximum depth of the tree
    min_samples_split=5,  # minimum number of samples needed to split a node
    min_samples_leaf=1,   # minimum number of samples in a leaf
    random_state=0,
)
regressor.fit(X_mse2_train, y_mse2_train)
print("...Done.")

# Training-set predictions, then a preview of the first five
# (actual, predicted) pairs.
print("Predictions on train set...")
y_mse2_train_pred = regressor.predict(X_mse2_train)
print("...Done.")
print()
for vrai, pred in zip(y_mse2_train.iloc[:5], y_mse2_train_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")

# Test-set predictions with the same five-row preview.
print("Predictions on test set...")
y_mse2_test_pred = regressor.predict(X_mse2_test)
print("...Done.")
for vrai, pred in zip(y_mse2_test.iloc[:5], y_mse2_test_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")



#EVALUATE MODEL(mse2)

In [None]:
# Performance assessment (MSE2).
# R² is computed from the predictions stored when the MSE2 tree was fitted,
# not through the shared `regressor` variable: every other model section
# rebinds `regressor`, so re-running this cell out of order would score the
# wrong tree (silently, when the feature counts match).
print("--- Assessing the performances of the model ---")

r2_train_mse2 = r2_score(y_mse2_train, y_mse2_train_pred)
r2_test_mse2 = r2_score(y_mse2_test, y_mse2_test_pred)

# Print R^2 scores
print("R2 score on training set : ", r2_train_mse2)
print("R2 score on test set : ", r2_test_mse2)


#Features Importance(mse2)

In [None]:
# Feature importances of the tree currently bound to `regressor`, labelled
# with the MSE2 feature names and sorted from most to least important.
importance_mse2 = pd.Series(
    regressor.feature_importances_, index=features_list_mse2
).sort_values(ascending=False)
print("Importance des features :")
print(importance_mse2)

# Long-format copy (Feature, Importance, Model) used by the cross-model
# comparison at the end of the notebook.
df_imp_mse2 = (
    importance_mse2
    .rename_axis('Feature')
    .reset_index(name='Importance')
    .assign(Model='MSE2')
)



In [None]:
# Horizontal bar chart of the MSE2 feature importances.
importance_mse2.plot.barh(title="Importance des features")
plt.show()


##PREPROCESSING - Modèle Socio (ms1)

In [None]:
# --- MS1: feature / target selection ---
features_list_ms1 = [
    'tx_urbanisation',
    'densite_2018_(hab/km²)',
    'tx_scolarisation_pop',
]

print('Splitting dataset into X and Y...')
X_ms1 = df[features_list_ms1]
y_ms1 = df['tx_crim_pour_100 M_hab']
print('...Done.')

# --- 80/20 train/test split with a fixed seed ---
print('Splitting dataset into train test and test...')
(X_ms1_train, X_ms1_test,
 y_ms1_train, y_ms1_test) = train_test_split(
    X_ms1, y_ms1, test_size=0.2, random_state=0
)
print('...Done.')

# --- Preprocessing ---
print('---Training pipeline---')
print("#### X_train BEFORE preprocessing ####")
print(X_ms1_train.head())
print()
print("Encoding categorical features and standardizing numerical features...")

# Every selected column goes through a StandardScaler (no categorical encoder
# is configured). The scaler is fitted on the training split only and then
# applied unchanged to the test split.
numeric_transformer = StandardScaler()
feature_encoder = ColumnTransformer(
    [('num', numeric_transformer, features_list_ms1)]
)
X_ms1_train = feature_encoder.fit_transform(X_ms1_train)
X_ms1_test = feature_encoder.transform(X_ms1_test)

# Sanity check: first five transformed rows
print('...Done.')
print("#### X_ms1_train AFTER preprocessing ####")
print(X_ms1_train[:5])



##BUILD MODEL(ms1)

In [None]:
# Fit a depth-limited regression tree on the scaled MS1 training features.
print("Training model...")
regressor = DecisionTreeRegressor(
    max_depth=5,          # maximum depth of the tree
    min_samples_split=5,  # minimum number of samples needed to split a node
    min_samples_leaf=1,   # minimum number of samples in a leaf
    random_state=0,
)
regressor.fit(X_ms1_train, y_ms1_train)
print("...Done.")

# Training-set predictions, then a preview of the first five
# (actual, predicted) pairs.
print("Predictions on train set...")
y_ms1_train_pred = regressor.predict(X_ms1_train)
print("...Done.")
print()
for vrai, pred in zip(y_ms1_train.iloc[:5], y_ms1_train_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")

# Test-set predictions with the same five-row preview.
print("Predictions on test set...")
y_ms1_test_pred = regressor.predict(X_ms1_test)
print("...Done.")
for vrai, pred in zip(y_ms1_test.iloc[:5], y_ms1_test_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")



#EVALUATE MODEL(ms1)

In [None]:
# Performance assessment (MS1).
# R² is computed from the predictions stored when the MS1 tree was fitted,
# not through the shared `regressor` variable: every other model section
# rebinds `regressor`, so re-running this cell out of order would score the
# wrong tree (silently, when the feature counts match).
print("--- Assessing the performances of the model ---")

r2_train_ms1 = r2_score(y_ms1_train, y_ms1_train_pred)
r2_test_ms1 = r2_score(y_ms1_test, y_ms1_test_pred)

# Print R^2 scores
print("R2 score on training set : ", r2_train_ms1)
print("R2 score on test set : ", r2_test_ms1)


#Features Importance(ms1)

In [None]:
# Importance assessment: feature importances of the tree currently bound to
# `regressor`, labelled with the MS1 feature names and sorted from most to
# least important.
importance_ms1 = pd.Series(
    regressor.feature_importances_, index=features_list_ms1
).sort_values(ascending=False)
print("Importance des features :")
print(importance_ms1)

# Long-format copy (Feature, Importance, Model) used by the cross-model
# comparison at the end of the notebook.
df_imp_ms1 = (
    importance_ms1
    .rename_axis('Feature')
    .reset_index(name='Importance')
    .assign(Model='MS1')
)



In [None]:
# Horizontal bar chart of the MS1 feature importances.
importance_ms1.plot.barh(title="Importance des features")
plt.show()


##PREPROCESSING - Modèle Socio (ms2)

In [None]:
# --- MS2: feature / target selection ---
features_list_ms2 = [
    'tx_urbanisation',
    'densite_2018_(hab/km²)',
    'tx_pop_sans_dipl',
]

print('Splitting dataset into X and Y...')
X_ms2 = df[features_list_ms2]
y_ms2 = df['tx_crim_pour_100 M_hab']
print('...Done.')

# --- 80/20 train/test split with a fixed seed ---
print('Splitting dataset into train test and test...')
(X_ms2_train, X_ms2_test,
 y_ms2_train, y_ms2_test) = train_test_split(
    X_ms2, y_ms2, test_size=0.2, random_state=0
)
print('...Done.')

# --- Preprocessing ---
print('---Training pipeline---')
print("#### X_train BEFORE preprocessing ####")
print(X_ms2_train.head())
print()
print("Encoding categorical features and standardizing numerical features...")

# Every selected column goes through a StandardScaler (no categorical encoder
# is configured). The scaler is fitted on the training split only and then
# applied unchanged to the test split.
numeric_transformer = StandardScaler()
feature_encoder = ColumnTransformer(
    [('num', numeric_transformer, features_list_ms2)]
)
X_ms2_train = feature_encoder.fit_transform(X_ms2_train)
X_ms2_test = feature_encoder.transform(X_ms2_test)

# Sanity check: first five transformed rows
print('...Done.')
print("#### X_ms2_train AFTER preprocessing ####")
print(X_ms2_train[:5])



#BUILD MODEL(ms2)

In [None]:
# Fit a depth-limited regression tree on the scaled MS2 training features.
print("Training model...")
regressor = DecisionTreeRegressor(
    max_depth=5,          # maximum depth of the tree
    min_samples_split=5,  # minimum number of samples needed to split a node
    min_samples_leaf=1,   # minimum number of samples in a leaf
    random_state=0,
)
regressor.fit(X_ms2_train, y_ms2_train)
print("...Done.")

# Training-set predictions, then a preview of the first five
# (actual, predicted) pairs.
print("Predictions on train set...")
y_ms2_train_pred = regressor.predict(X_ms2_train)
print("...Done.")
print()
for vrai, pred in zip(y_ms2_train.iloc[:5], y_ms2_train_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")

# Test-set predictions with the same five-row preview.
print("Predictions on test set...")
y_ms2_test_pred = regressor.predict(X_ms2_test)
print("...Done.")
for vrai, pred in zip(y_ms2_test.iloc[:5], y_ms2_test_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")



#EVALUATE MODEL(ms2)

In [None]:
# Performance assessment (MS2).
# R² is computed from the predictions stored when the MS2 tree was fitted,
# not through the shared `regressor` variable: every other model section
# rebinds `regressor`, so re-running this cell out of order would score the
# wrong tree (silently, when the feature counts match).
print("--- Assessing the performances of the model ---")

r2_train_ms2 = r2_score(y_ms2_train, y_ms2_train_pred)
r2_test_ms2 = r2_score(y_ms2_test, y_ms2_test_pred)

# Print R^2 scores
print("R2 score on training set : ", r2_train_ms2)
print("R2 score on test set : ", r2_test_ms2)


#Features Importance(ms2)

In [None]:
# Feature importances of the tree currently bound to `regressor`, labelled
# with the MS2 feature names and sorted from most to least important.
importance_ms2 = pd.Series(
    regressor.feature_importances_, index=features_list_ms2
).sort_values(ascending=False)
print("Importance des features :")
print(importance_ms2)

# Long-format copy (Feature, Importance, Model) used by the cross-model
# comparison at the end of the notebook.
df_imp_ms2 = (
    importance_ms2
    .rename_axis('Feature')
    .reset_index(name='Importance')
    .assign(Model='MS2')
)



In [None]:
# Horizontal bar chart of the MS2 feature importances.
importance_ms2.plot.barh(title="Importance des features")
plt.show()


##PREPROCESSING - Modèle Socio + tx chomage (ms1ptc)

In [None]:
# --- MS1PTC: feature / target selection ---
features_list_ms1ptc = [
    'tx_chomage',
    'tx_urbanisation',
    'densite_2018_(hab/km²)',
    'tx_scolarisation_pop',
]

print('Splitting dataset into X and Y...')
X_ms1ptc = df[features_list_ms1ptc]
y_ms1ptc = df['tx_crim_pour_100 M_hab']
print('...Done.')

# --- 80/20 train/test split with a fixed seed ---
print('Splitting dataset into train test and test...')
(X_ms1ptc_train, X_ms1ptc_test,
 y_ms1ptc_train, y_ms1ptc_test) = train_test_split(
    X_ms1ptc, y_ms1ptc, test_size=0.2, random_state=0
)
print('...Done.')

# --- Preprocessing ---
print('---Training pipeline---')
print("#### X_train BEFORE preprocessing ####")
print(X_ms1ptc_train.head())
print()
print("Encoding categorical features and standardizing numerical features...")

# Every selected column goes through a StandardScaler (no categorical encoder
# is configured). The scaler is fitted on the training split only and then
# applied unchanged to the test split.
numeric_transformer = StandardScaler()
feature_encoder = ColumnTransformer(
    [('num', numeric_transformer, features_list_ms1ptc)]
)
X_ms1ptc_train = feature_encoder.fit_transform(X_ms1ptc_train)
X_ms1ptc_test = feature_encoder.transform(X_ms1ptc_test)

# Sanity check: first five transformed rows
print('...Done.')
print("#### X_ms_train AFTER preprocessing ####")
print(X_ms1ptc_train[:5])



#BUILD MODEL(ms1ptc)

In [None]:
# Fit a depth-limited regression tree on the scaled MS1PTC training features.
print("Training model...")
regressor = DecisionTreeRegressor(
    max_depth=5,          # maximum depth of the tree
    min_samples_split=5,  # minimum number of samples needed to split a node
    min_samples_leaf=1,   # minimum number of samples in a leaf
    random_state=0,
)
regressor.fit(X_ms1ptc_train, y_ms1ptc_train)
print("...Done.")

# Training-set predictions, then a preview of the first five
# (actual, predicted) pairs.
print("Predictions on train set...")
y_ms1ptc_train_pred = regressor.predict(X_ms1ptc_train)
print("...Done.")
print()
for vrai, pred in zip(y_ms1ptc_train.iloc[:5], y_ms1ptc_train_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")

# Test-set predictions with the same five-row preview.
print("Predictions on test set...")
y_ms1ptc_test_pred = regressor.predict(X_ms1ptc_test)
print("...Done.")
for vrai, pred in zip(y_ms1ptc_test.iloc[:5], y_ms1ptc_test_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")



#EVALUATE MODEL(ms1ptc)

In [None]:
# Performance assessment (MS1PTC).
# R² is computed from the predictions stored when the MS1PTC tree was fitted,
# not through the shared `regressor` variable: every other model section
# rebinds `regressor`, so re-running this cell out of order would score the
# wrong tree (silently, when the feature counts match).
print("--- Assessing the performances of the model ---")

r2_train_ms1ptc = r2_score(y_ms1ptc_train, y_ms1ptc_train_pred)
r2_test_ms1ptc = r2_score(y_ms1ptc_test, y_ms1ptc_test_pred)

# Print R^2 scores
print("R2 score on training set : ", r2_train_ms1ptc)
print("R2 score on test set : ", r2_test_ms1ptc)


#Features Importance (ms1ptc)

In [None]:
# Feature importances of the tree currently bound to `regressor`, labelled
# with the MS1PTC feature names and sorted from most to least important.
importance_ms1ptc = pd.Series(
    regressor.feature_importances_, index=features_list_ms1ptc
).sort_values(ascending=False)
print("Importance des features :")
print(importance_ms1ptc)

# Long-format copy (Feature, Importance, Model) used by the cross-model
# comparison at the end of the notebook.
df_imp_ms1ptc = (
    importance_ms1ptc
    .rename_axis('Feature')
    .reset_index(name='Importance')
    .assign(Model='MS1PTC')
)



In [None]:
# Horizontal bar chart of the MS1PTC feature importances.
importance_ms1ptc.plot.barh(title="Importance des features")
plt.show()


##PREPROCESSING - Modèle Socio + tx chomage (ms2ptc)

In [None]:
# --- MS2PTC: feature / target selection ---
features_list_ms2ptc = [
    'tx_chomage',
    'tx_urbanisation',
    'densite_2018_(hab/km²)',
    'tx_pop_sans_dipl',
]

print('Splitting dataset into X and Y...')
X_ms2ptc = df[features_list_ms2ptc]
y_ms2ptc = df['tx_crim_pour_100 M_hab']
print('...Done.')

# --- 80/20 train/test split with a fixed seed ---
print('Splitting dataset into train test and test...')
(X_ms2ptc_train, X_ms2ptc_test,
 y_ms2ptc_train, y_ms2ptc_test) = train_test_split(
    X_ms2ptc, y_ms2ptc, test_size=0.2, random_state=0
)
print('...Done.')

# --- Preprocessing ---
print('---Training pipeline---')
print("#### X_train BEFORE preprocessing ####")
print(X_ms2ptc_train.head())
print()
print("Encoding categorical features and standardizing numerical features...")

# Every selected column goes through a StandardScaler (no categorical encoder
# is configured). The scaler is fitted on the training split only and then
# applied unchanged to the test split.
numeric_transformer = StandardScaler()
feature_encoder = ColumnTransformer(
    [('num', numeric_transformer, features_list_ms2ptc)]
)
X_ms2ptc_train = feature_encoder.fit_transform(X_ms2ptc_train)
X_ms2ptc_test = feature_encoder.transform(X_ms2ptc_test)

# Sanity check: first five transformed rows
print('...Done.')
print("#### X_ms_train AFTER preprocessing ####")
print(X_ms2ptc_train[:5])



#BUILD MODEL(ms2ptc)

In [None]:
# Fit a depth-limited regression tree on the scaled MS2PTC training features.
print("Training model...")
regressor = DecisionTreeRegressor(
    max_depth=5,          # maximum depth of the tree
    min_samples_split=5,  # minimum number of samples needed to split a node
    min_samples_leaf=1,   # minimum number of samples in a leaf
    random_state=0,
)
regressor.fit(X_ms2ptc_train, y_ms2ptc_train)
print("...Done.")

# Training-set predictions, then a preview of the first five
# (actual, predicted) pairs.
print("Predictions on train set...")
y_ms2ptc_train_pred = regressor.predict(X_ms2ptc_train)
print("...Done.")
print()
for vrai, pred in zip(y_ms2ptc_train.iloc[:5], y_ms2ptc_train_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")

# Test-set predictions with the same five-row preview.
print("Predictions on test set...")
y_ms2ptc_test_pred = regressor.predict(X_ms2ptc_test)
print("...Done.")
for vrai, pred in zip(y_ms2ptc_test.iloc[:5], y_ms2ptc_test_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")



#EVALUATE MODEL(ms2ptc)

In [None]:
# Performance assessment (MS2PTC).
# R² is computed from the predictions stored when the MS2PTC tree was fitted,
# not through the shared `regressor` variable: every other model section
# rebinds `regressor`, so re-running this cell out of order would score the
# wrong tree (silently, when the feature counts match).
print("--- Assessing the performances of the model ---")

r2_train_ms2ptc = r2_score(y_ms2ptc_train, y_ms2ptc_train_pred)
r2_test_ms2ptc = r2_score(y_ms2ptc_test, y_ms2ptc_test_pred)

# Print R^2 scores
print("R2 score on training set : ", r2_train_ms2ptc)
print("R2 score on test set : ", r2_test_ms2ptc)


#Features Importance (ms2ptc)

In [None]:
# Feature importances of the tree currently bound to `regressor`, labelled
# with the MS2PTC feature names and sorted from most to least important.
importance_ms2ptc = pd.Series(
    regressor.feature_importances_, index=features_list_ms2ptc
).sort_values(ascending=False)
print("Importance des features :")
print(importance_ms2ptc)

# Long-format copy (Feature, Importance, Model) used by the cross-model
# comparison at the end of the notebook.
df_imp_ms2ptc = (
    importance_ms2ptc
    .rename_axis('Feature')
    .reset_index(name='Importance')
    .assign(Model='MS2PTC')
)



In [None]:
# Horizontal bar chart of the MS2PTC feature importances.
importance_ms2ptc.plot.barh(title="Importance des features")
plt.show()


##PREPROCESSING - Modèle Socio + tx pauvrete (ms1ptp)

In [None]:
# --- MS1PTP: feature / target selection ---
features_list_ms1ptp = [
    'tx_pauvrete',
    'tx_urbanisation',
    'densite_2018_(hab/km²)',
    'tx_scolarisation_pop',
]

print('Splitting dataset into X and Y...')
X_ms1ptp = df[features_list_ms1ptp]
y_ms1ptp = df['tx_crim_pour_100 M_hab']
print('...Done.')

# --- 80/20 train/test split with a fixed seed ---
print('Splitting dataset into train test and test...')
(X_ms1ptp_train, X_ms1ptp_test,
 y_ms1ptp_train, y_ms1ptp_test) = train_test_split(
    X_ms1ptp, y_ms1ptp, test_size=0.2, random_state=0
)
print('...Done.')

# --- Preprocessing ---
print('---Training pipeline---')
print("#### X_train BEFORE preprocessing ####")
print(X_ms1ptp_train.head())
print()
print("Encoding categorical features and standardizing numerical features...")

# Every selected column goes through a StandardScaler (no categorical encoder
# is configured). The scaler is fitted on the training split only and then
# applied unchanged to the test split.
numeric_transformer = StandardScaler()
feature_encoder = ColumnTransformer(
    [('num', numeric_transformer, features_list_ms1ptp)]
)
X_ms1ptp_train = feature_encoder.fit_transform(X_ms1ptp_train)
X_ms1ptp_test = feature_encoder.transform(X_ms1ptp_test)

# Sanity check: first five transformed rows
print('...Done.')
print("#### X_ms_train AFTER preprocessing ####")
print(X_ms1ptp_train[:5])



#BUILD MODEL(ms1ptp)

In [None]:
# Fit a depth-limited regression tree on the scaled MS1PTP training features.
print("Training model...")
regressor = DecisionTreeRegressor(
    max_depth=5,          # maximum depth of the tree
    min_samples_split=5,  # minimum number of samples needed to split a node
    min_samples_leaf=1,   # minimum number of samples in a leaf
    random_state=0,
)
regressor.fit(X_ms1ptp_train, y_ms1ptp_train)
print("...Done.")

# Training-set predictions, then a preview of the first five
# (actual, predicted) pairs.
print("Predictions on train set...")
y_ms1ptp_train_pred = regressor.predict(X_ms1ptp_train)
print("...Done.")
print()
for vrai, pred in zip(y_ms1ptp_train.iloc[:5], y_ms1ptp_train_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")

# Test-set predictions with the same five-row preview.
print("Predictions on test set...")
y_ms1ptp_test_pred = regressor.predict(X_ms1ptp_test)
print("...Done.")
for vrai, pred in zip(y_ms1ptp_test.iloc[:5], y_ms1ptp_test_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")



#EVALUATE MODEL(ms1ptp)

In [None]:
# Performance assessment (MS1PTP).
# R² is computed from the predictions stored when the MS1PTP tree was fitted,
# not through the shared `regressor` variable: every other model section
# rebinds `regressor`, so re-running this cell out of order would score the
# wrong tree (silently, when the feature counts match).
print("--- Assessing the performances of the model ---")

r2_train_ms1ptp = r2_score(y_ms1ptp_train, y_ms1ptp_train_pred)
r2_test_ms1ptp = r2_score(y_ms1ptp_test, y_ms1ptp_test_pred)

# Print R^2 scores
print("R2 score on training set : ", r2_train_ms1ptp)
print("R2 score on test set : ", r2_test_ms1ptp)


#Features Importance (ms1ptp)

In [None]:
# Feature importances of the tree currently bound to `regressor`, labelled
# with the MS1PTP feature names and sorted from most to least important.
importance_ms1ptp = pd.Series(
    regressor.feature_importances_, index=features_list_ms1ptp
).sort_values(ascending=False)
print("Importance des features :")
print(importance_ms1ptp)

# Long-format copy (Feature, Importance, Model) used by the cross-model
# comparison at the end of the notebook.
df_imp_ms1ptp = (
    importance_ms1ptp
    .rename_axis('Feature')
    .reset_index(name='Importance')
    .assign(Model='MS1PTP')
)


In [None]:
# Horizontal bar chart of the MS1PTP feature importances.
importance_ms1ptp.plot.barh(title="Importance des features")
plt.show()


##PREPROCESSING - Modèle Socio + tx pauvrete (ms2ptp)

In [None]:
# --- MS2PTP: feature / target selection ---
# Fix: this list previously ended with 'tx_scolarisation_pop', which made it
# identical to the MS1PTP list, so "MS2PTP" was just a second copy of MS1PTP.
# Every other "2" variant in this notebook (mse2, ms2, ms2ptc) swaps the
# schooling rate for 'tx_pop_sans_dipl'; this section now does the same.
features_list_ms2ptp = [
    'tx_pauvrete',
    'tx_urbanisation',
    'densite_2018_(hab/km²)',
    'tx_pop_sans_dipl',
]

print('Splitting dataset into X and Y...')
X_ms2ptp = df[features_list_ms2ptp]
y_ms2ptp = df['tx_crim_pour_100 M_hab']
print('...Done.')

# --- 80/20 train/test split with a fixed seed ---
print('Splitting dataset into train test and test...')
(X_ms2ptp_train, X_ms2ptp_test,
 y_ms2ptp_train, y_ms2ptp_test) = train_test_split(
    X_ms2ptp, y_ms2ptp, test_size=0.2, random_state=0
)
print('...Done.')

# --- Preprocessing ---
print('---Training pipeline---')
print("#### X_train BEFORE preprocessing ####")
print(X_ms2ptp_train.head())
print()
print("Encoding categorical features and standardizing numerical features...")

# Every selected column goes through a StandardScaler (no categorical encoder
# is configured). The scaler is fitted on the training split only and then
# applied unchanged to the test split.
numeric_transformer = StandardScaler()
feature_encoder = ColumnTransformer(
    [('num', numeric_transformer, features_list_ms2ptp)]
)
X_ms2ptp_train = feature_encoder.fit_transform(X_ms2ptp_train)
X_ms2ptp_test = feature_encoder.transform(X_ms2ptp_test)

# Sanity check: first five transformed rows
print('...Done.')
print("#### X_ms_train AFTER preprocessing ####")
print(X_ms2ptp_train[:5])



#BUILD MODEL(ms2ptp)

In [None]:
# Fit a depth-limited regression tree on the scaled MS2PTP training features.
print("Training model...")
regressor = DecisionTreeRegressor(
    max_depth=5,          # maximum depth of the tree
    min_samples_split=5,  # minimum number of samples needed to split a node
    min_samples_leaf=1,   # minimum number of samples in a leaf
    random_state=0,
)
regressor.fit(X_ms2ptp_train, y_ms2ptp_train)
print("...Done.")

# Training-set predictions, then a preview of the first five
# (actual, predicted) pairs.
print("Predictions on train set...")
y_ms2ptp_train_pred = regressor.predict(X_ms2ptp_train)
print("...Done.")
print()
for vrai, pred in zip(y_ms2ptp_train.iloc[:5], y_ms2ptp_train_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")

# Test-set predictions with the same five-row preview.
print("Predictions on test set...")
y_ms2ptp_test_pred = regressor.predict(X_ms2ptp_test)
print("...Done.")
for vrai, pred in zip(y_ms2ptp_test.iloc[:5], y_ms2ptp_test_pred[:5]):
    print(f"Réel: {vrai:.2f}  →  Prédit: {pred:.2f}")



#EVALUATE MODEL(ms2ptp)

In [None]:
# Performance assessment (MS2PTP).
# R² is computed from the predictions stored when the MS2PTP tree was fitted,
# not through the shared `regressor` variable: every other model section
# rebinds `regressor`, so re-running this cell out of order would score the
# wrong tree (silently, when the feature counts match).
print("--- Assessing the performances of the model ---")

r2_train_ms2ptp = r2_score(y_ms2ptp_train, y_ms2ptp_train_pred)
r2_test_ms2ptp = r2_score(y_ms2ptp_test, y_ms2ptp_test_pred)

# Print R^2 scores
print("R2 score on training set : ", r2_train_ms2ptp)
print("R2 score on test set : ", r2_test_ms2ptp)


#Features Importance (ms2ptp)

In [None]:
# Feature importances of the tree currently bound to `regressor`, labelled
# with the MS2PTP feature names and sorted from most to least important.
importance_ms2ptp = pd.Series(
    regressor.feature_importances_, index=features_list_ms2ptp
).sort_values(ascending=False)
print("Importance des features :")
print(importance_ms2ptp)

# Long-format copy (Feature, Importance, Model) used by the cross-model
# comparison at the end of the notebook.
df_imp_ms2ptp = (
    importance_ms2ptp
    .rename_axis('Feature')
    .reset_index(name='Importance')
    .assign(Model='MS2PTP')
)



In [None]:
# Horizontal bar chart of the MS2PTP feature importances.
importance_ms2ptp.plot.barh(title="Importance des features")
plt.show()


###Regroupement modèles

In [None]:
# One row per model with its train and test R², ordered from the best to the
# worst test R².
df_results = pd.DataFrame(
    [
        ('MSE1', r2_train_mse1, r2_test_mse1),
        ('MSE2', r2_train_mse2, r2_test_mse2),
        ('MS1', r2_train_ms1, r2_test_ms1),
        ('MS2', r2_train_ms2, r2_test_ms2),
        ('MS1PTC', r2_train_ms1ptc, r2_test_ms1ptc),
        ('MS2PTC', r2_train_ms2ptc, r2_test_ms2ptc),
        ('MS1PTP', r2_train_ms1ptp, r2_test_ms1ptp),
        ('MS2PTP', r2_train_ms2ptp, r2_test_ms2ptp),
    ],
    columns=['Model', 'R2_train', 'R2_test'],
).sort_values(by='R2_test', ascending=False)

print(df_results)


In [None]:
# Grouped bar chart comparing train and test R² for each model, in the row
# order of df_results.
x = np.arange(len(df_results))  # one slot per model
width = 0.35                    # bar width

fig, ax = plt.subplots(figsize=(12, 6))
# Train bars sit half a width left of each slot, test bars half a width right.
ax.bar(x - width / 2, df_results['R2_train'], width, label='R2_train')
ax.bar(x + width / 2, df_results['R2_test'], width, label='R2_test')

ax.set(
    xlabel='Modèle',
    ylabel='R²',
    title='Comparaison R² Train vs R² Test par modèle',
)
ax.set_xticks(x)
ax.set_xticklabels(df_results['Model'])
ax.legend()

plt.show()


In [None]:
# Stack the per-model importance frames into one long table with a fresh
# 0..n-1 index.
df_imp = pd.concat(
    [
        df_imp_mse1, df_imp_mse2,
        df_imp_ms1, df_imp_ms2,
        df_imp_ms1ptc, df_imp_ms2ptc,
        df_imp_ms1ptp, df_imp_ms2ptp,
    ],
    ignore_index=True,
)


In [None]:
# Grouped bar chart of feature importance per model, one hue per feature.
# The duplicate `import seaborn` / `import matplotlib.pyplot` lines that used
# to open this cell are removed: both modules are already imported as `sns`
# and `plt` in the notebook's import cell, and imports belong in that cell.
plt.figure(figsize=(12, 6))
sns.barplot(data=df_imp, x='Model', y='Importance', hue='Feature')
plt.title('Importance des features par modèle')
plt.xticks(rotation=45)
# Anchor the legend outside the axes so it does not cover the bars.
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
