In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt 
import scipy.stats as stats 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score


In [2]:
# Load the project dataset from the CSV file sitting next to the notebook,
# then preview its first rows (bare last expression -> rich display).
df = pd.read_csv(filepath_or_buffer='projet_finale_vf.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,departement,tx_pauvrete,revenu_median,revenu_moyen,tx_crim_pour_100 M_hab,tx_chomage,tx_urbanisation,densite_2018_(hab/km²),tx_scolarisation_pop
0,0,20,18.55,20060.0,3.6,3576.184249,9.425,71.85,39.0,58.633464
1,1,1,10.5,22640.0,3.4,3273.801197,6.75,67.0,112.0,58.320593
2,2,2,18.5,19100.0,3.1,3752.923347,12.95,53.2,72.0,57.65585
3,3,3,15.4,19750.0,3.0,3587.40547,9.675,58.3,46.0,58.484813
4,4,4,16.6,19940.0,3.2,4462.068755,10.7,61.9,24.0,58.337452


In [3]:
# Show the column labels of the freshly loaded frame.
print(df.columns)

Index(['Unnamed: 0', 'departement', 'tx_pauvrete', 'revenu_median',
       'revenu_moyen', 'tx_crim_pour_100 M_hab', 'tx_chomage',
       'tx_urbanisation', 'densite_2018_(hab/km²)', 'tx_scolarisation_pop'],
      dtype='object')


In [4]:
# Remove the 'Unnamed: 0' column (it appears to be a row index that was saved
# into the CSV: its values mirror the row index in the preview above).
# `errors='ignore'` keeps the cell idempotent: because `df` is rebound here,
# re-running the cell would otherwise raise a KeyError on the missing column.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df.head()

Unnamed: 0,departement,tx_pauvrete,revenu_median,revenu_moyen,tx_crim_pour_100 M_hab,tx_chomage,tx_urbanisation,densite_2018_(hab/km²),tx_scolarisation_pop
0,20,18.55,20060.0,3.6,3576.184249,9.425,71.85,39.0,58.633464
1,1,10.5,22640.0,3.4,3273.801197,6.75,67.0,112.0,58.320593
2,2,18.5,19100.0,3.1,3752.923347,12.95,53.2,72.0,57.65585
3,3,15.4,19750.0,3.0,3587.40547,9.675,58.3,46.0,58.484813
4,4,16.6,19940.0,3.2,4462.068755,10.7,61.9,24.0,58.337452


In [5]:
# Quick exploratory report: dimensions, summary statistics, missing values
# and a short preview. Printed messages are runtime output and kept verbatim.
separator = '***********************************************'
missing_by_column = df.isnull().sum()    # missing-cell count for each column
missing_total = missing_by_column.sum()  # grand total over the whole frame

print('nompbre de lignes & de colonnes')
print(df.shape)
print(separator)

print('descriptif du dataframe')
print(df.describe(include='all'))
print(separator)

print('nombre de valeurs null')
print(missing_total)
if missing_total:
    # Break the count down per column only when something is actually missing.
    print('nombre de valeur null par colonne')
    print(missing_by_column)
print(separator)

print('aperçu des 5 premières lignes')
print(df.head())

nompbre de lignes & de colonnes
(95, 9)
***********************************************
descriptif du dataframe
       departement  tx_pauvrete  revenu_median  revenu_moyen  \
count    95.000000    95.000000       95.00000     95.000000   
mean     48.000000    14.380526    20788.00000      3.174737   
std      27.568098     2.997670     1619.34489      0.486404   
min       1.000000     9.100000    17310.00000      2.600000   
25%      24.500000    12.350000    19865.00000      2.900000   
50%      48.000000    14.300000    20410.00000      3.100000   
75%      71.500000    15.450000    21320.00000      3.300000   
max      95.000000    27.900000    27400.00000      6.300000   

       tx_crim_pour_100 M_hab  tx_chomage  tx_urbanisation  \
count               95.000000   95.000000        95.000000   
mean              4132.519919    9.042632        68.030000   
std               1385.850369    1.704448        17.712418   
min               2325.981963    5.650000        21.400000   
2

In [6]:
# Drop the columns that are not used by the model: 'departement' (presumably a
# plain identifier - TODO confirm) and 'revenu_moyen' (reason not stated in the
# notebook - TODO document why it is excluded).
# `errors='ignore'` keeps the cell idempotent: because `df` is rebound here,
# re-running the cell would otherwise raise a KeyError on the missing columns.
df = df.drop(columns=['departement', 'revenu_moyen'], errors='ignore')
print(df.head())

   tx_pauvrete  revenu_median  tx_crim_pour_100 M_hab  tx_chomage  \
0        18.55        20060.0             3576.184249       9.425   
1        10.50        22640.0             3273.801197       6.750   
2        18.50        19100.0             3752.923347      12.950   
3        15.40        19750.0             3587.405470       9.675   
4        16.60        19940.0             4462.068755      10.700   

   tx_urbanisation  densite_2018_(hab/km²)  tx_scolarisation_pop  
0            71.85                    39.0             58.633464  
1            67.00                   112.0             58.320593  
2            53.20                    72.0             57.655850  
3            58.30                    46.0             58.484813  
4            61.90                    24.0             58.337452  


In [7]:
# List the column labels currently held by df (sanity check before modelling).
print(df.columns)

Index(['tx_pauvrete', 'revenu_median', 'tx_crim_pour_100 M_hab', 'tx_chomage',
       'tx_urbanisation', 'densite_2018_(hab/km²)', 'tx_scolarisation_pop'],
      dtype='object')


# PREPROCESSING - Modèle socio-éco (mse)

In [8]:
# Separate the explanatory variables (X) from the target variable (y).

# Columns used as model inputs; the crime-rate column is the target and is
# therefore deliberately absent from this list.
features_list = [
    'tx_pauvrete',
    'revenu_median',
    'tx_chomage',
    'tx_urbanisation',
    'densite_2018_(hab/km²)',
    'tx_scolarisation_pop',
]

print('Splitting dataset into X and Y...')
X_mse = df.loc[:, features_list]              # feature matrix (DataFrame)
y_mse = df.loc[:, 'tx_crim_pour_100 M_hab']   # target (Series)
print('...Done.')


Splitting dataset into X and Y...
...Done.


In [9]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) reproducible from one run to the next.
print('Splitting dataset into train test and test...')
(X_mse_train, X_mse_test,
 y_mse_train, y_mse_test) = train_test_split(X_mse, y_mse, test_size=0.2, random_state=0)
print('...Done.')

Splitting dataset into train test and test...
...Done.


In [10]:
### Training pipeline ###
print('---Training pipeline---')

# All six selected features are numeric, so the only preprocessing step is
# standardization; no categorical encoder is registered below.

print("#### X_train BEFORE preprocessing ####")
print(X_mse_train)
print()

print("Encoding categorical features and standardizing numerical features...")

# Positional indices of the columns to scale (0..5 = every column of X_mse_train).
numeric_features = list(range(6))
numeric_transformer = StandardScaler()

# Wrap the scaler in a ColumnTransformer so the same fitted object can later be
# applied to other splits.
feature_encoder = ColumnTransformer(
    transformers=[('num', numeric_transformer, numeric_features)]
)

# Fit on the training split only, then transform it.
# NOTE: this rebinds X_mse_train from a DataFrame to a NumPy array, so the cell
# is meant to be run once per kernel session (a second run would refit the
# scaler on already-scaled data).
X_mse_train = feature_encoder.fit_transform(X_mse_train)
print('...Done.')
print("#### X_mse_train AFTER preprocessing ####")
print(X_mse_train[:5, :])  # first 5 rows; plain slicing because this is now a NumPy array


---Training pipeline---
#### X_train BEFORE preprocessing ####
    tx_pauvrete  revenu_median  tx_chomage  tx_urbanisation  \
45         14.8        20220.0       8.775             39.5   
26         12.5        21060.0       9.500             55.9   
43         10.1        21910.0       7.550             84.6   
24         11.9        21900.0       8.250             65.4   
6          15.5        21590.0       9.800             95.9   
..          ...            ...         ...              ...   
91         11.8        27090.0       7.075            100.0   
67         12.9        22490.0       9.000             78.9   
64         15.0        19910.0      10.300             62.2   
47         14.9        19770.0       5.950             37.0   
44         13.4        21280.0       9.050             74.9   

    densite_2018_(hab/km²)  tx_scolarisation_pop  
45                    33.0             58.993269  
26                    99.0             57.975449  
43                   207.0 

## BUILD MODEL (mse)

In [11]:
# Fit a decision-tree regressor on the preprocessed training split.
# random_state is pinned so the fitted tree is reproducible across runs.
print("Training model...")
regressor = DecisionTreeRegressor(random_state=0).fit(X_mse_train, y_mse_train)
print("...Done.")


Training model...
...Done.


In [12]:
# In-sample predictions: run the fitted model on the data it was trained on.
print("Predictions on train set...")
y_mse_train_pred = regressor.predict(X_mse_train)
print("...Done.", end="\n\n")  # same output as print("...Done.") followed by print()


Predictions on train set...
...Done.



In [13]:
# Predictions on test set
print("Predictions on test set...")
# The model was fitted on standardized features, so the test split has to go
# through the SAME fitted encoder first (transform only - never refit on test
# data). Predicting on the raw X_mse_test would apply the tree's split
# thresholds to values on a completely different scale.
# A separate name keeps the raw X_mse_test intact and the cell safe to re-run;
# use X_mse_test_scaled for any further evaluation of this model.
X_mse_test_scaled = feature_encoder.transform(X_mse_test)
y_mse_test_pred = regressor.predict(X_mse_test_scaled)
print("...Done.")
print()


Predictions on test set...
...Done.



