In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [3]:
df = pd.read_csv('dataset.csv')
df.drop_duplicates(inplace=True)

In [4]:
# Définir les tranches de BMI
bins = [0, 18.5, 24.9, 29.9, 40, 100]  # Tranches de BMI
labels = ['Sous-poids', 'Poids normal', 'Surpoids', 'Obésité', 'Obésité sévère']

# Ajouter une nouvelle colonne dans le DataFrame pour les tranches de BMI
df['BMI_category'] = pd.cut(df['bmi'], bins=bins, labels=labels, right=False)

In [5]:
# Définir les tranches d'ages
bins = [18, 36, 45, 55, 65]  # Tranches de BMI
labels = ['18-35', '36-45', '46-55', 'plus de 55 ans']

# Ajouter une nouvelle colonne dans le DataFrame pour les tranches de BMI
df['age_categories'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)

In [6]:
#Label encoder pour le sexe et si les usagers sont fumeurs

labe_encod = LabelEncoder()
df['sex_encode']= labe_encod.fit_transform(df['sex'])
df['smoker_encode']= labe_encod.fit_transform(df['smoker'])
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,BMI_category,age_categories,sex_encode,smoker_encode
0,19,female,27.900,0,yes,southwest,16884.92400,Surpoids,18-35,0,1
1,18,male,33.770,1,no,southeast,1725.55230,Obésité,18-35,1,0
2,28,male,33.000,3,no,southeast,4449.46200,Obésité,18-35,1,0
3,33,male,22.705,0,no,northwest,21984.47061,Poids normal,18-35,1,0
4,32,male,28.880,0,no,northwest,3866.85520,Surpoids,18-35,1,0
...,...,...,...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,Obésité,46-55,1,0
1334,18,female,31.920,0,no,northeast,2205.98080,Obésité,18-35,0,0
1335,18,female,36.850,0,no,southeast,1629.83350,Obésité,18-35,0,0
1336,21,female,25.800,0,no,southwest,2007.94500,Surpoids,18-35,0,0


In [11]:
df_reg_encod = pd.get_dummies(df['region'])
df_reg_encod

Unnamed: 0,northeast,northwest,southeast,southwest
0,False,False,False,True
1,False,False,True,False
2,False,False,True,False
3,False,True,False,False
4,False,True,False,False
...,...,...,...,...
1333,False,True,False,False
1334,True,False,False,False
1335,False,False,True,False
1336,False,False,False,True


In [7]:
# encodage par région et par bmi category

df_reg_encod = pd.get_dummies(df['region'])
df_reg_encod.replace(False, 0, inplace=True)
df_reg_encod.replace(True, 1, inplace=True)
df_reg_encod

df_bmi_encod = pd.get_dummies(df['BMI_category'])
df_bmi_encod.replace(False, 0, inplace=True)
df_bmi_encod.replace(True, 1, inplace=True)
df_bmi_encod

  df_reg_encod.replace(True, 1, inplace=True)
  df_bmi_encod.replace(True, 1, inplace=True)


Unnamed: 0,Sous-poids,Poids normal,Surpoids,Obésité,Obésité sévère
0,0,0,1,0,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,1,0,0,0
4,0,0,1,0,0
...,...,...,...,...,...
1333,0,0,0,1,0
1334,0,0,0,1,0
1335,0,0,0,1,0
1336,0,0,1,0,0


In [9]:
#concatener les df encodé avec le df

df_2 = pd.concat([df,df_reg_encod,df_bmi_encod], axis=1)
#supprimer les colonnes qui ont été encodés

df_2.drop(columns=['sex','smoker','region','BMI_category','age_categories'],inplace=True)
df_2

Unnamed: 0,age,bmi,children,charges,sex_encode,smoker_encode,northeast,northwest,southeast,southwest,Sous-poids,Poids normal,Surpoids,Obésité,Obésité sévère
0,19,27.900,0,16884.92400,0,1,0,0,0,1,0,0,1,0,0
1,18,33.770,1,1725.55230,1,0,0,0,1,0,0,0,0,1,0
2,28,33.000,3,4449.46200,1,0,0,0,1,0,0,0,0,1,0
3,33,22.705,0,21984.47061,1,0,0,1,0,0,0,1,0,0,0
4,32,28.880,0,3866.85520,1,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,0,1,0,0,0,0,0,1,0
1334,18,31.920,0,2205.98080,0,0,1,0,0,0,0,0,0,1,0
1335,18,36.850,0,1629.83350,0,0,0,0,1,0,0,0,0,1,0
1336,21,25.800,0,2007.94500,0,0,0,0,0,1,0,0,1,0,0


In [12]:
df_2

Unnamed: 0,age,bmi,children,charges,sex_encode,smoker_encode,northeast,northwest,southeast,southwest,Sous-poids,Poids normal,Surpoids,Obésité,Obésité sévère
0,19,27.900,0,16884.92400,0,1,0,0,0,1,0,0,1,0,0
1,18,33.770,1,1725.55230,1,0,0,0,1,0,0,0,0,1,0
2,28,33.000,3,4449.46200,1,0,0,0,1,0,0,0,0,1,0
3,33,22.705,0,21984.47061,1,0,0,1,0,0,0,1,0,0,0
4,32,28.880,0,3866.85520,1,0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,0,1,0,0,0,0,0,1,0
1334,18,31.920,0,2205.98080,0,0,1,0,0,0,0,0,0,1,0
1335,18,36.850,0,1629.83350,0,0,0,0,1,0,0,0,0,1,0
1336,21,25.800,0,2007.94500,0,0,0,0,0,1,0,0,1,0,0


In [None]:
line_reg = LinearRegression()