In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split,GridSearchCV,learning_curve,cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet

In [23]:
# Load the CSV file into a DataFrame and preview its first rows.
df= pd.read_csv('brief_assurance.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [24]:
def categorie_bmi(bmi):
    """Map a body-mass-index value to its weight-category label.

    Parameters
    ----------
    bmi : float
        Body-mass-index value to classify.

    Returns
    -------
    str or float
        'faible' when bmi < 18.5, 'normal' when bmi < 25, 'surpoids' when
        bmi < 30 and 'obesité' otherwise. A missing value (NaN / None) is
        returned as NaN instead of being given a label.
    """
    # Every comparison with NaN evaluates to False, so without this guard a
    # missing BMI would fall through all branches and be labelled 'obesité'.
    if pd.isna(bmi):
        return np.nan
    if bmi < 18.5:
        return 'faible'
    elif bmi < 25:
        return 'normal'
    elif bmi < 30:
        return 'surpoids'
    else:
        return 'obesité'

# Replace the numeric 'bmi' column with its category label: the new
# 'categorie_bmi' column is appended last, then 'bmi' is removed.
df = (
    df
    .assign(categorie_bmi=df['bmi'].apply(categorie_bmi))
    .drop(columns=['bmi'])
)

In [25]:
# Display the frame after the transformation: 'bmi' is gone and
# 'categorie_bmi' has been added.
df

Unnamed: 0,age,sex,children,smoker,region,charges,categorie_bmi
0,19,female,0,yes,southwest,16884.92400,surpoids
1,18,male,1,no,southeast,1725.55230,obesité
2,28,male,3,no,southeast,4449.46200,obesité
3,33,male,0,no,northwest,21984.47061,normal
4,32,male,0,no,northwest,3866.85520,surpoids
...,...,...,...,...,...,...,...
1333,50,male,3,no,northwest,10600.54830,obesité
1334,18,female,0,no,northeast,2205.98080,obesité
1335,18,female,0,no,southeast,1629.83350,obesité
1336,21,female,0,no,southwest,2007.94500,surpoids


In [26]:
# Summarise the columns: dtype and non-null count for each one.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            1338 non-null   int64  
 1   sex            1338 non-null   object 
 2   children       1338 non-null   int64  
 3   smoker         1338 non-null   object 
 4   region         1338 non-null   object 
 5   charges        1338 non-null   float64
 6   categorie_bmi  1338 non-null   object 
dtypes: float64(1), int64(2), object(4)
memory usage: 73.3+ KB


In [28]:
# Separate the target ('charges') from the predictors (every other column).
y = df['charges']
X = df.drop(columns=['charges'])

# Hold out 20 % of the rows as a test set. The split is seeded and stratified
# on the combination of 'smoker', 'region' and 'sex'.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=X[['smoker', 'region', 'sex']],
)

In [29]:
# Inspect the training features produced by the split.
X_train

Unnamed: 0,age,sex,children,smoker,region,categorie_bmi
717,60,male,1,no,northwest,normal
787,21,male,0,no,northwest,obesité
323,57,male,0,no,northeast,obesité
393,49,male,1,no,northeast,obesité
1102,29,male,1,no,southeast,obesité
...,...,...,...,...,...,...
1079,63,male,3,no,southeast,obesité
438,52,female,5,no,southeast,obesité
823,44,female,2,no,southeast,surpoids
1302,25,female,1,no,southwest,normal


In [30]:
# Split the feature names by dtype: numeric columns will be scaled,
# object (string) columns will be one-hot encoded.
quantitative_col = X_train.select_dtypes(include=[float, int]).columns.tolist()
categoriel_col = X_train.select_dtypes(include=[object]).columns.tolist()

# One single-step pipeline per column family.
categoriel_pipeline = make_pipeline(OneHotEncoder(drop='if_binary'))
quantitative_pipeline = make_pipeline(StandardScaler())

# Route each column family to its own pipeline.
preprocessing = ColumnTransformer(
    transformers=[
        ("one_hot", categoriel_pipeline, categoriel_col),
        ("scaling", quantitative_pipeline, quantitative_col),
    ]
)

In [31]:
# Columns that are sent to the "scaling" branch of the preprocessing step.
quantitative_col

['age', 'children']

In [33]:
# Columns that are sent to the "one_hot" branch of the preprocessing step.
categoriel_col

['sex', 'smoker', 'region', 'categorie_bmi']

In [32]:
# Display the categorical pipeline (a single OneHotEncoder step).
categoriel_pipeline

In [51]:
# The tree regressor is not part of the notebook's top import cell, so it is
# imported here.
from sklearn.tree import DecisionTreeRegressor

# Hyper-parameter grid explored by the search. The 'decisiontreeregressor__'
# prefix is the step name that make_pipeline derives from the estimator class.
param_grid = {'decisiontreeregressor__max_depth': [3, 4, 5],
              'decisiontreeregressor__min_samples_leaf': [5, 10, 20, 30, 50,100]}

# Seed the tree so that ties between equally good splits are broken the same
# way on every run; 42 matches the seed already used for train_test_split.
regressor = DecisionTreeRegressor(random_state=42)

# Full model: preprocessing (one-hot encoding + scaling) followed by the tree.
my_pipe_decision_tree = make_pipeline(preprocessing, regressor)

# Exhaustive search over param_grid, scored with 5-fold cross-validation.
grid_search = GridSearchCV(my_pipe_decision_tree, param_grid, cv=5)



In [52]:
# Run the grid search on the training split only; the test split is not used here.
grid_search.fit(X_train,y_train)



In [53]:
# Exploratory: list the attributes available on the fitted search object
# (e.g. best_estimator_, best_params_, best_score_, cv_results_).
dir(grid_search)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 '_validate_params',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'feature_names_in_',
 'fit',
 'get_params',
 'inverse_transform',
 

In [55]:
# Hyper-parameter combination selected by the grid search.
grid_search.best_params_

{'decisiontreeregressor__max_depth': 4,
 'decisiontreeregressor__min_samples_leaf': 10}