## Partie modélisation

In [10]:
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.metrics import root_mean_squared_error
from sklearn.feature_selection import r_regression
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [11]:
df = pd.read_csv("../4072eb5e-e963-4a17-a794-3ea028d0a9c4.csv")
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [12]:
features_of_interest = [
    "age",
    "sex",
    "bmi",
    "children",
    "smoker",
    "region"
]

numerical_column = [
    "age",
    "bmi",
    "children"
]

bool_column = [
    "sex",
    "smoker"
]

categorical_column = [
    "region",
]


target_name = "charges"
data, target, numerical_data, bool_data, categorical_data = (
    df[features_of_interest],
    df[target_name],
    df[numerical_column],
    df[bool_column],
    df[categorical_column]
)

### dummy model

In [13]:
dummy_model = DummyRegressor()
_ = dummy_model.fit(data,target)

In [14]:
target_predicted = dummy_model.predict(data)
target_predicted

array([13270.42226514, 13270.42226514, 13270.42226514, ...,
       13270.42226514, 13270.42226514, 13270.42226514], shape=(1338,))

On obtient bien un modèle qui nous prédit pour tout le monde la moyenne des charges...

In [15]:
rmse = root_mean_squared_error(dummy_model.predict(data), target)
rmse

12105.484975561612

Avec un rmse de 12105 ce qui est beaucoup

### première regression linéaire

Les r2 pour nos valeurs numériques :

In [16]:
r_regression(numerical_data, target)

array([0.29900819, 0.19834097, 0.06799823])

In [17]:
preprocessor = ColumnTransformer(
    [("categorical", OneHotEncoder(), categorical_column),
    ("bool", OrdinalEncoder(), bool_column),
    ],
    remainder="passthrough",
)

# normalizer = ColumnTransformer(
#     [('normalize', StandardScaler(), features_of_interest)],
#     remainder="passthrough",
# )

model = make_pipeline(preprocessor, LinearRegression())
model.fit(data,target)



The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [18]:
cv_results = cross_validate(
    model,
    data,
    target,
    cv=10,
    scoring="neg_mean_squared_error",
    return_train_score=True,
    return_estimator=True,
)

In [19]:
model.score(data,target)

0.7509130345985207

In [20]:
import numpy as np

train_error = -cv_results["train_score"]
print(
    np.sqrt(train_error.mean())
)

6039.568676068135


On obtient en moyenne un rmse de 6039 ce qui est toujours mieux que le dummy model mais pas tant...