### Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import pickle
# Destination path for the pickled pipeline written in the last cell.
filename = 'model.pkl'

# Load the dataset; the relative path is resolved against the working directory.
data = pd.read_csv("insurance.csv")

### Separate features and target

In [2]:
# Features: every column except the target 'charges'.
X = data.drop(columns='charges')
# Target: the 'charges' column.
y = data['charges']

print(X)
print(y)

      age     sex     bmi  children smoker     region
0      19  female  27.900         0    yes  southwest
1      18    male  33.770         1     no  southeast
2      28    male  33.000         3     no  southeast
3      33    male  22.705         0     no  northwest
4      32    male  28.880         0     no  northwest
...   ...     ...     ...       ...    ...        ...
1333   50    male  30.970         3     no  northwest
1334   18  female  31.920         0     no  northeast
1335   18  female  36.850         0     no  southeast
1336   21  female  25.800         0     no  southwest
1337   61  female  29.070         0    yes  northwest

[1338 rows x 6 columns]
0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64


### Process the data

In [3]:
# Columns are grouped by how the transformer below treats them.
cols_ready = ['age', 'bmi', 'children']        # forwarded unchanged
cols_to_process = ['sex', 'smoker', 'region']  # one-hot encoded

# Output is the passthrough columns followed by the one-hot encoded columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', cols_ready),
        ('cat', OneHotEncoder(), cols_to_process),
    ]
)

### Create test and training sets

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Create and fit the random forest pipeline

In [5]:
# Random forest of 100 trees with a fixed seed, trained in parallel (n_jobs=-1).
# oob_score=True additionally requests an out-of-bag score during fitting.
forest = RandomForestRegressor(
    n_estimators=100,
    random_state=50,
    n_jobs=-1,
    oob_score=True,
)

# Chain preprocessing and the model so raw feature rows can be passed directly
# to fit/score on the pipeline.
forestProcessed = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', forest),
    ]
)
forestProcessed.fit(X_train, y_train)

### Evaluate model

In [6]:
# Score the fitted pipeline on the same rows it was trained on.
score_train = forestProcessed.score(X_train, y_train)
print("R-squared train: ", score_train)

R-squared train:  0.9738363430002951


In [7]:
# Score the fitted pipeline on the held-out test rows.
score_test = forestProcessed.score(X_test, y_test)
print("R-squared test: ", score_test)

R-squared test:  0.8826810966788274


In [8]:
# Persist the fitted pipeline (preprocessing + model) to disk.
# The context manager guarantees the file is flushed and closed even if
# pickling raises; the bare open() call previously leaked the handle.
# NOTE: only unpickle this file from a trusted source, since pickle.load
# can execute arbitrary code.
with open(filename, 'wb') as model_file:
    pickle.dump(forestProcessed, model_file)