In [1]:
import pandas as pd 
import numpy as np 
  
from sklearn.compose import ColumnTransformer 
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler 
from sklearn.pipeline import Pipeline 
from sklearn.ensemble import RandomForestRegressor 

In [2]:
train_data = pd.read_csv('Insurance/insurance.csv')

In [3]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [6]:
#introduce ‘impurities’ in this dataset
np.random.seed(0) # for reproducibility 
for _ in range(10): 
    r = np.random.randint(len(train_data)) 
    c = np.random.randint(6) 
    train_data.iloc[r, c] = np.nan 

In [8]:
train_data.isna().sum()

age         2
sex         1
bmi         1
children    3
smoker      1
region      2
charges     0
dtype: int64

In [9]:
X_train, X_test, y_train, y_test = train_test_split(train_data.drop('charges', 1), 
                                                    train_data['charges'], 
                                                    test_size = 0.2, random_state = 0) 

In [10]:
trf1 = ColumnTransformer(transformers =[ 
    ('cat', SimpleImputer(strategy ='most_frequent'), ['sex', 'smoker', 'region']), 
    ('num', SimpleImputer(strategy ='median'), ['age', 'bmi', 'children']), 
      
], remainder ='passthrough') 

In [11]:
first_step = trf1.fit_transform(X_train) 
first_step 

array([['male', 'yes', 'southwest', 37.0, 34.1, 4.0],
       ['male', 'no', 'southeast', 18.0, 34.43, 0.0],
       ['female', 'yes', 'northeast', 23.0, 36.67, 2.0],
       ...,
       ['male', 'no', 'southeast', 40.0, 25.08, 1.0],
       ['male', 'no', 'northwest', 19.0, 35.53, 0.0],
       ['female', 'no', 'southeast', 33.0, 18.5, 1.0]], dtype=object)

In [14]:
pd.DataFrame(first_step).head()

Unnamed: 0,0,1,2,3,4,5
0,male,yes,southwest,37,34.1,4
1,male,no,southeast,18,34.43,0
2,female,yes,northeast,23,36.67,2
3,male,no,southwest,32,35.2,2
4,female,no,northeast,58,32.395,1


In [16]:
trf1.named_transformers_ 

{'cat': SimpleImputer(strategy='most_frequent'),
 'num': SimpleImputer(strategy='median')}

In [17]:
# these were the median values of each of the three numerical columns. 
# for any transformer, you can access its specific attributes this way.
trf1.named_transformers_['num'].statistics_


array([39. , 30.4,  1. ])

In [18]:
#note: OneHotEncoder can’t handle missing values, hence it is important to get rid of them before encoding. Now, we make another transformer object for the encoding. We couldn’t do this in ‘trf1’ because at that point in time, there were missing values in the X_train, and OneHotEncoder can’t deal with missing values as discussed earlier. Hence we first needed to remove the missing values, and then pass this new ‘first_step’ array (with no missing values) to OneHotEncoder.

#We set the sparse parameter to False (because we want a dense array output) and we can toggle between dropping the first of the dummy encoded columns or not, depending upon the type of model we’re fitting, to avoid the ‘dummy variable trap’. Learn more about it here: A general rule of thumb: drop a dummy-encoded column if using a linear-based model, and do not drop it if using a tree-based model. Also, did you see how for the columns parameter, we specified list(range(3)) instead of the column names? That is because now, we’ve lost the column names (as seen in ‘first_step’, but we know the categorical columns are the first three columns (after reordering), hence we specify [0, 1, 2].
trf2 = ColumnTransformer(transformers =[ 
    ('enc', OneHotEncoder(sparse = False, drop ='first'), list(range(3))), 
], remainder ='passthrough') 

In [19]:
pipe = Pipeline(steps =[ 
    ('tf1', trf1), 
    ('tf2', trf2), 
    ('tf3', StandardScaler()), # or StandardScaler, or any other scaler 
    ('model', RandomForestRegressor(n_estimators = 200)), 
# or LinearRegression, SVR, DecisionTreeRegressor, etc 
]) 

In [20]:
cvs = cross_val_score(pipe, X_train, y_train, cv = 5) 
print("All cross val scores:", cvs) 
print("Mean of all scores: ", cvs.mean()) 

All cross val scores: [0.85782059 0.81670185 0.80040809 0.81790831 0.76733147]
Mean of all scores:  0.8120340616469687


In [21]:
pipe.fit(X_train, y_train) 

Pipeline(steps=[('tf1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  ['sex', 'smoker', 'region']),
                                                 ('num',
                                                  SimpleImputer(strategy='median'),
                                                  ['age', 'bmi',
                                                   'children'])])),
                ('tf2',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('enc',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  [0, 1, 2])])),
                ('tf3', StandardScaler()),
        

In [22]:
preds = pipe.predict(X_test) 

In [23]:
pd.DataFrame({'original test set':y_test, 'predictions': preds})

Unnamed: 0,original test set,predictions
578,9724.53000,10648.232822
610,8547.69130,9887.297975
569,45702.02235,44485.007832
1034,12950.07120,13208.181387
198,9644.25250,10054.271969
...,...,...
1084,15019.76005,15878.597914
726,6664.68595,6617.496041
1132,20709.02034,11295.988688
725,40932.42950,43049.301895
