In [33]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

from sklearn.pipeline import Pipeline


from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [15]:
# Load the labelled insurance data (features plus the "charges" target).
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine where this file exists at this location.
insurance_csv = r"C:\Hogwarts\machine_learning\Cases\Medical Cost Personal\insurance.csv"
insu = pd.read_csv(filepath_or_buffer=insurance_csv)

In [16]:
# Peek at the last seven rows to check the columns and raw categorical values.
insu.tail(n=7)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1331,23,female,33.4,0,no,southwest,10795.93733
1332,52,female,44.7,3,no,southwest,11411.685
1333,50,male,30.97,3,no,northwest,10600.5483
1334,18,female,31.92,0,no,northeast,2205.9808
1335,18,female,36.85,0,no,southeast,1629.8335
1336,21,female,25.8,0,no,southwest,2007.945
1337,61,female,29.07,0,yes,northwest,29141.3603


In [17]:
# Dummy-encode the categorical columns. drop_first=True removes the first
# level of each one, so a k-level variable becomes k-1 indicator columns.
insu = pd.get_dummies(data=insu, drop_first=True)

In [18]:
# Separate the target column from the predictors.
target_col = "charges"
y = insu[target_col]
X = insu.drop(columns=[target_col])

# Linear Regression

In [30]:
# Baseline: plain linear regression scored with 5-fold cross-validation.
# The folds are shuffled with a fixed seed so the split is repeatable.
kfold = KFold(n_splits=5, shuffle=True, random_state=2022)
lr = LinearRegression()

# No `scoring` is passed, so the estimator's own default scorer is used.
results = cross_val_score(estimator=lr, X=X, y=y, cv=kfold)

In [20]:
# Average the per-fold scores into a single baseline figure.
mean_cv_score = results.mean()
print(mean_cv_score)

0.7440038506879969


# Pipeline

In [34]:
# Same shuffled, seeded 5-fold split as the linear-regression baseline.
kfold = KFold(n_splits=5, shuffle=True, random_state=2022)

# Scaling lives inside the pipeline so it is re-fitted on the training part of
# each fold instead of on the full data set.
scaler = StandardScaler()
knn = KNeighborsRegressor()
pipe = Pipeline(steps=[("STD", scaler), ("KNN", knn)])

# Candidate neighbour counts 1..15; the "KNN__" prefix routes the parameter to
# the pipeline step named "KNN".
neighbor_grid = np.arange(1, 16)
params = {"KNN__n_neighbors": neighbor_grid}

In [35]:
# Search over the neighbour grid, scoring each candidate by cross-validated R^2.
gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring="r2",
    cv=kfold,
)
gcv.fit(X, y)

In [36]:
# Report the winning parameter setting and its mean cross-validated R^2,
# one per line.
print(gcv.best_params_, gcv.best_score_, sep="\n")

{'KNN__n_neighbors': 7}
0.7902611200269143


# Predicting on unlabelled data

In [88]:
# Refit the scaler + KNN pipeline on all labelled data using the tuned k.
# k is read from the grid search rather than copied by hand from its printed
# output, so it cannot go stale if the data or the parameter grid change.
# int(...) converts the NumPy integer from the grid into a plain Python int.
best_k = int(gcv.best_params_["KNN__n_neighbors"])
knn = KNeighborsRegressor(n_neighbors=best_k)
pipe = Pipeline([("STD", scaler), ("KNN", knn)])
pipe.fit(X, y)

Pipeline(steps=[('STD', StandardScaler()),
                ('KNN', KNeighborsRegressor(n_neighbors=7))])

In [103]:
# Load the unlabelled rows to predict on.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs as-is on a machine where this file exists at this location.
unlabelled_csv = r"C:\Hogwarts\machine_learning\Cases\Medical Cost Personal\tst_insure.csv"
tst_insu = pd.read_csv(filepath_or_buffer=unlabelled_csv)

In [104]:
# Encode the unlabelled rows and align them to the training design matrix.
# Encoding this file on its own with drop_first=True only reproduces X's
# columns when the file happens to contain every category level; if a level is
# absent, a different level gets dropped or a column goes missing, and the
# fitted pipeline receives wrong or mismatched features. Instead, create a
# dummy for every level present and keep exactly the training columns, in the
# training order. Columns absent from this file are filled with 0; columns the
# model never saw are discarded.
dum_tst = pd.get_dummies(tst_insu).reindex(columns=X.columns, fill_value=0)

# Sanity check: both frames should list the same columns in the same order.
print(X.dtypes)
print(dum_tst.dtypes)
predictions = pipe.predict(dum_tst)

age                   int64
bmi                 float64
children              int64
sex_male              uint8
smoker_yes            uint8
region_northwest      uint8
region_southeast      uint8
region_southwest      uint8
dtype: object
age                   int64
bmi                 float64
children              int64
sex_male              uint8
smoker_yes            uint8
region_northwest      uint8
region_southeast      uint8
region_southwest      uint8
dtype: object


# Using Grid Search

In [106]:
# Tabulate the grid-search results (one row per candidate setting) for inspection.
pd_cv = pd.DataFrame(gcv.cv_results_)

# GridSearchCV (default refit=True) has already refitted the best pipeline on
# all of X, y, so no manual refit is needed before predicting.
best_model = gcv.best_estimator_

tst_insure = pd.read_csv("Cases/Medical Cost Personal/tst_insure.csv")

# Align the encoded rows to the training columns. Encoding this file on its
# own with drop_first=True only matches X when the file contains every
# category level; otherwise the wrong level is dropped or a column is missing.
# Create a dummy for every level present, then keep exactly the training
# columns in training order (absent ones filled with 0, unseen ones discarded).
dum_tst = pd.get_dummies(tst_insure).reindex(columns=X.columns, fill_value=0)
predictions = best_model.predict(dum_tst)

drop_first = True

The `drop_first` parameter specifies whether to drop the first category of each categorical variable being encoded. By default it is set to `drop_first=False`, which causes `get_dummies` to create one dummy variable for every level of the input categorical variable. With `drop_first=True`, as used in this notebook, the first level is dropped, so a variable with k levels produces k-1 dummy columns.

In [110]:
# First five raw rows of the unlabelled file, before encoding.
tst_insure.head(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,37,female,23.37,2,no,northwest


In [111]:
# First seven rows after dummy encoding, for comparison with the raw rows.
dum_tst.head(n=7)

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,0,1,0,0,1
1,18,33.77,1,1,0,0,1,0
2,28,33.0,3,1,0,0,1,0
3,33,22.705,0,1,0,1,0,0
4,37,23.37,2,0,0,1,0,0
5,44,37.1,2,1,0,0,0,1
6,18,23.75,0,1,0,0,0,0


In [107]:
# Show the values predicted for the unlabelled rows.
print(predictions)

[24411.94929     5270.08306429  4680.98034286  6808.30289429
  6808.34575     7670.527       1902.37384286  2453.48372857
 34845.44309286 10229.51452143  3275.05149286 24411.94929
 10304.84671429 22843.14585714  6653.04271429 12392.16091429
 12966.40537857 15364.97829     1921.51835     5741.935917
  1930.68848571  4496.99419     1921.51835    15434.37549143
  3363.35190429 12532.92425857  6900.08065714 22101.33436714
  6251.48125    28107.11708429  5386.91035714 39300.07997286
 11911.19909429 12579.67512714  1760.93657143  5274.64907143
 12669.75224143 11614.55771143 20496.56267857  8816.08189143
 11979.46763714 29567.18479429 21407.89491143 34833.36007143
 14687.10969143 27513.17474286 34469.17022857 10360.09185714
  7323.58244714  4683.26036429  9444.90332143  6725.23610429
  7400.81874286 11273.45092857 15676.58758     2374.16768571
  8295.42352857 15306.54476714 36306.90228714  8304.37471429
 11517.8622     13324.86568143 29108.33957143 12765.53671429
  6500.90299286  3938.1470142