In [1]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt 

In [2]:
# Read the insurance dataset from the working directory and preview its first rows.
df = pd.read_csv("insurance.csv")
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [3]:
# Count missing values per column.
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
# Target is the 'charges' column; every other column is a feature.
y = df['charges']
X = df.drop('charges', axis=1)

In [5]:
# Display the feature frame (every column of df except 'charges').
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.900,0,yes,southwest
1,18,male,33.770,1,no,southeast
2,28,male,33.000,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.880,0,no,northwest
...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest
1334,18,female,31.920,0,no,northeast
1335,18,female,36.850,0,no,southeast
1336,21,female,25.800,0,no,southwest


In [6]:
# One-hot encode the nominal columns, dropping the first level of each to avoid
# a redundant (perfectly collinear) indicator.
# dtype=int pins the indicators to 0/1 integers; pandas >= 2.0 would otherwise
# return boolean columns, which no longer match the 0/1 rows shown and typed below.
X = pd.get_dummies(X, columns=['sex', 'region'], drop_first=True, dtype=int)

In [7]:
# Encode smoker as 1 (yes) / 0 (no).
# Map from the untouched column in `df` instead of from X itself: mapping X['smoker']
# in place turns the already-encoded 1/0 values into NaN if this cell is run twice.
# X keeps df's row index, so the assignment aligns row for row.
X['smoker'] = df['smoker'].map({'yes': 1, 'no': 0})

In [8]:
# Display X after the one-hot encoding and the smoker 1/0 mapping.
X

Unnamed: 0,age,bmi,children,smoker,sex_male,region_northwest,region_southeast,region_southwest
0,19,27.900,0,1,0,0,0,1
1,18,33.770,1,0,1,0,1,0
2,28,33.000,3,0,1,0,1,0
3,33,22.705,0,0,1,1,0,0
4,32,28.880,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...
1333,50,30.970,3,0,1,1,0,0
1334,18,31.920,0,0,0,0,0,0
1335,18,36.850,0,0,0,0,1,0
1336,21,25.800,0,0,0,0,0,1


In [9]:
from sklearn.preprocessing import RobustScaler

# Scale the two continuous features with RobustScaler (median / quantile-range based).
# Fit on the raw columns in `df` rather than on X: fitting on X would re-fit `sc` on
# already-scaled values when this cell is re-run, and later `sc.transform` calls on
# raw inputs would then return wrong values.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so
# test-set statistics leak into preprocessing — consider fitting on training rows only.
sc = RobustScaler()
X[['age', 'bmi']] = sc.fit_transform(df[['age', 'bmi']])
X.head()

Unnamed: 0,age,bmi,children,smoker,sex_male,region_northwest,region_southeast,region_southwest
0,-0.833333,-0.297708,0,1,0,0,0,1
1,-0.875,0.40131,1,0,1,0,1,0
2,-0.458333,0.309616,3,0,1,0,1,0
3,-0.25,-0.916344,0,0,1,1,0,0
4,-0.291667,-0.181006,0,0,1,1,0,0


# Train/test split

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [11]:
# Preview the first five training rows.
X_train.head(n=5)

Unnamed: 0,age,bmi,children,smoker,sex_male,region_northwest,region_southeast,region_southwest
621,-0.083333,0.440607,4,1,1,0,0,1
194,-0.875,0.479905,0,0,1,0,1,0
240,-0.666667,0.746651,2,1,0,0,0,0
1168,-0.291667,0.571599,2,0,1,0,0,1
1192,0.791667,0.237571,1,0,0,0,0,0


In [12]:
# Display the held-out test features.
X_test

Unnamed: 0,age,bmi,children,smoker,sex_male,region_northwest,region_southeast,region_southwest
578,0.541667,-0.023817,1,0,1,0,0,1
610,0.333333,-0.122656,1,0,0,0,1,0
569,0.375000,1.210479,2,1,1,1,0,0
1034,0.916667,0.950283,0,0,1,1,0,0
198,0.500000,-1.470676,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...
1084,0.958333,0.011313,2,0,0,1,0,0
726,0.083333,-0.237571,1,0,1,1,0,0
1132,0.750000,1.176541,0,0,1,0,0,0
725,-0.375000,1.030068,3,1,0,0,1,0


# Polynomial regression

In [13]:
from sklearn.preprocessing import PolynomialFeatures 

In [14]:
# Learn the degree-2 feature expansion on the training data only,
# then apply the same expansion to both splits.
poly = PolynomialFeatures(degree=2).fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

In [15]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the polynomial-expanded training features.
lr = LinearRegression()
lr.fit(X_train_poly, y_train)
model = lr  # fit() returns the estimator itself, so both names refer to one object

# prediction

In [16]:
# Predicted charges for the held-out test rows.
y_pred = lr.predict(X_test_poly)
y_pred

array([11557.1971734 , 10313.87152127, 50009.96992384, 15145.51648858,
        7665.81603214,  4815.51448147,  4563.50510619, 14617.85614674,
       10413.91985157,  8498.57025737,  8581.7140014 , 11879.01227986,
        8786.06890979,  5741.90588688, 25086.81610576, 12571.83008489,
       13092.73760628,  5886.37223222,  9092.85391358, 27944.72255242,
       26321.93095449, 14901.53414257, 11445.68047608, 29470.32161267,
        3483.21146586,  7071.95253293,  4091.68612599,  9545.0921474 ,
        5331.8497802 , 11187.10299042, 10663.40430726, 52444.51083561,
       14755.29358634, 11868.89871771, 14317.42510681,  6185.68884333,
       10302.81892513, 37198.12545244, 36528.18367138,  2595.17928458,
        5974.20553104,  5254.05429414, 24770.23432099, 48252.03903556,
       35034.81236865,  6424.34829312, 12561.37453899,  8719.65101027,
        6662.35698651, 13536.99711028,  5442.38703535,  6776.88472345,
       31034.21637042, 47988.73538528, 11620.90352567,  5168.66359086,
      

In [17]:
# Predicted charges for the training rows, used below to compare train vs. test fit.
y_train_pred = lr.predict(X_train_poly)
# `train_acc` is kept as an alias because later cells read it; note that it holds
# predictions, not an accuracy score.
train_acc = y_train_pred
train_acc

array([35443.76273806,  2732.99961518, 40672.92213182, ...,
        7947.29590437,  3699.26687668,  5404.07755322])

# evaluation

In [18]:
from sklearn.metrics import r2_score

# R^2 on the held-out test set.
r2_score(y_true=y_test, y_pred=y_pred)

0.8802620539610428

In [19]:
# R^2 on the training set, for comparison with the test score.
r2_score(y_true=y_train, y_pred=train_acc)

0.8373709632701476

# Prediction for a single real example

In [20]:
# Scale one raw (age, bmi) pair with the already-fitted scaler.
# The scaler was fitted on a DataFrame with named columns, so pass a DataFrame with
# the same names; a bare list triggers scikit-learn's "X does not have valid feature
# names" warning and hides which value is which.
sc.transform(pd.DataFrame([[19, 27.9]], columns=['age', 'bmi']))



array([[-0.83333333, -0.29770765]])

In [21]:
# Predict charges for one person: age 19, bmi 27.9, 0 children, smoker,
# female (sex_male=0), region southwest.
# Scale age/bmi with the fitted scaler instead of pasting rounded values from a
# previous output, and label the row with the training column names so that
# poly.transform receives the features in the order it was fitted on.
raw_age_bmi = pd.DataFrame({'age': [19], 'bmi': [27.9]})
age_scaled, bmi_scaled = sc.transform(raw_age_bmi)[0]
sample = pd.DataFrame(
    [[age_scaled, bmi_scaled, 0, 1, 0, 0, 0, 1]],
    columns=X_train.columns,
)
lr.predict(poly.transform(sample))



array([24521.55059636])

# Compare polynomial degrees 1–4 (test-set R²)

In [22]:
def polynomial_test_r2(degree):
    """Fit a polynomial regression of the given degree and return its test-set R^2.

    Uses the notebook-level X_train / X_test / y_train / y_test. All fitted objects
    are local to this function, so the degree-2 `poly`, `lr`, `X_train_poly`,
    `X_test_poly` and `y_pred` defined earlier are not overwritten by the sweep.
    """
    poly_d = PolynomialFeatures(degree=degree)
    lr_d = LinearRegression().fit(poly_d.fit_transform(X_train), y_train)
    return r2_score(y_test, lr_d.predict(poly_d.transform(X_test)))


# Label each score with its degree so the output can be read on its own.
for degree in range(1, 5):
    print(f"degree={degree}: R^2={polynomial_test_r2(degree)}")

0.7999876970680435
0.8802620539610428
0.8751791257987495
0.850765714634891
