In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [None]:
insurance_data = pd.read_csv('insurance.csv')
insurance_data.info()

insurance_data

In [None]:
filled_insurance_data = insurance_data.dropna()
filled_insurance_data['region'] = filled_insurance_data['region'].str.lower()
filled_insurance_data['region'].unique()

In [None]:

gender_mapping = {'F': 'female', 'M':'male', 'woman':'female', 'man': 'male'}
filled_insurance_data['sex'] = filled_insurance_data['sex'].replace(gender_mapping)
filled_insurance_data['sex'].unique()
    

In [None]:
filled_insurance_data['smoker'] = (filled_insurance_data['smoker']=='yes')
filled_insurance_data['smoker'].unique()

In [None]:
filled_insurance_data.sample(10)

In [None]:
filled_insurance_data = filled_insurance_data.apply(lambda x: x.abs() if x.dtype == "float64" else x)
filled_insurance_data.sample(10)
filled_insurance_data.info()

In [None]:
import matplotlib.pyplot as plt

df = filled_insurance_data.copy()

plt.scatter(df['bmi'], df['charges'], alpha=0.5)

In [None]:
df_dummy = pd.get_dummies(df, prefix=['region'], columns=['region'], dtype=int)
df_dummy = df_dummy.drop(columns=['region_northeast'])
df_dummy.sample(10)

In [None]:
df_dummy['smoker'] = df_dummy['smoker'].astype(int)
df_dummy.sample(10)

In [None]:
df_dummy['is_male'] = (df_dummy['sex'] == 'male').astype(int)
df_dummy = df_dummy.drop(columns=['sex'])
df_dummy = df_dummy.dropna(columns=['charges'])
df_dummy.info


In [112]:
df_dummy_new = df_dummy.dropna()
df_dummy_new['charges'] = df_dummy_new['charges'].astype(float)


In [113]:
from sklearn.linear_model import LinearRegression

X = df_dummy_new.drop(columns=['charges'])
y = df_dummy_new['charges']

model = LinearRegression()
model.fit(X, y)

print("Model coefficients:", model.coef_)
print("Model intercept:", model.intercept_)

ValueError: Input y contains NaN.

In [114]:
df_clean = df_dummy_new.dropna(subset=['charges'])
X_clean = df_clean.drop(columns=['charges'])
y_clean = df_clean['charges']

model = LinearRegression()
model.fit(X_clean, y_clean)

In [118]:
from sklearn.metrics import r2_score

y_pred = model.predict(X_clean)
r2 = r2_score(y_clean, y_pred)
print("R^2 score:", r2)

R^2 score: 0.7489986224696537


In [122]:
val_df = pd.read_csv('validation_dataset.csv')
val_df['sex'].unique()
val_df

Unnamed: 0,age,sex,bmi,children,smoker,region
0,18.0,female,24.09,1.0,no,southeast
1,39.0,male,26.41,0.0,yes,northeast
2,27.0,male,29.15,0.0,yes,southeast
3,71.0,male,65.502135,13.0,yes,southeast
4,28.0,male,38.06,0.0,no,southeast
5,70.0,female,72.958351,11.0,yes,southeast
6,29.0,female,32.11,2.0,no,northwest
7,42.0,female,41.325,1.0,no,northeast
8,48.0,female,36.575,0.0,no,northwest
9,63.0,male,33.66,3.0,no,southeast


In [127]:
def preprocess_validation_data(df):
    df = df.dropna()
    df = pd.get_dummies(df, prefix=['region'], columns=['region'], dtype=int)
    df  = df.drop(columns=['region_northeast'])
    df['smoker'] = df['smoker'].map({'yes': 1, 'no': 0})
    df['is_male'] = (df['sex'] == 'male').astype(int)
    df = df.drop(columns=['sex'])
    return df

In [128]:
input_val_df = preprocess_validation_data(val_df)
input_val_df

Unnamed: 0,age,bmi,children,smoker,region_northwest,region_southeast,region_southwest,is_male
0,18.0,24.09,1.0,0,0,1,0,0
1,39.0,26.41,0.0,1,0,0,0,1
2,27.0,29.15,0.0,1,0,1,0,1
3,71.0,65.502135,13.0,1,0,1,0,1
4,28.0,38.06,0.0,0,0,1,0,1
5,70.0,72.958351,11.0,1,0,1,0,0
6,29.0,32.11,2.0,0,1,0,0,0
7,42.0,41.325,1.0,0,0,0,0,0
8,48.0,36.575,0.0,0,1,0,0,0
9,63.0,33.66,3.0,0,0,1,0,1


In [130]:
predictions = model.predict(input_val_df)
predictions

array([ 5.50513496e+02,  3.08000819e+04,  2.77617771e+04,  5.68042804e+04,
        6.98190350e+03,  5.83435823e+04,  7.01010916e+03,  1.32034639e+04,
        1.24921882e+04,  1.59529054e+04,  2.64595339e+03,  1.41932436e+04,
        1.11247510e+04,  1.16848331e+04,  2.76485729e+03,  4.05952458e+03,
        4.24222915e+04,  6.36864941e+04,  5.88359222e+04,  1.11795607e+04,
       -6.29397254e+01,  1.28634908e+04,  3.22082335e+04,  1.19194818e+04,
        9.65826824e+03,  5.22100757e+03,  5.86514038e+04,  3.22952481e+03,
        1.16350881e+04,  1.03896914e+04,  6.40972937e+03,  2.72424305e+04,
        3.07927037e+04,  1.30038945e+04,  3.21634181e+04,  1.38426002e+04,
        5.85584766e+04,  1.42453380e+04, -1.79857428e+01,  2.96731640e+04,
        2.99686042e+04,  1.19415915e+04,  3.68196090e+03,  5.99454203e+04,
        5.91909046e+03,  3.97776638e+04,  6.76712198e+04,  3.08138739e+04,
        1.50012750e+04,  3.53295692e+04])

In [131]:
predictions_new = [x if x > 0 else 1000 for x in predictions ]
predictions_new

[np.float64(550.5134958676481),
 np.float64(30800.08185673449),
 np.float64(27761.777099957675),
 np.float64(56804.28042043944),
 np.float64(6981.903497109652),
 np.float64(58343.5822687128),
 np.float64(7010.109164872745),
 np.float64(13203.463920746002),
 np.float64(12492.188217231607),
 np.float64(15952.905356155265),
 np.float64(2645.9533896343073),
 np.float64(14193.243599994807),
 np.float64(11124.75102549377),
 np.float64(11684.833136162139),
 np.float64(2764.857294548141),
 np.float64(4059.5245802622303),
 np.float64(42422.29149586007),
 np.float64(63686.49405422346),
 np.float64(58835.92223936137),
 np.float64(11179.560731580736),
 1000,
 np.float64(12863.490833511027),
 np.float64(32208.233542156187),
 np.float64(11919.481768443948),
 np.float64(9658.26824316282),
 np.float64(5221.007570341109),
 np.float64(58651.403821253036),
 np.float64(3229.5248107437546),
 np.float64(11635.08814590799),
 np.float64(10389.691448422436),
 np.float64(6409.729371028407),
 np.float64(27242.43

In [132]:
validation_data = input_val_df.copy()
validation_data['predicted_charges'] = predictions_new
validation_data


Unnamed: 0,age,bmi,children,smoker,region_northwest,region_southeast,region_southwest,is_male,predicted_charges
0,18.0,24.09,1.0,0,0,1,0,0,550.513496
1,39.0,26.41,0.0,1,0,0,0,1,30800.081857
2,27.0,29.15,0.0,1,0,1,0,1,27761.7771
3,71.0,65.502135,13.0,1,0,1,0,1,56804.28042
4,28.0,38.06,0.0,0,0,1,0,1,6981.903497
5,70.0,72.958351,11.0,1,0,1,0,0,58343.582269
6,29.0,32.11,2.0,0,1,0,0,0,7010.109165
7,42.0,41.325,1.0,0,0,0,0,0,13203.463921
8,48.0,36.575,0.0,0,1,0,0,0,12492.188217
9,63.0,33.66,3.0,0,0,1,0,1,15952.905356
