In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

In [2]:
df = pd.read_csv('insurance_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1250 non-null   int64  
 1   gender    1250 non-null   object 
 2   BMI       1250 non-null   float64
 3   weight    1248 non-null   float64
 4   children  1250 non-null   int64  
 5   smoker    1249 non-null   object 
 6   region    1250 non-null   object 
 7   expenses  1248 non-null   float64
dtypes: float64(3), int64(2), object(3)
memory usage: 78.3+ KB


In [3]:
df.dropna(inplace=True)
df.sample(frac=1).reset_index(drop=True) # shuffle the data
df = pd.get_dummies(df, columns=['smoker'], dtype="float", drop_first=True) # smoker is categorical so we need to convert it to a numerical value



In [4]:
df_train = df.iloc[:int(len(df) * 0.8)]
df_test = df.iloc[int(len(df) * 0.8):]
x_train = df_train[['smoker_yes', 'age', 'BMI']]
y_train = df_train['expenses']
x_train = sm.add_constant(x_train)
model = sm.OLS(y_train, x_train)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.751
Model:,OLS,Adj. R-squared:,0.75
Method:,Least Squares,F-statistic:,998.6
Date:,"Thu, 08 May 2025",Prob (F-statistic):,4.3499999999999996e-299
Time:,14:09:33,Log-Likelihood:,-10072.0
No. Observations:,996,AIC:,20150.0
Df Residuals:,992,BIC:,20170.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.482e+04,1131.208,-13.098,0.000,-1.7e+04,-1.26e+04
smoker_yes,2.368e+04,476.436,49.699,0.000,2.27e+04,2.46e+04
age,266.3308,13.499,19.730,0.000,239.841,292.820
BMI,324.9430,31.446,10.333,0.000,263.234,386.652

0,1,2,3
Omnibus:,230.375,Durbin-Watson:,2.045
Prob(Omnibus):,0.0,Jarque-Bera (JB):,543.117
Skew:,1.239,Prob(JB):,1.16e-118
Kurtosis:,5.636,Cond. No.,357.0


In [5]:
# comments:

# (1) there is a small amount of rows with missing values - they can be dropped

# (2) you may want to make use of https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# read through the function documentation carefully, and use dtype=float for the "dtype" parameter

# (3) perform all your computations (solve the task) before the questions part, in a complete, clear and effective manner

# (4) in the questions part only print answers based on your solution

#### Questions (answer the questions, all computations should precede this part)

#### Question 1

In [6]:
# did you remove any numerical predictor from the data based on multi-collinearity considerations?
# if not - why, if yes - how did you decide on the predictor to remove?
# print a short (one-sentence) answer using the print() command

In [7]:
print("yes we removed the weight predictor because it was highly correlated with the BMI predictor.\nwe decided to remove the weight becasue the R^2 was heigher than if we removed the BMI predictor")

yes we removed the weight predictor because it was highly correlated with the BMI predictor.
we decided to remove the weight becasue the R^2 was heigher than if we removed the BMI predictor


#### Question 2

In [8]:
# what is the amount of money a person is likely to spend on medical expenses with each additional year of age?
# write here the value itself (hardcoded) based on your inspection of the regression summary (after taking care of multi-collinearity)
# display your answer as a dataframe (as in assignment 2)

In [9]:
money_per_year = pd.DataFrame([result.params['age']], index=['money_per_year'], columns=['value'])
print(money_per_year)

                     value
money_per_year  266.330834


#### Question 3

In [None]:
# consider the predictors: age, gender, BMI, weight, children, smoker
# what predictors (out of this list) have significant contribution to predicting medical expenses?

# report only signifnicant predictors sorted by their contribution to the prediction from highest to lowest
# for each predictor specify if it has a positive or a negative effect on the medical expenses

# display your answer as a dataframe with two columns: (1) predictor, (2) effect (positive or negative)
# no need to include the constant (b_0) value

In [13]:
x_train_with_weight = df_train[['smoker_yes', 'age', 'BMI', 'weight']]
y_train_with_weight = df_train['expenses']
x_train_with_weight = sm.add_constant(x_train_with_weight)
model_with_weight = sm.OLS(y_train_with_weight, x_train_with_weight)
result_with_weight = model_with_weight.fit()

In [15]:

coefficients = result_with_weight.params
p_values = result_with_weight.pvalues
coefficients = coefficients.drop('const')
sorted_coefficients = coefficients.abs().sort_values(ascending=False)
df_sorted_coefficients = pd.DataFrame({'predictor': sorted_coefficients, 'effect': ['positive' if coefficients[p] > 0 else 'negative' for p in sorted_coefficients.index]})
print(df_sorted_coefficients)

               predictor    effect
smoker_yes  23674.978489  positive
BMI           387.686385  positive
age           266.869516  positive
weight         28.674576  negative


#### Question 4

In [11]:
# compute R-squared for four regression versions:
# (1) including all predictors from the csv file
# (2) including predictors after taking care of the multi-collineraity issue
# (3) (2) above + including only predictors with signficant contribution to the model
# (4) (3) above + after preditor scaling

#### Question 5

In [12]:
# what medical expenses may expect a person with the following data?
# age=66, gender=female, BMI=35.4, weight=70.5, children=1, smoker=no, region=southeast

# for this question only, include you computation *in the answer below* using model (3) from Question 4

# !! you may face difficuly adding a constant (sm.add_constant()) to a DataFrame with a single row
# try to search for solution, and in case you need a hint, you may find these links useful - read carefully:
# https://github.com/statsmodels/statsmodels/issues/7057
# https://www.statsmodels.org/0.9.0/generated/statsmodels.tools.tools.add_constant.html
# in this specific case add_constant() has a somewhat unexpected behavior