In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('insurance_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1250 non-null   int64  
 1   gender    1250 non-null   object 
 2   BMI       1250 non-null   float64
 3   weight    1248 non-null   float64
 4   children  1250 non-null   int64  
 5   smoker    1249 non-null   object 
 6   region    1250 non-null   object 
 7   expenses  1248 non-null   float64
dtypes: float64(3), int64(2), object(3)
memory usage: 78.3+ KB


In [3]:
df.dropna(inplace=True) # drop rows with missing values
df.sample(frac=1).reset_index(drop=True) # shuffle the data
df = pd.get_dummies(df, dtype="float", drop_first=True, columns=['gender', 'region', 'smoker']) # convert categorical variables to dummy variables
df.head()


Unnamed: 0,age,BMI,weight,children,expenses,gender_male,region_northwest,region_southeast,region_southwest,smoker_yes
0,29,27.9,55.651235,0,16884.92,0.0,0.0,0.0,1.0,1.0
1,28,33.8,76.008228,3,1725.55,1.0,0.0,1.0,0.0,0.0
2,38,33.0,58.492328,4,4449.46,1.0,0.0,1.0,0.0,0.0
3,43,22.7,49.77875,3,21984.47,1.0,1.0,0.0,0.0,0.0
4,42,28.9,79.693884,0,3866.86,1.0,1.0,0.0,0.0,0.0


In [4]:
# create a linear regression model
# using the 'smoker_yes', 'age', and 'BMI' columns as independent variables
# and the 'expenses' column as the dependent variable
x = df[['smoker_yes', 'age', 'BMI']]
y = df['expenses']
x = sm.add_constant(x)
model = sm.OLS(y, x)
result = model.fit()
result.summary()

0,1,2,3
Dep. Variable:,expenses,R-squared:,0.748
Model:,OLS,Adj. R-squared:,0.747
Method:,Least Squares,F-statistic:,1227.0
Date:,"Thu, 15 May 2025",Prob (F-statistic):,0.0
Time:,16:12:27,Log-Likelihood:,-12609.0
No. Observations:,1245,AIC:,25230.0
Df Residuals:,1241,BIC:,25250.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-1.426e+04,1024.537,-13.919,0.000,-1.63e+04,-1.23e+04
smoker_yes,2.39e+04,429.326,55.670,0.000,2.31e+04,2.47e+04
age,259.5974,12.339,21.039,0.000,235.390,283.804
BMI,322.2587,28.330,11.375,0.000,266.679,377.839

0,1,2,3
Omnibus:,271.132,Durbin-Watson:,2.075
Prob(Omnibus):,0.0,Jarque-Bera (JB):,617.725
Skew:,1.194,Prob(JB):,7.289999999999999e-135
Kurtosis:,5.49,Cond. No.,355.0


In [5]:
# comments:

# (1) there is a small amount of rows with missing values - they can be dropped

# (2) you may want to make use of https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# read through the function documentation carefully, and use dtype=float for the "dtype" parameter

# (3) perform all your computations (solve the task) before the questions part, in a complete, clear and effective manner

# (4) in the questions part only print answers based on your solution

#### Questions (answer the questions, all computations should precede this part)

#### Question 1

In [6]:
# did you remove any numerical predictor from the data based on multi-collinearity considerations?
# if not - why, if yes - how did you decide on the predictor to remove?
# print a short (one-sentence) answer using the print() command

In [7]:
print("yes we removed the weight predictor because it was highly correlated with the BMI predictor.\nwe decided to remove the weight becasue the R^2 was heigher than if we removed the BMI predictor.\nThey are correlated becasue if a person has higher weight he has a larger BMI")

yes we removed the weight predictor because it was highly correlated with the BMI predictor.
we decided to remove the weight becasue the R^2 was heigher than if we removed the BMI predictor.
They are correlated becasue if a person has higher weight he has a larger BMI


#### Question 2

In [8]:
# what is the amount of money a person is likely to spend on medical expenses with each additional year of age?
# write here the value itself (hardcoded) based on your inspection of the regression summary (after taking care of multi-collinearity)
# display your answer as a dataframe (as in assignment 2)

In [9]:
money_per_year = pd.DataFrame([result.params['age']], index=['money_per_year'], columns=['value'])
print(money_per_year)

                    value
money_per_year  259.59742


#### Question 3

In [10]:
# consider the predictors: age, gender, BMI, weight, children, smoker
# what predictors (out of this list) have significant contribution to predicting medical expenses?

# report only signifnicant predictors sorted by their contribution to the prediction from highest to lowest
# for each predictor specify if it has a positive or a negative effect on the medical expenses

# display your answer as a dataframe with two columns: (1) predictor, (2) effect (positive or negative)
# no need to include the constant (b_0) value

In [72]:
# create a linear regression model

X = df[['smoker_yes', 'age', 'BMI', 'weight', 'gender_male']]# with 'somker_yes', 'age', 'BMI', 'weight', and 'gender_male' as predicators
y = df['expenses']

scaler = StandardScaler()
X_scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled_values, columns=X.columns, index=X.index)

X_scaled = sm.add_constant(X_scaled)

model = sm.OLS(y, X_scaled)
results = model.fit()

coefficients = results.params.drop('const')
p_values = results.pvalues.drop('const')

significant = p_values[p_values < 0.05].index
significant_coefficients = coefficients[significant]

sorted_coefficients = significant_coefficients.abs().sort_values(ascending=False)
sorted_predictors = sorted_coefficients.index

df_result = pd.DataFrame({
    'predictor value': sorted_coefficients,
    'effect': ['positive' if significant_coefficients[p] > 0 else 'negative' for p in sorted_predictors]
})

print(df_result)


            predictor value    effect
smoker_yes      9584.409039  positive
age             3646.441172  positive
BMI             2273.091405  positive


#### Question 4

In [73]:
# compute R-squared for four regression versions:
# (1) including all predictors from the csv file
# (2) including predictors after taking care of the multi-collineraity issue
# (3) (2) above + including only predictors with signficant contribution to the model
# (4) (3) above + after preditor scaling

(1) including all predictors from the csv file

In [74]:

x_all = df[['smoker_yes', 'age', 'BMI', 'weight', 'children', 'gender_male', 'region_northwest', 'region_southeast', 'region_southwest']]
y_all = df['expenses']
x_all = sm.add_constant(x_all)
model_all = sm.OLS(y_all, x_all)
result_all = model_all.fit()
r2all_df = pd.DataFrame({'R_squared': [result_all.rsquared]}, index=['all_predictors'])
print(r2all_df)

                R_squared
all_predictors   0.750133


including predictors after taking care of the multi-collineraity issue

In [75]:
x_no_collin = df[['smoker_yes', 'age', 'BMI', 'children', 'region_northwest', 'region_southeast', 'region_southwest', 'gender_male']]
y_no_collin = df['expenses']
x_no_collin = sm.add_constant(x_no_collin)
model_no_collin = sm.OLS(y_no_collin, x_no_collin)
result_no_collin = model_no_collin.fit()
r2_no_col = pd.DataFrame({'R_squared': [result_no_collin.rsquared]}, index=['no_collinearity'])
print(r2_no_col)

                 R_squared
no_collinearity   0.749585


(3) (2) above + including only predictors with signficant contribution to the model
its the model that we created at the begging

In [76]:
r2_significant = pd.DataFrame({'R_squared': [result_no_collin.rsquared]}, index=['significant_predictors'])
print(r2_significant)

                        R_squared
significant_predictors   0.749585


(4) (3) above + after preditor scaling

In [77]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df)
predicators_scaled = scaler.transform(df) # scale the predictors
df_scaled = pd.DataFrame(predicators_scaled, columns = df.columns)
x_scaled = df_scaled[['smoker_yes', 'age', 'BMI']]
y_scaled = df_scaled['expenses']
x_scaled = sm.add_constant(x_scaled)
model_scaled = sm.OLS(y_scaled, x_scaled)
result_scaled = model_scaled.fit()
r2_scaled = pd.DataFrame({'R_squared': [result_scaled.rsquared]}, index=['scaled_predictors'])
print(r2_scaled)

                   R_squared
scaled_predictors   0.747812


#### Question 5

In [78]:
# what medical expenses may expect a person with the following data?
# age=66, gender=female, BMI=35.4, weight=70.5, children=1, smoker=no, region=southeast

# for this question only, include you computation *in the answer below* using model (3) from Question 4

# !! you may face difficuly adding a constant (sm.add_constant()) to a DataFrame with a single row
# try to search for solution, and in case you need a hint, you may find these links useful - read carefully:
# https://github.com/statsmodels/statsmodels/issues/7057
# https://www.statsmodels.org/0.9.0/generated/statsmodels.tools.tools.add_constant.html
# in this specific case add_constant() has a somewhat unexpected behavior

In [79]:

person_data = pd.DataFrame({
    'smoker_yes': [0],
    'age': [66],
    'BMI': [35.4]
})

person_data = sm.add_constant(person_data, has_constant='add') # add a constant term with had constant = 'add' because we have a single row

predicted_expenses = result.predict(person_data)
print(f"Expected medical expenses: {predicted_expenses[0]:.2f}")

Expected medical expenses: 14280.47
