In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the raw insurance records from disk, then list every column's
# dtype and non-null count so missing values are visible up front.
df = pd.read_csv(filepath_or_buffer='insurance_data.csv')
df.info()

In [None]:
# Clean and encode the data used by every model below.

# Drop rows with missing values (assigned back rather than mutated in place).
df = df.dropna()

# Shuffle the rows. DataFrame.sample() returns a NEW frame, so the result has
# to be assigned back to take effect; random_state makes the shuffle
# reproducible across Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Convert the categorical variables to float dummy columns; drop_first=True
# removes one level per variable to avoid the dummy-variable trap.
df = pd.get_dummies(df, dtype="float", drop_first=True, columns=['gender', 'region', 'smoker'])
df.head()


In [None]:
# Baseline linear regression:
#   dependent variable   -> 'expenses'
#   independent variables -> 'smoker_yes', 'age', 'BMI' (plus an intercept)
predictors = ['smoker_yes', 'age', 'BMI']

y = df['expenses']
x = sm.add_constant(df[predictors])  # prepend the intercept column

model = sm.OLS(y, x)
result = model.fit()
result.summary()

In [None]:
# comments:

# (1) there is a small number of rows with missing values - they can be dropped

# (2) you may want to make use of https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# read through the function documentation carefully, and use dtype=float for the "dtype" parameter

# (3) perform all your computations (solve the task) before the questions part, in a complete, clear and effective manner

# (4) in the questions part only print answers based on your solution

#### Questions (answer the questions, all computations should precede this part)

#### Question 1

In [None]:
# did you remove any numerical predictor from the data based on multi-collinearity considerations?
# if not - why, if yes - how did you decide on the predictor to remove?
# print a short (one-sentence) answer using the print() command

In [None]:
# Pearson correlation between the two candidate collinear predictors,
# rounded to two decimals for reporting.
weight_bmi_corr = df['weight'].corr(df['BMI'])
p_corr = round(weight_bmi_corr, 2)

# Emit the answer one sentence per line.
answer_lines = [
    "We removed the weight predictor because it was highly correlated with the BMI predictor.",
    "The decision was based on the fact that the R^2 value was higher when weight was removed compared to removing BMI.",
    "This correlation exists because BMI is calculated using weight.",
    f"The Pearson correlation between weight and BMI is {p_corr}.",
]
print("\n".join(answer_lines))

#### Question 2

In [None]:
# what is the amount of money a person is likely to spend on medical expenses with each additional year of age?
# write here the value itself (hardcoded) based on your inspection of the regression summary (after taking care of multi-collinearity)
# display your answer as a dataframe (as in assignment 2)

In [None]:
# The 'age' coefficient of the fitted model is the expected change in
# expenses for each additional year of age, holding the other predictors fixed.
age_coefficient = result.params['age']
money_per_year = pd.DataFrame({'value': [age_coefficient]}, index=['money_per_year'])
print(money_per_year)

#### Question 3

In [None]:
# consider the predictors: age, gender, BMI, weight, children, smoker
# what predictors (out of this list) have significant contribution to predicting medical expenses?

# report only significant predictors sorted by their contribution to the prediction from highest to lowest
# for each predictor specify if it has a positive or a negative effect on the medical expenses

# display your answer as a dataframe with two columns: (1) predictor, (2) effect (positive or negative)
# no need to include the constant (b_0) value

In [None]:
# Question 3: which of the candidate predictors contribute significantly to
# predicting medical expenses, and in which direction?
#
# Candidate list from the question: age, gender, BMI, weight, children, smoker.
# 'children' must be part of the design matrix, otherwise it can never be
# reported as significant.
X = df[['smoker_yes', 'age', 'BMI', 'weight', 'children', 'gender_male']]
y = df['expenses']

# Standardize the predictors so that coefficient magnitudes are comparable
# across predictors measured in different units.
scaler = StandardScaler()
X_scaled_values = scaler.fit_transform(X)
# Rebuild a DataFrame with the original labels/index so it stays aligned with y.
X_scaled = pd.DataFrame(X_scaled_values, columns=X.columns, index=X.index)
X_scaled = sm.add_constant(X_scaled)

# Fit model
model = sm.OLS(y, X_scaled)
results = model.fit()

# Get coefficients, p-values, and confidence intervals (intercept excluded,
# as the question does not ask for b_0)
coefficients = results.params.drop('const')
p_values = results.pvalues.drop('const')
conf_int = results.conf_int().drop('const')
conf_int.columns = ['CI Lower', 'CI Upper']

# A predictor is significant when its confidence interval does not contain 0
significant = conf_int[(conf_int['CI Lower'] > 0) | (conf_int['CI Upper'] < 0)].index
significant_coefficients = coefficients[significant]

# Rank by absolute standardized coefficient, largest contribution first
sorted_coefficients = significant_coefficients.abs().sort_values(ascending=False)
sorted_predictors = sorted_coefficients.index

# Create results DataFrame (index = predictor name); the sign of the original
# coefficient determines the reported effect direction.
df_result = pd.DataFrame({
    'predictor value': sorted_coefficients,
    'effect': ['positive' if significant_coefficients[p] > 0 else 'negative' for p in sorted_predictors],
    'CI Lower': conf_int.loc[sorted_predictors, 'CI Lower'].values,
    'CI Upper': conf_int.loc[sorted_predictors, 'CI Upper'].values
})

print(df_result)

#### Question 4

In [None]:
# compute R-squared for four regression versions:
# (1) including all predictors from the csv file
# (2) including predictors after taking care of the multi-collinearity issue
# (3) (2) above + including only predictors with significant contribution to the model
# (4) (3) above + after predictor scaling

(1) including all predictors from the csv file

In [None]:

# Model (1): regress expenses on every predictor available after dummy encoding.
all_predictors = [
    'smoker_yes', 'age', 'BMI', 'weight', 'children', 'gender_male',
    'region_northwest', 'region_southeast', 'region_southwest',
]

y_all = df['expenses']
x_all = sm.add_constant(df[all_predictors])  # intercept + all predictors

model_all = sm.OLS(y_all, x_all)
result_all = model_all.fit()

# Report R-squared as a one-row DataFrame.
r2all_df = pd.DataFrame([result_all.rsquared], index=['all_predictors'], columns=['R_squared'])
print(r2all_df)

(2) including predictors after taking care of the multi-collinearity issue

In [None]:
# Model (2): same as model (1) but without 'weight', which is collinear with BMI.
no_collin_predictors = [
    'smoker_yes', 'age', 'BMI', 'children',
    'region_northwest', 'region_southeast', 'region_southwest', 'gender_male',
]

y_no_collin = df['expenses']
x_no_collin = sm.add_constant(df[no_collin_predictors])  # intercept + predictors

model_no_collin = sm.OLS(y_no_collin, x_no_collin)
result_no_collin = model_no_collin.fit()

# Report R-squared as a one-row DataFrame.
r2_no_col = pd.DataFrame([result_no_collin.rsquared], index=['no_collinearity'], columns=['R_squared'])
print(r2_no_col)

(3) (2) above + including only predictors with significant contribution to the model.
This is the model that we created at the beginning.

In [None]:
# Model (3) is the regression fitted at the beginning of the notebook
# (`result`: expenses ~ smoker_yes + age + BMI), so its R-squared must be read
# from `result`, not from model (2) (`result_no_collin`).
r2_significant = pd.DataFrame({'R_squared': [result.rsquared]}, index=['significant_predictors'])
print(r2_significant)

(4) (3) above + after predictor scaling

In [None]:
# Model (4): model (3) refit after standardizing the predictors.
# StandardScaler is already imported in the imports cell at the top.
significant_predictors = ['smoker_yes', 'age', 'BMI']

# Scale ONLY the predictors of this model. The target and the unused columns
# are left untouched; R-squared is unaffected by this rescaling.
scaler = StandardScaler()
x_scaled = pd.DataFrame(
    scaler.fit_transform(df[significant_predictors]),
    columns=significant_predictors,
    index=df.index,  # keep the row labels so x and y stay aligned in sm.OLS
)
x_scaled = sm.add_constant(x_scaled)

# Variable name kept for continuity; the target itself stays in its original units.
y_scaled = df['expenses']

model_scaled = sm.OLS(y_scaled, x_scaled)
result_scaled = model_scaled.fit()
r2_scaled = pd.DataFrame({'R_squared': [result_scaled.rsquared]}, index=['scaled_predictors'])
print(r2_scaled)

#### Question 5

In [None]:
# what medical expenses may expect a person with the following data?
# age=66, gender=female, BMI=35.4, weight=70.5, children=1, smoker=no, region=southeast

# for this question only, include your computation *in the answer below* using model (3) from Question 4

# !! you may face difficulty adding a constant (sm.add_constant()) to a DataFrame with a single row
# try to search for solution, and in case you need a hint, you may find these links useful - read carefully:
# https://github.com/statsmodels/statsmodels/issues/7057
# https://www.statsmodels.org/0.9.0/generated/statsmodels.tools.tools.add_constant.html
# in this specific case add_constant() has a somewhat unexpected behavior

In [None]:

# Single-row design matrix for the person in question, restricted to the
# predictors used by model (3): non-smoker, age 66, BMI 35.4.
person_features = {'smoker_yes': 0, 'age': 66, 'BMI': 35.4}
person_data = pd.DataFrame(person_features, index=[0])

# has_constant='add' forces the intercept column to be added: with a single
# row every column looks constant, so the default would skip adding it.
person_data = sm.add_constant(person_data, has_constant='add')

predicted_expenses = result.predict(person_data)
expected_cost = predicted_expenses[0]
print(f"Expected medical expenses: {expected_cost:.2f}")