In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the insurance records from the CSV next to the notebook and print
# a per-column overview (dtype and non-null count) to spot missing values.
df = pd.read_csv(filepath_or_buffer='insurance_data.csv')
df.info()

In [None]:
# comments:
# (1) there is a small number of rows with missing values - they can be dropped
# (2) you may want to make use of https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html
# (3) perform all your computations (solve the task) before the questions part, in a complete, clear and effective manner
# (4) the questions part only print answers based on your solution

### Data Preparation

In [None]:
# Data preparation: drop incomplete rows and encode the two binary categoricals.
# The cell is written so that it can be re-run safely: Series.map() turns every
# value that is absent from the mapping into NaN, so mapping an already-encoded
# 0/1 column a second time would silently wipe the column out.
GENDER_CODES = {'male': 0, 'female': 1}
SMOKER_CODES = {'no': 0, 'yes': 1}


def _encode_binary(column, codes):
    """Return `column` with its labels replaced by the integer codes in `codes`.

    Parameters
    ----------
    column : pd.Series
        Column holding either the raw labels (keys of `codes`) or, if the cell
        has already been executed, the encoded values.
    codes : dict
        Mapping from raw label to numeric code.

    Returns
    -------
    pd.Series
        The encoded column. When every value is already one of the target
        codes the input is returned unchanged instead of being mapped to NaN.
    """
    if column.isin(list(codes.values())).all():
        return column
    return column.map(codes)


# Drop rows with missing values
df = df.dropna()

# Convert categorical variables to numerical; assign() builds a new frame
# instead of writing column by column into the result of dropna().
df = df.assign(
    gender=_encode_binary(df['gender'], GENDER_CODES),
    smoker=_encode_binary(df['smoker'], SMOKER_CODES),
)

### Exploratory Data Analysis

In [None]:
# Pairwise scatter plots (histograms on the diagonal) for the columns of df
sns.pairplot(data=df)

# Set the figure margins by hand so the grid of subplots is not clipped
margins = {'left': 0.1, 'right': 0.9, 'top': 0.9, 'bottom': 0.1}
plt.subplots_adjust(**margins)
plt.show()

In [None]:
# Visualize the data with a correlation matrix.
# Restrict the frame to numeric/bool columns first: starting with pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises if
# a string column is still present. For an all-numeric frame this is a no-op.
numeric_df = df.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_df.corr()  # pairwise correlation between columns

# Annotated heatmap; the diverging palette separates positive from negative values
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
plt.show()

### Model Training

In [None]:
# Target variable
y = df['expenses']

# Predictors used in the regression.
#   - weight is left out because it is strongly correlated with bmi
#   - gender is left out because it was judged a useless predictor
#     (0 lies inside the confidence interval of its coefficient)
predictor_columns = ['age', 'bmi', 'children', 'smoker']

# Design matrix: the selected predictors plus a constant column for the intercept
x = sm.add_constant(df[predictor_columns])

# Fit ordinary least squares on the unscaled predictors
model = sm.OLS(y, x)
result = model.fit()

# Report coefficients, confidence intervals and p-values
print(result.summary())

### Feature Scaling

In [None]:
# Predictors only (no intercept column) - this is what gets standardized
x = df[['age', 'bmi', 'children', 'smoker']]

# StandardScaler learns the mean and std of every column separately and
# rescales each one to mean 0 / std 1; fit and transform are done in one call.
scaler = StandardScaler()
predictors_scaled = scaler.fit_transform(x)

# Wrap the resulting array in a DataFrame again so the column names are kept
df_scaled = pd.DataFrame(predictors_scaled, columns=list(x.columns))

# Sanity check of the scaling: means should be ~0 and stds ~1
df_scaled.describe()

### Model Training (Scaled Features)

In [None]:
# The scaled predictors are rebuilt below with a fresh 0..n-1 index, while y
# still carries the row labels left after dropna(); give y the same index so
# the two line up row by row.
y = y.reset_index(drop=True)

# Standardize the predictors and restore the column names
scaled_values = scaler.fit_transform(x)
scaled_frame = pd.DataFrame(scaled_values, columns=x.columns)

# Design matrix: scaled predictors plus a constant column for the intercept
x_scaled = sm.add_constant(scaled_frame)

# Fit ordinary least squares on the standardized predictors
model = sm.OLS(y, x_scaled)
result = model.fit()

# Report the regression summary for the scaled model
print(result.summary())

#### Questions (answer the questions, all computations should precede this part)

#### Question 1

In [None]:
# did you remove any numerical predictor from the data?
# if no - why, if yes - how did you decide on the predictor to remove?
# print a short (one-sentence) answer using the print() command

### Question 1 - Solution

In [None]:
# One-sentence justification for leaving `weight` out of the predictors
q1_answer = (
    'Removing weight because it is strongly correlated with bmi helps to avoid multicollinearity, '
    'which can inflate the variance of coefficient estimates and make the model less reliable.'
)
print(q1_answer)

#### Question 2

In [None]:
# what is the amount of money a person is likely to spend on medical expenses with each additional year of age?
# write here the value itself (hardcoded) based on your solution above
# display your answer as a dataframe (as in assignment 2)

### Question 2 - Solution

In [None]:
# Hardcoded answer to question 2, shown as a one-row DataFrame.
# The list is stored under its own name so the fitted regression results held
# in `result` are not overwritten by this cell.
# NOTE(review): 258.689 is presumably the `age` coefficient of the unscaled
# model (the scaled model's coefficient is per standard deviation, not per
# year) - verify against the printed summary.
q2_answer = [('The amount of money a person is likely to spend on medical expenses with each additional year of age', 258.689)]
pd.DataFrame(q2_answer)

#### Question 3

In [None]:
# what predictors have a significant contribution to the medical expenses amount?
# report only significant (P<0.05) predictors sorted by their contribution to the prediction from highest to lowest,
# where for each predictor you specify if it has a positive or a negative effect on the medical expenses

# for categorical variables - specify the effect of individual values that appear significant (e.g., "smoker-yes", "smoker-no")

# display your answer as a dataframe with two columns: (1) predictor, (2) effect (positive or negative)
# no need to include the constant (b_0) value

### Question 3 - Solution

In [None]:
# Create a DataFrame for significant predictors and their effects
data = {
    "predictor": ["smoker-yes", "age", "bmi", "children"],
    "effect": ["positive", "positive", "positive", "positive"]
}

pd.DataFrame(data)