## DATA DISCOVERY AND DATA QUALITY

In [None]:
import pandas as pd

# Keep the raw CSV untouched in `insurance_initial`; the rest of the notebook works on the copy.
insurance_initial = pd.read_csv('insurance.csv')
insurance = insurance_initial.copy()
insurance.head()

In [None]:
# Column dtypes and non-null counts: a first check for missing values.
insurance.info()

In [None]:
# Summary statistics of the numeric columns, to spot implausible ranges.
insurance.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# One boxplot per numeric column, laid out on a 2x2 grid.
boxplot_panels = [
    ('age', 'Age'),
    ('bmi', 'BMI'),
    ('children', 'Children'),
    ('charges', 'Charges'),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# axes.flat walks the grid row by row, matching the order of the panels above.
for panel_ax, (column, title) in zip(axes.flat, boxplot_panels):
    sns.boxplot(ax=panel_ax, data=insurance, y=column)
    panel_ax.set_title(title)

# Keep titles and tick labels from overlapping, then render the figure.
plt.tight_layout()
plt.show()

In [None]:
# Number of rows per region (checks how balanced the categories are).
insurance['region'].value_counts()

In [None]:
# Number of rows per smoker status.
insurance['smoker'].value_counts()

In [None]:
# Number of rows per sex.
insurance['sex'].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Distribution of each numeric column, 50 bins per histogram.
insurance.hist(bins=50, figsize=(12, 8))
plt.show()

In [None]:
import seaborn as sns
# Pairwise relationships between the numeric columns.
sns.pairplot(insurance[['age', 'bmi', 'children', 'charges']])
plt.show()

### TRAINING AND TEST SET

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.preprocessing import StandardScaler



In [None]:

# 70/30 split, stratified on BMI cut into 4 equal-width bins so that both
# sets keep a similar BMI distribution.
strat_train_set, strat_test_set = train_test_split(
    insurance,
    test_size=0.3,
    stratify=pd.cut(insurance['bmi'], bins=4),
    random_state=42,
)

# Separate the target from the predictors for the training set.
charges_training = strat_train_set["charges"].copy()
insurance_training = strat_train_set.drop(columns="charges")

In [None]:
# BMI distribution in the full data set (reference for the stratified split).
insurance['bmi'].hist()

In [None]:
# BMI distribution in the training set; compare its shape with the full data set.
insurance_training['bmi'].hist()

In [None]:
# BMI distribution in the test set.
# NOTE(review): `test_bmi` receives the return value of .hist() (the plot's Axes), not the BMI values.
test_bmi = strat_test_set['bmi'].copy().hist()

One-hot encode the categorical features: `region` (first category dropped) and the binary columns `sex` and `smoker`.


In [None]:
# Encoder used for `region` below: one-hot columns with the first category dropped.
region_encoder = OneHotEncoder(drop='first')
# Encoder used for `sex` and `smoker` below: a two-category feature becomes a single 0/1 column.
binary_encoder = OneHotEncoder(drop='if_binary')
# Quick check on the smoker column; the values are reshaped to one column because the encoder expects 2-D input.
res_smoker = binary_encoder.fit_transform(insurance['smoker'].values.reshape(-1,1))
# Convert the encoder output to a dense array for display.
res_smoker.toarray()


Standardize the numerical features (`age`, `bmi`, `children`).

In [None]:
# Stand-alone look at standardization of the numeric training columns.
# The pipelines defined later in this notebook create their own StandardScaler.
ins_num = insurance_training[['age', 'bmi', 'children']]
std_scaler = StandardScaler()
ins_num_std_scaled = std_scaler.fit_transform(ins_num)
ins_num_std_scaled

In [None]:
from sklearn.pipeline import Pipeline, make_pipeline

# Numeric branch: a single standardization step.
num_pipeline = Pipeline(steps=[("standardize", StandardScaler())])

# Categorical branch: one-hot encoding that ignores categories unseen at fit time.
# The step name is the one make_pipeline would generate from the class name.
cat_pipeline = Pipeline(steps=[
    ("onehotencoder", OneHotEncoder(handle_unknown="ignore")),
])

In [None]:
from sklearn.compose import ColumnTransformer

# Route each group of columns to its transformer. The transformer names
# ("num", "reg", "bin") become the prefixes of the output column names.
preprocessing = ColumnTransformer([
    ("num", num_pipeline, ['age', 'bmi', 'children']),
    ("reg", region_encoder, ['region']),
    ("bin", binary_encoder, ['sex','smoker']),
])

In [None]:
# Fit the preprocessing on the training predictors and inspect the transformed matrix.
ins_prepared = preprocessing.fit_transform(insurance_training, charges_training)
ins_prepared

In [None]:
# Names of the transformed columns, in output order.
preprocessing.get_feature_names_out()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Chain preprocessing and linear regression so raw rows can be passed to fit/predict.
lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(insurance_training, charges_training)

In [None]:
# Training-set RMSE of the linear model.
# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so take
# the square root of the MSE explicitly; this works on every version.
ins_predictions = lin_reg.predict(insurance_training)
lin_rmse = mean_squared_error(charges_training, ins_predictions) ** 0.5
lin_rmse


In [None]:
# Fitted coefficients and intercept; 'linearregression' is the step name make_pipeline generated.
print(lin_reg.named_steps['linearregression'].coef_)
print(lin_reg.named_steps['linearregression'].intercept_)


In [None]:
import statsmodels.api as sm

# Fit the same linear regression with statsmodels to get p-values and a full summary.
# The preprocessed matrix is wrapped in a DataFrame carrying the transformed
# column names, so the summary reports real feature names instead of anonymous
# x1, x2, ... labels. The index is copied from the training frame so that it
# lines up with `charges_training`.
X2 = sm.add_constant(
    pd.DataFrame(
        preprocessing.fit_transform(insurance_training, charges_training),
        columns=preprocessing.get_feature_names_out(),
        index=insurance_training.index,
    )
)
model = sm.OLS(charges_training, X2).fit()

# Display the p-values and coefficients
model.use_t = True
print("P-values:")
print(model.pvalues)
print("\nCoefficients:")
print(model.summary())


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Goodness of fit of the linear model on the training set.
r2 = r2_score(charges_training, ins_predictions)
mae = mean_absolute_error(charges_training, ins_predictions)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the explicit square root gives the RMSE on every version.
rmse = mean_squared_error(charges_training, ins_predictions) ** 0.5

print(f"R2 Score: {r2}, Mean Absolute Error: {mae}, Root Mean Squared Error: {rmse}")

In [None]:
import seaborn as sns

# Residuals of the linear model on the training set (actual minus predicted).
residus = charges_training - ins_predictions

# Standardize predictions and residuals; StandardScaler expects 2-D input.
ins_predictions_std = StandardScaler().fit_transform(ins_predictions.reshape(-1, 1))
residus_std = StandardScaler().fit_transform(residus.values.reshape(-1, 1))

# Standardized residuals against standardized fitted values.
sns.scatterplot(x=ins_predictions_std.ravel(), y=residus_std.ravel())
plt.xlabel('Fitted Values')
plt.ylabel('Error')
# Reference lines: zero error (red) and +3 standard deviations (orange).
plt.axhline(y=0, color='red')
plt.axhline(y=3, color='orange')
plt.show()

In [None]:
# Validate the linear model on the held-out test set.
ins_test = strat_test_set.drop("charges", axis=1)
charges_test = strat_test_set["charges"].copy()
ins_predictions_test = lin_reg.predict(ins_test)
# Compute RMSE. `squared=False` was removed from mean_squared_error in
# scikit-learn 1.6, so take the square root of the MSE explicitly.
rmse_test = mean_squared_error(charges_test, ins_predictions_test) ** 0.5
rmse_test

Let's try a decision tree.

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree capped at depth 5 and requiring 20 samples to split a node
# (presumably to limit overfitting), behind the same preprocessing.
tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42, min_samples_split=20, max_depth=5))
tree_reg.fit(insurance_training, charges_training)

In [None]:
# Training-set RMSE of the decision tree.
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6,
# so take the square root of the MSE explicitly.
ins_predictions_tree = tree_reg.predict(insurance_training)
tree_rmse = mean_squared_error(charges_training, ins_predictions_tree) ** 0.5
tree_rmse

In [None]:
from sklearn import tree
# Draw the fitted tree. The transformed column names are passed so each split
# is labelled with a feature name rather than an anonymous column index.
# list(...) because some scikit-learn releases only accept a list here.
fig, ax = plt.subplots(figsize=(12, 12))
tree.plot_tree(
    tree_reg.named_steps['decisiontreeregressor'],
    feature_names=list(tree_reg.named_steps['columntransformer'].get_feature_names_out()),
    ax=ax,
    node_ids=True,
    filled=True,
)
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated RMSE of a random forest. The scorer returns negated
# RMSE, hence the leading minus sign; describe() summarizes the 10 fold scores.
forest_reg = make_pipeline(preprocessing, RandomForestRegressor(random_state=42,max_depth=5,min_samples_split=20))
forest_rmses = -cross_val_score(forest_reg, insurance_training, charges_training, scoring="neg_root_mean_squared_error", cv=10)
pd.Series(forest_rmses).describe()

In [None]:

# Fit the forest on the whole training set and report its in-sample R2
# (computed on the same rows it was trained on, so not a generalization estimate).
forest_reg.fit(insurance_training, charges_training)
ins_predictions_forest = forest_reg.predict(insurance_training)
r2_score(charges_training, ins_predictions_forest)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Named steps so the search can address the forest's hyper-parameters
# through the "random_forest__" prefix.
full_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("random_forest", RandomForestRegressor(random_state=42,max_depth=5,min_samples_split=20)),
])

# max_features cannot usefully exceed the number of columns the preprocessing
# produces. The previous range (2..19) mostly sampled values above that count;
# depending on the scikit-learn version those are rejected or behave like
# "use all features", wasting search iterations. randint's upper bound is
# exclusive, hence the +1. `preprocessing` has already been fitted above.
n_model_features = len(preprocessing.get_feature_names_out())
param_distribs = {
    'random_forest__max_features': randint(low=2, high=n_model_features + 1),
}

rnd_search = RandomizedSearchCV(
    full_pipeline, param_distributions=param_distribs, n_iter=10, cv=5,
    scoring='neg_root_mean_squared_error', random_state=42)

rnd_search.fit(insurance_training, charges_training)


In [None]:
# Best pipeline found by the randomized search.
final_model = rnd_search.best_estimator_ 
# Pair each importance with its transformed column name, most important first.
feature_importances = final_model["random_forest"].feature_importances_
sorted(zip(feature_importances,final_model["preprocessing"].get_feature_names_out()),reverse=True)


In [None]:
# In-sample R2 of the tuned model (training data).
ins_predictions_final = final_model.predict(insurance_training)
r2_score(charges_training, ins_predictions_final)



In [None]:
# Compute the training RMSE of the tuned model. `squared=False` was removed
# from mean_squared_error in scikit-learn 1.6, so take the square root explicitly.
rmse_final = mean_squared_error(charges_training, ins_predictions_final) ** 0.5
rmse_final

In [None]:
import seaborn as sns

# Residuals of the tuned model on the training set (actual minus predicted).
residus = charges_training - ins_predictions_final

# Standardize predictions and residuals; StandardScaler expects 2-D input.
ins_predictions_final_std = StandardScaler().fit_transform(ins_predictions_final.reshape(-1, 1))
residus_std = StandardScaler().fit_transform(residus.values.reshape(-1, 1))

# Standardized residuals against standardized fitted values.
sns.scatterplot(x=ins_predictions_final_std.ravel(), y=residus_std.ravel())
plt.xlabel('Fitted Values')
plt.ylabel('Error')
# Reference lines: zero error (red) and +3 standard deviations (orange).
plt.axhline(y=0, color='red')
plt.axhline(y=3, color='orange')
plt.show()


In [None]:
# Validate the tuned model on the held-out test set.
ins_test = strat_test_set.drop("charges", axis=1)
charges_test = strat_test_set["charges"].copy()
ins_predictions_test = final_model.predict(ins_test)
# Compute RMSE. `squared=False` was removed from mean_squared_error in
# scikit-learn 1.6, so take the square root of the MSE explicitly.
rmse_test = mean_squared_error(charges_test, ins_predictions_test) ** 0.5
rmse_test

In [None]:
# Show the steps and hyper-parameters of the selected pipeline.
print(final_model)

In [None]:
import joblib

# Persist the whole pipeline (preprocessing + forest) so it can be reloaded for prediction.
joblib.dump(final_model, "my_insurance_ml.pkl")

In [None]:
import pandas as pd
# Three hand-made rows using the raw column names; the saved pipeline does the
# encoding and scaling. After preprocessing the columns are:
# 'num__age', 'num__bmi', 'num__children', 'reg__region_northwest',
# 'reg__region_southeast', 'reg__region_southwest', 'bin__sex_male', 'bin__smoker_yes'
data = dict(
    age=[25, 30, 35],
    bmi=[20, 22, 35],
    children=[0, 3, 1],
    region=['northwest', 'southeast', 'southwest'],
    sex=['male', 'female', 'female'],
    smoker=['no', 'yes', 'yes'],
)
df = pd.DataFrame(data)

# Reload the persisted pipeline and predict charges for the new rows.
final_model_reloaded = joblib.load("my_insurance_ml.pkl")
predictions = final_model_reloaded.predict(df)
predictions

In [None]:
#!pip install graphviz

In [None]:
from sklearn.tree import export_graphviz
import graphviz
import os
# Export the first tree of the tuned forest to Graphviz source.
# list(...) because some scikit-learn releases only accept a list for feature_names.
dot_data = export_graphviz(final_model.named_steps['random_forest'].estimators_[0], out_file=None,
                           feature_names=list(final_model.named_steps['preprocessing'].get_feature_names_out()),
                           filled=True)
graph = graphviz.Source(dot_data)
# Make /usr/local/bin visible on PATH before rendering, but only add it once
# so that re-running this cell does not keep growing PATH.
graphviz_bin_dir = '/usr/local/bin'
if graphviz_bin_dir not in os.environ["PATH"].split(os.pathsep):
    os.environ["PATH"] += os.pathsep + graphviz_bin_dir
graph.render("random_forest_tree.dot")

In [None]:
#!pip install yellowbrick
from yellowbrick.model_selection import FeatureImportances



# Use Yellowbrick to visualize feature importances
viz = FeatureImportances(final_model.named_steps['random_forest'], labels=final_model.named_steps['preprocessing'].get_feature_names_out())
# fit() needs the feature matrix the forest works on, i.e. the *transformed*
# training data. Passing the ColumnTransformer object itself was a bug.
viz.fit(final_model.named_steps['preprocessing'].transform(insurance_training), charges_training)
viz.show()