In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from tabulate import tabulate

In [3]:
# Load the dataset
file_path = 'insurance_pre.csv'
dataset = pd.read_csv(file_path)


In [4]:
# Perform one-hot encoding on categorical variables
dataset = pd.get_dummies(dataset, dtype=int, drop_first=True)

# Separate features and target
X = dataset.drop(columns=['charges'])
y = dataset['charges']

In [5]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [6]:
# Function to print results in tabular format with borders
def print_results(table_name, results, best_params, best_score):
    df = pd.DataFrame(results.cv_results_)
    df = df[['params', 'mean_test_score', 'rank_test_score']]
    df['mean_test_score'] = df['mean_test_score'].round(4)
    df = df.rename(columns={'params': 'Hyperparameters', 'mean_test_score': 'Mean R2 Score', 'rank_test_score': 'Rank'})
    df_sorted = df.sort_values(by='Mean R2 Score', ascending=False).reset_index(drop=True)
    print(f"\n{table_name}")
    print(tabulate(df_sorted, headers='keys', tablefmt='grid', showindex=False))
    print(f"\nBest combination for {table_name}:")
    print(f"Parameters: {best_params}")
    print(f"Best Mean R2 Score: {best_score:.4f}")

In [7]:
# Simple Linear Regression
simple_model = LinearRegression()
simple_model.fit(X_train, y_train)
y_pred_simple = simple_model.predict(X_test)
r2_simple = r2_score(y_test, y_pred_simple)
print("\nSimple Linear Regression")
print(f"R2 Score: {r2_simple:.4f}")



Simple Linear Regression
R2 Score: 0.7811


In [8]:
# Support Vector Regression with GridSearchCV
svr_param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1, 10, 100,1000],
    'epsilon': [0.1, 0.2, 0.5,0.10]
}
svr_grid_search = GridSearchCV(SVR(), param_grid=svr_param_grid, cv=5, scoring='r2')
svr_grid_search.fit(X_train, y_train)
print_results("Support Vector Regression", svr_grid_search, svr_grid_search.best_params_, svr_grid_search.best_score_)



Support Vector Regression
+--------------------------------------------------+-----------------+--------+
| Hyperparameters                                  |   Mean R2 Score |   Rank |
| {'C': 1000, 'epsilon': 0.5, 'kernel': 'linear'}  |          0.6331 |      1 |
+--------------------------------------------------+-----------------+--------+
| {'C': 1000, 'epsilon': 0.1, 'kernel': 'linear'}  |          0.6331 |      2 |
+--------------------------------------------------+-----------------+--------+
| {'C': 1000, 'epsilon': 0.2, 'kernel': 'linear'}  |          0.6331 |      4 |
+--------------------------------------------------+-----------------+--------+
| {'C': 1000, 'epsilon': 0.1, 'kernel': 'linear'}  |          0.6331 |      2 |
+--------------------------------------------------+-----------------+--------+
| {'C': 100, 'epsilon': 0.1, 'kernel': 'linear'}   |          0.5019 |      5 |
+--------------------------------------------------+-----------------+--------+
| {'C': 100, 

In [9]:
# Decision Tree Regression with GridSearchCV
dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
}
dt_grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid=dt_param_grid, cv=5, scoring='r2')
dt_grid_search.fit(X_train, y_train)
print_results("Decision Tree Regression", dt_grid_search, dt_grid_search.best_params_, dt_grid_search.best_score_)



Decision Tree Regression
+----------------------------------------------------+-----------------+--------+
| Hyperparameters                                    |   Mean R2 Score |   Rank |
| {'criterion': 'squared_error', 'max_depth': 10}    |          0.7153 |      1 |
+----------------------------------------------------+-----------------+--------+
| {'criterion': 'absolute_error', 'max_depth': 10}   |          0.7149 |      2 |
+----------------------------------------------------+-----------------+--------+
| {'criterion': 'friedman_mse', 'max_depth': 10}     |          0.714  |      3 |
+----------------------------------------------------+-----------------+--------+
| {'criterion': 'poisson', 'max_depth': 10}          |          0.7    |      4 |
+----------------------------------------------------+-----------------+--------+
| {'criterion': 'squared_error', 'max_depth': 30}    |          0.6832 |      5 |
+----------------------------------------------------+-----------------+

In [12]:
# Random Forest Regression with GridSearchCV
rf_param_grid = {
    'n_estimators': [50, 100, 150, 200,400],
    'max_depth': [None, 5,10, 20, 30],
    'random_state':[0,13],
    # 'criterion':['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    'min_samples_split':[0,1,2,3,4,5],
    # 'min_samples_leaf':[0,1,2,3,4,5],
    # 'min_weight_fraction_leaf':[0.1,0.2,0.5,0.10]
}
rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid=rf_param_grid, cv=5, scoring='r2')
rf_grid_search.fit(X_train, y_train)
print_results("Random Forest Regression", rf_grid_search, rf_grid_search.best_params_, rf_grid_search.best_score_)

500 fits failed out of a total of 1500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
250 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python312\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "c:\Python312\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Python312\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils._param_validation.Inval


Random Forest Regression
+--------------------------------------------------------------------------------------+-----------------+--------+
| Hyperparameters                                                                      |   Mean R2 Score |   Rank |
| {'max_depth': 5, 'min_samples_split': 3, 'n_estimators': 100, 'random_state': 0}     |          0.8444 |      1 |
+--------------------------------------------------------------------------------------+-----------------+--------+
| {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 0}     |          0.8443 |      2 |
+--------------------------------------------------------------------------------------+-----------------+--------+
| {'max_depth': 5, 'min_samples_split': 4, 'n_estimators': 100, 'random_state': 0}     |          0.8443 |      3 |
+--------------------------------------------------------------------------------------+-----------------+--------+
| {'max_depth': 5, 'min_samples_split': 5, 'n_