In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score
import itertools
from tabulate import tabulate

In [42]:
# Load the dataset
file_path = 'insurance_pre.csv'
dataset = pd.read_csv(file_path)

# Perform one-hot encoding on categorical variables
dataset = pd.get_dummies(dataset, dtype=int, drop_first=True)


In [43]:
dataset

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes
0,19,27.900,0,16884.92400,0,1
1,18,33.770,1,1725.55230,1,0
2,28,33.000,3,4449.46200,1,0
3,33,22.705,0,21984.47061,1,0
4,32,28.880,0,3866.85520,1,0
...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0
1334,18,31.920,0,2205.98080,0,0
1335,18,36.850,0,1629.83350,0,0
1336,21,25.800,0,2007.94500,0,0


In [44]:
# Separate features and target
X = dataset.drop(columns=['charges'])
y = dataset['charges']

In [45]:
X

Unnamed: 0,age,bmi,children,sex_male,smoker_yes
0,19,27.900,0,0,1
1,18,33.770,1,1,0
2,28,33.000,3,1,0
3,33,22.705,0,1,0
4,32,28.880,0,1,0
...,...,...,...,...,...
1333,50,30.970,3,1,0
1334,18,31.920,0,0,0
1335,18,36.850,0,0,0
1336,21,25.800,0,0,0


In [46]:
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [47]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [54]:
# Function to print results in tabular format with borders
def print_results(table_name, results, best_combination):
    df = pd.DataFrame(results, columns=["S.No", "Model", "Hyperparameter (C/E/Max Depth)", "R2 Value"])
    print(f"\n{table_name}")
    print(tabulate(df, headers='keys', tablefmt='grid', showindex=False))
    print(f"\nBest combination for {table_name}:")
    print(f"{best_combination[1]} = {best_combination[2]}, R2 Value = {best_combination[3]}")


In [55]:
# Simple Linear Regression (Single feature: let's choose 'age' for this example)
simple_results = []
X_train_simple = X_train[['age']]
X_test_simple = X_test[['age']]
simple_model = LinearRegression()
simple_model.fit(X_train_simple, y_train)
y_pred_simple = simple_model.predict(X_test_simple)
r2_simple = r2_score(y_test, y_pred_simple)
simple_results.append((1, 'Simple Linear Regression (age)', '-', r2_simple))

print("\nSimple Linear Regression")
df_simple = pd.DataFrame(simple_results, columns=["S.No", "Algorithm", "Hyperparameter (C/E/Max Depth)", "R2 Value"])
print(tabulate(df_simple, headers='keys', tablefmt='grid', showindex=False))


Simple Linear Regression
+--------+--------------------------------+----------------------------------+------------+
|   S.No | Algorithm                      | Hyperparameter (C/E/Max Depth)   |   R2 Value |
|      1 | Simple Linear Regression (age) | -                                |    0.12409 |
+--------+--------------------------------+----------------------------------+------------+


In [56]:
# Multiple Linear Regression
multiple_results = []
multiple_model = LinearRegression()
multiple_model.fit(X_train, y_train)
y_pred_multiple = multiple_model.predict(X_test)
r2_multiple = r2_score(y_test, y_pred_multiple)
multiple_results.append((1, 'Multiple Linear Regression', '-', r2_multiple))

print("\nMultiple Linear Regression")
df_multiple = pd.DataFrame(multiple_results, columns=["S.No", "Algorithm", "Hyperparameter (C/E/Max Depth)", "R2 Value"])
print(tabulate(df_multiple, headers='keys', tablefmt='grid', showindex=False))


Multiple Linear Regression
+--------+----------------------------+----------------------------------+------------+
|   S.No | Algorithm                  | Hyperparameter (C/E/Max Depth)   |   R2 Value |
|      1 | Multiple Linear Regression | -                                |    0.78113 |
+--------+----------------------------+----------------------------------+------------+


In [57]:
# Support Vector Regression with different kernels and parameters
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
C_values = [0.1, 1, 10, 100]
epsilon_values = [0.1, 0.2, 0.5]

for kernel in kernels:
    svr_results = []
    s_no = 1
    best_svr_combination = (s_no, None, None, float('-inf'))
    for C, epsilon in itertools.product(C_values, epsilon_values):
        svr_model = SVR(kernel=kernel, C=C, epsilon=epsilon)
        svr_model.fit(X_train, y_train)
        y_pred_svr = svr_model.predict(X_test)
        r2_svr = r2_score(y_test, y_pred_svr)
        svr_results.append((s_no, f"SVM ({kernel})", f"C={C}, E={epsilon}", r2_svr))
        if r2_svr > best_svr_combination[3]:
            best_svr_combination = (s_no, "SVM", f"C={C}, E={epsilon}", r2_svr)
        s_no += 1
    print_results(f"Support Vector Regression (Kernel={kernel.capitalize()})", svr_results, best_svr_combination)



Support Vector Regression (Kernel=Linear)
+--------+--------------+----------------------------------+------------+
|   S.No | Model        | Hyperparameter (C/E/Max Depth)   |   R2 Value |
|      1 | SVM (linear) | C=0.1, E=0.1                     | -0.0812882 |
+--------+--------------+----------------------------------+------------+
|      2 | SVM (linear) | C=0.1, E=0.2                     | -0.0812914 |
+--------+--------------+----------------------------------+------------+
|      3 | SVM (linear) | C=0.1, E=0.5                     | -0.0813011 |
+--------+--------------+----------------------------------+------------+
|      4 | SVM (linear) | C=1, E=0.1                       | -0.0699032 |
+--------+--------------+----------------------------------+------------+
|      5 | SVM (linear) | C=1, E=0.2                       | -0.0698964 |
+--------+--------------+----------------------------------+------------+
|      6 | SVM (linear) | C=1, E=0.5                       | -0.06986

In [58]:
# Decision Tree Regression with max_depth parameter
dt_results = []
max_depth_values = [None, 10, 20, 30]
best_dt_combination = (s_no, None, None, float('-inf'))
for max_depth in max_depth_values:
    dt_model = DecisionTreeRegressor(max_depth=max_depth, random_state=42)
    dt_model.fit(X_train, y_train)
    y_pred_dt = dt_model.predict(X_test)
    r2_dt = r2_score(y_test, y_pred_dt)
    dt_results.append((s_no, "Decision Tree", f"Max Depth={max_depth}", r2_dt))
    if r2_dt > best_dt_combination[3]:
        best_dt_combination = (s_no, "Decision Tree", f"Max Depth={max_depth}", r2_dt)
    s_no += 1
print_results(f"Decision Tree Regression", dt_results, best_dt_combination)


Decision Tree Regression
+--------+---------------+----------------------------------+------------+
|   S.No | Model         | Hyperparameter (C/E/Max Depth)   |   R2 Value |
|     13 | Decision Tree | Max Depth=None                   |   0.719445 |
+--------+---------------+----------------------------------+------------+
|     14 | Decision Tree | Max Depth=10                     |   0.715913 |
+--------+---------------+----------------------------------+------------+
|     15 | Decision Tree | Max Depth=20                     |   0.719445 |
+--------+---------------+----------------------------------+------------+
|     16 | Decision Tree | Max Depth=30                     |   0.719445 |
+--------+---------------+----------------------------------+------------+

Best combination for Decision Tree Regression:
Decision Tree = Max Depth=None, R2 Value = 0.7194454629447083


In [61]:
# Random Forest Regression with number of estimators parameter
rf_results = []
n_estimators_values = [50, 100, 150, 200,500]
best_rf_combination = (s_no, None, None, float('-inf'))
for n_estimators in n_estimators_values:
    rf_model = RandomForestRegressor(n_estimators=n_estimators, random_state=42)
    rf_model.fit(X_train, y_train)
    y_pred_rf = rf_model.predict(X_test)
    r2_rf = r2_score(y_test, y_pred_rf)
    rf_results.append((s_no, "Random Forest", f"Estimators={n_estimators}", r2_rf))
    if r2_rf > best_rf_combination[3]:
        best_rf_combination = (s_no, "Random Forest", f"Estimators={n_estimators}", r2_rf)
    s_no += 1
print_results(f"Random Forest Regression", rf_results, best_rf_combination)


Random Forest Regression
+--------+---------------+----------------------------------+------------+
|   S.No | Model         | Hyperparameter (C/E/Max Depth)   |   R2 Value |
|     21 | Random Forest | Estimators=50                    |   0.856542 |
+--------+---------------+----------------------------------+------------+
|     22 | Random Forest | Estimators=100                   |   0.857702 |
+--------+---------------+----------------------------------+------------+
|     23 | Random Forest | Estimators=150                   |   0.857946 |
+--------+---------------+----------------------------------+------------+
|     24 | Random Forest | Estimators=200                   |   0.859015 |
+--------+---------------+----------------------------------+------------+
|     25 | Random Forest | Estimators=500                   |   0.85864  |
+--------+---------------+----------------------------------+------------+

Best combination for Random Forest Regression:
Random Forest = Estimators

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from tabulate import tabulate
import joblib

# Load the dataset
file_path = 'insurance_pre.csv'
dataset = pd.read_csv(file_path)

# Perform one-hot encoding on categorical variables
dataset = pd.get_dummies(dataset, dtype=int, drop_first=True)

# Separate features and target
X = dataset.drop(columns=['charges'])
y = dataset['charges']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to print results in tabular format with borders
def print_results(table_name, results, best_params, best_score):
    df = pd.DataFrame(results.cv_results_)
    df = df[['params', 'mean_test_score', 'rank_test_score']]
    df['mean_test_score'] = df['mean_test_score'].round(4)
    df = df.rename(columns={'params': 'Hyperparameters', 'mean_test_score': 'Mean R2 Score', 'rank_test_score': 'Rank'})
    df_sorted = df.sort_values(by='Mean R2 Score', ascending=False).reset_index(drop=True)
    print(f"\n{table_name}")
    print(tabulate(df_sorted, headers='keys', tablefmt='grid', showindex=False))
    print(f"\nBest combination for {table_name}:")
    print(f"Parameters: {best_params}")
    print(f"Best Mean R2 Score: {best_score:.4f}")

# Dictionary to hold the best models and their scores
best_models = {}

# Simple Linear Regression
simple_model = LinearRegression()
simple_model.fit(X_train, y_train)
y_pred_simple = simple_model.predict(X_test)
r2_simple = r2_score(y_test, y_pred_simple)
print("\nSimple Linear Regression")
print(f"R2 Score: {r2_simple:.4f}")
best_models['Simple Linear Regression'] = (simple_model, r2_simple)

# Support Vector Regression with GridSearchCV
svr_param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'C': [0.1, 1, 10, 100],
    'epsilon': [0.1, 0.2, 0.5]
}
svr_grid_search = GridSearchCV(SVR(), param_grid=svr_param_grid, cv=5, scoring='r2')
svr_grid_search.fit(X_train, y_train)
print_results("Support Vector Regression", svr_grid_search, svr_grid_search.best_params_, svr_grid_search.best_score_)
best_models['Support Vector Regression'] = (svr_grid_search.best_estimator_, svr_grid_search.best_score_)

# Decision Tree Regression with GridSearchCV
dt_param_grid = {
    'max_depth': [None, 10, 20, 30],
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']
}
dt_grid_search = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid=dt_param_grid, cv=5, scoring='r2')
dt_grid_search.fit(X_train, y_train)
print_results("Decision Tree Regression", dt_grid_search, dt_grid_search.best_params_, dt_grid_search.best_score_)
best_models['Decision Tree Regression'] = (dt_grid_search.best_estimator_, dt_grid_search.best_score_)

# Random Forest Regression with GridSearchCV
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
}
rf_grid_search = GridSearchCV(RandomForestRegressor(random_state=42), param_grid=rf_param_grid, cv=5, scoring='r2')
rf_grid_search.fit(X_train, y_train)
print_results("Random Forest Regression", rf_grid_search, rf_grid_search.best_params_, rf_grid_search.best_score_)
best_models['Random Forest Regression'] = (rf_grid_search.best_estimator_, rf_grid_search.best_score_)

# Find the best model based on the R2 score
best_model_name = max(best_models, key=lambda k: best_models[k][1])
best_model, best_model_score = best_models[best_model_name]

print(f"\nThe best model is {best_model_name} with an R2 score of {best_model_score:.4f}")

# Save the best model
joblib.dump(best_model, 'best_model.pkl')
print("Best model saved as 'best_model.pkl'")



Simple Linear Regression
R2 Score: 0.7811

Support Vector Regression
+-------------------------------------------------+-----------------+--------+
| Hyperparameters                                 |   Mean R2 Score |   Rank |
| {'C': 100, 'epsilon': 0.5, 'kernel': 'linear'}  |          0.5019 |      3 |
+-------------------------------------------------+-----------------+--------+
| {'C': 100, 'epsilon': 0.1, 'kernel': 'linear'}  |          0.5019 |      1 |
+-------------------------------------------------+-----------------+--------+
| {'C': 100, 'epsilon': 0.2, 'kernel': 'linear'}  |          0.5019 |      2 |
+-------------------------------------------------+-----------------+--------+
| {'C': 10, 'epsilon': 0.2, 'kernel': 'linear'}   |         -0.0513 |      5 |
+-------------------------------------------------+-----------------+--------+
| {'C': 10, 'epsilon': 0.1, 'kernel': 'linear'}   |         -0.0513 |      4 |
+-------------------------------------------------+----------