In [23]:
import pandas as pd

# Load the dataset
data = pd.read_csv('insurance_pre.csv')

# Inspect the dataset
print(data.head())
print(data.info())


   age     sex     bmi  children smoker      charges
0   19  female  27.900         0    yes  16884.92400
1   18    male  33.770         1     no   1725.55230
2   28    male  33.000         3     no   4449.46200
3   33    male  22.705         0     no  21984.47061
4   32    male  28.880         0     no   3866.85520
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(2)
memory usage: 62.8+ KB
None


In [24]:
from sklearn.preprocessing import StandardScaler

# One-hot encode categorical variables
data = pd.get_dummies(data, drop_first=True)

# Standardize the numerical features
scaler = StandardScaler()
numerical_features = ['age', 'bmi', 'children']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

# Separate features and target
X = data.drop('charges', axis=1)
y = data['charges']

In [25]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the models and their hyperparameters for GridSearchCV
models = {
    'Linear Regression': {
        'model': LinearRegression(),
        'params': {}
    },
    'Support Vector Regression (SVR)': {
        'model': SVR(),
        'params': {
            'C': [0.1, 1, 10],
            'gamma': [0.01, 0.1, 1],
            'kernel': ['rbf', 'linear']
        }
    },
    'Decision Tree Regression': {
        'model': DecisionTreeRegressor(),
        'params': {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10]
        }
    },
    'Random Forest Regression': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10]
        }
    }
}

# Function to perform GridSearchCV and return the best model and its R2 score
def perform_grid_search(models, X_train, y_train, X_test, y_test):
    results = {}
    for name, model_info in models.items():
        clf = GridSearchCV(model_info['model'], model_info['params'], cv=5, scoring='r2', n_jobs=-1)
        clf.fit(X_train, y_train)
        y_pred = clf.best_estimator_.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        results[name] = r2
        print(f"Best parameters for {name}: {clf.best_params_}")
        print(f"R² score for {name}: {r2}")
    return results

# Perform grid search for all models and get their R2 scores
results = perform_grid_search(models, X_train, y_train, X_test, y_test)

# Print the results in a table format
results_df = pd.DataFrame(list(results.items()), columns=['Model', 'R² Score'])
print(results_df)


Best parameters for Linear Regression: {}
R² score for Linear Regression: 0.7811302113434095
Best parameters for Support Vector Regression (SVR): {'C': 10, 'gamma': 0.01, 'kernel': 'linear'}
R² score for Support Vector Regression (SVR): 0.04845564888908371
Best parameters for Decision Tree Regression: {'max_depth': 10, 'min_samples_split': 10}
R² score for Decision Tree Regression: 0.7841900691143723
Best parameters for Random Forest Regression: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 50}
R² score for Random Forest Regression: 0.8643858349688659
                             Model  R² Score
0                Linear Regression  0.781130
1  Support Vector Regression (SVR)  0.048456
2         Decision Tree Regression  0.784190
3         Random Forest Regression  0.864386
