In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from tabulate import tabulate
# Load dataset
data = pd.read_csv('insurance_data.csv')
data = pd.get_dummies(data, dtype=int, drop_first=True)
# Preprocess data (Assuming 'charges' is the target variable)
X = data.drop('charges', axis=1)
y = data['charges']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define a function for grid search and evaluation
def grid_search_and_evaluate(model, param_grid):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test_scaled)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'Mean Squared Error: {mse}')
    print(f'R² Score: {r2}')
    print(f'Best Parameters: {grid_search.best_params_}')
    return best_model, r2

# Store R² scores
r2_scores = {}

# AdaBoost
ada_param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0]
}
print("AdaBoost Regressor:")
ada_model = AdaBoostRegressor(random_state=42)
best_ada_model, ada_r2 = grid_search_and_evaluate(ada_model, ada_param_grid)
r2_scores['AdaBoost'] = ada_r2



AdaBoost Regressor:
Mean Squared Error: 22026496.07917349
R² Score: 0.8581212492525863
Best Parameters: {'learning_rate': 0.01, 'n_estimators': 50}


In [47]:
# Gradient Boosting
gbr_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.05],
    'max_depth': [3, 4, 5]
}
print("\nGradient Boosting Regressor:")
gbr_model = GradientBoostingRegressor(random_state=42)
best_gbr_model, gbr_r2 = grid_search_and_evaluate(gbr_model, gbr_param_grid)
r2_scores['Gradient Boosting'] = gbr_r2


Gradient Boosting Regressor:
Mean Squared Error: 18762442.407698467
R² Score: 0.8791459213391855
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}


In [48]:
# XGBoost
xgb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.05],
    'max_depth': [3, 4, 5]
}
print("\nXGBoost Regressor:")
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
best_xgb_model, xgb_r2 = grid_search_and_evaluate(xgb_model, xgb_param_grid)
r2_scores['XGBoost'] = xgb_r2



XGBoost Regressor:
Mean Squared Error: 18052212.982787546
R² Score: 0.8837207054168805
Best Parameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100}


In [49]:
# LightGBM
lgb_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.05],
    'num_leaves': [31, 40, 50]
}
print("\nLightGBM Regressor:")
lgb_model = lgb.LGBMRegressor(random_state=42)
best_lgb_model, lgb_r2 = grid_search_and_evaluate(lgb_model, lgb_param_grid)
r2_scores['LightGBM'] = lgb_r2


LightGBM Regressor:
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000102 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 316
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 5
[LightGBM] [Info] Start training from score 13346.089733
Mean Squared Error: 20046975.670354202
R² Score: 0.8708718874690679
Best Parameters: {'learning_rate': 0.05, 'n_estimators': 100, 'num_leaves': 31}


In [50]:
# Display the R² scores in a table format
r2_df = pd.DataFrame.from_dict(r2_scores, orient='index', columns=['R² Score'])
r2_df.index.name = 'Algorithm'

# Display the table with borders using tabulate
table = tabulate(r2_df, headers='keys', tablefmt='grid')
print("\nR² Scores Table:")
print(table)


R² Scores Table:
+-------------------+------------+
| Algorithm         |   R² Score |
| AdaBoost          |   0.858121 |
+-------------------+------------+
| Gradient Boosting |   0.879146 |
+-------------------+------------+
| XGBoost           |   0.883721 |
+-------------------+------------+
| LightGBM          |   0.870872 |
+-------------------+------------+


In [51]:
# Identify the best model
best_algorithm = r2_df['R² Score'].idxmax()
best_r2_score = r2_df['R² Score'].max()
print(f"\nBest Algorithm: {best_algorithm} with R² Score: {best_r2_score}")


Best Algorithm: XGBoost with R² Score: 0.8837207054168805


In [52]:
import pickle
# Save the best model and scaler using pickle
if best_algorithm == 'AdaBoost':
    best_model = best_ada_model
elif best_algorithm == 'Gradient Boosting':
    best_model = best_gbr_model
elif best_algorithm == 'XGBoost':
    best_model = best_xgb_model
elif best_algorithm == 'LightGBM':
    best_model = best_lgb_model

with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)

with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)

print("\nBest model and scaler saved!")


Best model and scaler saved!


In [53]:
# Function to preprocess new input data
def preprocess_input(new_data):
    print("\nOriginal Input Data:")
    print(new_data)

    new_data = pd.get_dummies(new_data, dtype=int, drop_first=True)
    print("\nInput Data after Applying get_dummies:")
    print(new_data)

    missing_cols = set(X.columns) - set(new_data.columns)
    for col in missing_cols:
        new_data[col] = 0
    print("\nInput Data after Adding Missing Columns:")
    print(new_data)

    new_data = new_data[X.columns]
    print("\nInput Data Aligned with Training Data Columns:")
    print(new_data)

    with open('scaler.pkl', 'rb') as f:
        scaler = pickle.load(f)
    new_data_scaled = scaler.transform(new_data)
    print("\nInput Data after Standardization:")
    print(new_data_scaled)

    return new_data, new_data_scaled

# Function to load the model and make predictions
def predict(new_data):
    with open('best_model.pkl', 'rb') as f:
        model = pickle.load(f)
    new_data_original, new_data_processed = preprocess_input(new_data)
    predictions = model.predict(new_data_processed)
    return new_data_original, new_data_processed, predictions

In [54]:
# Example usage of predict function
new_data = pd.DataFrame([X_test.iloc[0]])  # Example input
predictions = predict(new_data)
print(f"\nPredictions for the new input data: {predictions}")


Original Input Data:
      age     bmi  children  sex_male  smoker_yes
764  45.0  25.175       2.0       0.0         0.0

Input Data after Applying get_dummies:
      age     bmi  children  sex_male  smoker_yes
764  45.0  25.175       2.0       0.0         0.0

Input Data after Adding Missing Columns:
      age     bmi  children  sex_male  smoker_yes
764  45.0  25.175       2.0       0.0         0.0

Input Data Aligned with Training Data Columns:
      age     bmi  children  sex_male  smoker_yes
764  45.0  25.175       2.0       0.0         0.0

Input Data after Standardization:
[[ 0.40114007 -0.89153925  0.73433626 -1.0246016  -0.50874702]]

Predictions for the new input data: (      age     bmi  children  sex_male  smoker_yes
764  45.0  25.175       2.0       0.0         0.0, array([[ 0.40114007, -0.89153925,  0.73433626, -1.0246016 , -0.50874702]]), array([10043.34], dtype=float32))


In [55]:
# Custom input data (example format)
custom_data = {
    'age': [19],
    'sex_male': [1],
    'bmi': [27.9],
    'children': [0],
    'smoker_yes': [1]
}

# Convert custom input data to DataFrame
new_data = pd.DataFrame(custom_data)

# Make predictions
new_data_original, new_data_processed, predictions = predict(new_data)

# Display results
print("\nInput Data for Predictions:")
print(new_data_original)
print("\nPreprocessed Input Data for Predictions:")
print(new_data_processed)
print("\nPredictions for the custom input data:")
print(predictions)


Original Input Data:
   age  sex_male   bmi  children  smoker_yes
0   19         1  27.9         0           1

Input Data after Applying get_dummies:
   age  sex_male   bmi  children  smoker_yes
0   19         1  27.9         0           1

Input Data after Adding Missing Columns:
   age  sex_male   bmi  children  smoker_yes
0   19         1  27.9         0           1

Input Data Aligned with Training Data Columns:
   age   bmi  children  sex_male  smoker_yes
0   19  27.9         0         1           1

Input Data after Standardization:
[[-1.44710717 -0.44042221 -0.91119211  0.97598911  1.96561348]]

Input Data for Predictions:
   age   bmi  children  sex_male  smoker_yes
0   19  27.9         0         1           1

Preprocessed Input Data for Predictions:
[[-1.44710717 -0.44042221 -0.91119211  0.97598911  1.96561348]]

Predictions for the custom input data:
[17789.484]
