In [43]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


# Load the insurance dataset from 'insurance.csv' (path is relative to the
# notebook's working directory) into a DataFrame.
insurance_data = pd.read_csv('insurance.csv')

In [20]:
# Show the loaded frame as the cell's output for a quick visual sanity check.
insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [21]:
# Names of every column that contains at least one null entry.
# The null check runs once over the whole frame and the resulting boolean
# mask selects the matching column labels.
col_with_missing = insurance_data.columns[
    insurance_data.isnull().any().to_numpy()
].tolist()
print(col_with_missing)

[]


In [22]:
# Predictor columns fed to the model; the target column 'charges' is
# deliberately left out.
features = [
    'age',
    'sex',
    'bmi',
    'children',
    'smoker',
    'region',
]

In [23]:
# Design matrix: the selected predictor columns, passed through
# pd.get_dummies so non-numeric columns become indicator columns.
X = pd.get_dummies(insurance_data.loc[:, features])
# Regression target: the 'charges' column.
y = insurance_data.loc[:, 'charges']

In [24]:
# Hold out a validation split (scikit-learn's default train/test proportions).
# random_state is pinned so the shuffle, and every validation metric computed
# from this split, is reproducible after Restart Kernel -> Run All.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)

In [49]:
# Baseline gradient-boosted regressor limited to nine boosting rounds,
# fitted on the training split only.
model = XGBRegressor(n_estimators=9)
model.fit(X=train_X, y=train_y)

In [50]:
# Cross-validation: 5 folds over the full feature matrix and target.
# The 'neg_mean_absolute_error' scorer reports negated MAE, so the sign is
# flipped to obtain positive error values.
score = -cross_val_score(
    model,
    X,
    y,
    cv=5,
    scoring='neg_mean_absolute_error',
)
print('MAE scores:\n', score)

MAE scores:
 [2304.63085924 2730.45122824 2131.53879233 2344.57739495 2551.23895034]


In [51]:
# Report the mean of the per-fold MAE values, label and value on separate lines.
print('Average MAE score (across experimennts):', score.mean(), sep='\n')

Average MAE score (across experimennts):
2412.4874450185043


In [55]:
# Parameter tuning: refit while evaluating on the validation split after every
# boosting round, so the RMSE can be read off per number of trees.
#
# early_stopping_rounds is configured on the estimator instead of being passed
# to fit(): the fit-time keyword is deprecated as of XGBoost 1.6 and removed in
# later releases, whereas the constructor parameter is the supported form.
#
# NOTE: with n_estimators capped at 9, a 5-round patience window may never
# trigger; raise n_estimators if early stopping should pick the best round.
model = XGBRegressor(n_estimators=9, early_stopping_rounds=5)
model.fit(train_X, train_y, eval_set=[(val_X, val_y)])

[0]	validation_0-rmse:12451.66402
[1]	validation_0-rmse:9331.89493
[2]	validation_0-rmse:7273.72052
[3]	validation_0-rmse:6034.09838
[4]	validation_0-rmse:5312.47214
[5]	validation_0-rmse:4873.06357
[6]	validation_0-rmse:4666.76067
[7]	validation_0-rmse:4605.71539
[8]	validation_0-rmse:4574.30676


In [56]:
# Print the XGBRegressor class documentation to look up available
# hyperparameters (reference lookup only; nothing is assigned or modified).
help(XGBRegressor)

Help on class XGBRegressor in module xgboost.sklearn:

class XGBRegressor(XGBModel, sklearn.base.RegressorMixin)
 |  XGBRegressor(*, objective: Union[str, Callable[[numpy.ndarray, numpy.ndarray], Tuple[numpy.ndarray, numpy.ndarray]], NoneType] = 'reg:squarederror', **kwargs: Any) -> None
 |  
 |  Implementation of the scikit-learn API for XGBoost regression.
 |  
 |  
 |  Parameters
 |  ----------
 |  
 |      n_estimators : int
 |          Number of gradient boosted trees.  Equivalent to number of boosting
 |          rounds.
 |  
 |      max_depth :  Optional[int]
 |          Maximum tree depth for base learners.
 |      max_leaves :
 |          Maximum number of leaves; 0 indicates no limit.
 |      max_bin :
 |          If using histogram-based algorithm, maximum number of bins per feature
 |      grow_policy :
 |          Tree growing policy. 0: favor splitting at nodes closest to the node, i.e. grow
 |          depth-wise. 1: favor splitting at nodes with highest loss change.
 | 