# Healthcare Cost Prediction Model for DFW Insurance

### > Please run each section of code in order by pressing Shift + Enter. Start at the top and work your way down.

### 1. Import all necessary libraries

In [79]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


#### 2. Our data is in a file named insurance.csv. Please make sure that it is downloaded and in the same folder as this file. After finishing this step, you should see the first five rows of data.

In [96]:
# Column labels to apply to the file, listed in the file's column order.
names = [
    'Age',
    'Sex',
    'BMI',
    'Children',
    'Smoker',
    'Region(US)',
    'Healthcare Costs',
]

# The labels above are supplied by us, so no row of the file is used as a
# header (header=None is what pandas already does whenever `names` is given).
insurance_data = pd.read_csv('insurance.csv', header=None, names=names)

# Show the first five rows as a quick sanity check of the load.
insurance_data.head()

Unnamed: 0,Age,Sex,BMI,Children,Smoker,Region(US),Healthcare Costs
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


#### 3. Separate the data into features (X) and the target (y), which will later be used for training and testing

In [99]:
# Target: the cost column the model will learn to predict.
y = insurance_data["Healthcare Costs"]

# Features: every other column (a new frame; insurance_data is left untouched).
X = insurance_data.drop(columns="Healthcare Costs")


#### 4. We need to convert our categorical columns into numerical ones, since the model cannot work with categorical values directly. We will use one-hot encoding to turn these categories into numbers.

In [102]:
# Columns that will be one-hot encoded. Note that "Children" is included
# here, so each distinct child count becomes its own indicator column.
person_attr = ["Sex", "Children", "Smoker", "Region(US)"]

# Encoder with default settings.
one_hot = OneHotEncoder()

# Apply the encoder to the columns listed above; every column not listed
# is passed through unchanged (remainder="passthrough").
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, person_attr)],
    remainder="passthrough",
)

#### 5. Now we need to split the data into training and test sets; we will use a test set of 20%. We will also set our random seed in this step.

In [105]:
# Seed NumPy's global random number generator (kept so that any code relying
# on the global generator still starts from a known state).
np.random.seed(42)

# Hold out 20% of the rows for testing. The seed is passed explicitly via
# random_state so the split does not depend on the state of the global
# generator, i.e. it stays the same even if other random calls are run
# between seeding and splitting or the cells are executed out of order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


#### 6. Now we transform our data from categorical to numerical using the transformer that we created in step 4

In [108]:
# Fit the transformer on the training features and encode them in one step.
# This must run first: the second line reuses the transformer fitted here.
X_train_transformed = transformer.fit_transform(X_train)
# Encode the test features with the already-fitted transformer (no re-fitting),
# so the test rows are encoded with what was learned from the training rows.
X_test_transformed = transformer.transform(X_test)

#### 7. Fit and score the model

In [123]:
# Build the random forest with a fixed random_state and train it on the
# encoded training data (fit returns the fitted estimator itself).
model = RandomForestRegressor(random_state=42).fit(X_train_transformed, y_train)

# Evaluate on the held-out test data; for a regressor, score() is R^2.
score = model.score(X_test_transformed, y_test)

# ".2" formats the score to two significant digits.
print(f"Model R^2 Score: {score:.2}")

Model R^2 Score: 0.86


## Prediction for individual users

#### - User will need to input data into the below structure. 

    - Only change data in the brackets, and press shift + enter

    - 'Sex': ['male'], 
    - 'Children': [1], 
    - 'Smoker': ['no'], 
    - 'Region(US)': ['northeast'], 
    - 'Age': [38],
    - 'BMI': [37.05]

An example has been done below for you to view.

#### 1. See below example 

In [146]:
# Example applicant. Only the values inside the brackets should be edited.
userInput = pd.DataFrame({
    'Sex': ['male'],
    'Children': [1],
    'Smoker': ['no'],
    'Region(US)': ['northeast'],
    'Age': [38],
    'BMI': [37.05]
})

# Reorder the columns to match the training features. The fields above are
# typed in a different order than the frame the transformer was fitted on;
# aligning them here means the result does not depend on the transformer
# matching columns by name, and a missing field fails with a clear KeyError.
userInput = userInput[X_train.columns]

# Transform the new user input
newUserInput_transformed = transformer.transform(userInput)

# Predict using the trained model
prediction = model.predict(newUserInput_transformed)
print(f"Predicted Healthcare Costs: {prediction[0]:.2f}")

Predicted Healthcare Costs: 7610.98


#### 2. User sample format, just need to re-enter the data and press shift + enter to rerun

In [157]:
# Applicant to score. Only the values inside the brackets should be edited.
userInput = pd.DataFrame({
    'Sex': ['female'],
    'Children': [0],
    'Smoker': ['yes'],
    'Region(US)': ['southwest'],
    'Age': [19],
    'BMI': [27.900]
})

# Reorder the columns to match the training features. The fields above are
# typed in a different order than the frame the transformer was fitted on;
# aligning them here means the result does not depend on the transformer
# matching columns by name, and a missing field fails with a clear KeyError.
userInput = userInput[X_train.columns]

# Transform the new user input
newUserInput_transformed = transformer.transform(userInput)

# Predict using the trained model
prediction = model.predict(newUserInput_transformed)
print(f"Predicted Healthcare Costs: {prediction[0]:.2f}")

Predicted Healthcare Costs: 17106.20
