In [1]:
# Import the modules
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import hvplot.pandas



In [2]:
#  Import and read the charity_data.csv.
insurance_df = pd.read_csv("./Resources/insurance_dataset.csv")
insurance_df.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,medical_history,family_medical_history,exercise_frequency,occupation,coverage_level,charges
0,46,male,21.45,5,yes,southeast,Diabetes,,Never,Blue collar,Premium,20460.307669
1,25,female,25.38,2,yes,northwest,Diabetes,High blood pressure,Occasionally,White collar,Premium,20390.899218
2,38,male,44.88,2,yes,southwest,,High blood pressure,Occasionally,Blue collar,Premium,20204.476302
3,25,male,19.89,0,no,northwest,,Diabetes,Rarely,White collar,Standard,11789.029843
4,49,male,38.21,3,yes,northwest,Diabetes,High blood pressure,Rarely,White collar,Standard,19268.309838


In [3]:
#Check for null values in the dataframe
insurance_df.isnull().any()

age                       False
gender                    False
bmi                       False
children                  False
smoker                    False
region                    False
medical_history           False
family_medical_history    False
exercise_frequency        False
occupation                False
coverage_level            False
charges                   False
dtype: bool

In [4]:
#Get the summary statistics for numerical columns in the DataFrame 
insurance_df.describe()

Unnamed: 0,age,bmi,children,charges
count,1000000.0,1000000.0,1000000.0,1000000.0
mean,41.495282,34.001839,2.499886,16735.117481
std,13.855189,9.23168,1.707679,4415.808211
min,18.0,18.0,0.0,3445.011643
25%,29.0,26.02,1.0,13600.372379
50%,41.0,34.0,2.0,16622.127973
75%,53.0,41.99,4.0,19781.46541
max,65.0,50.0,5.0,32561.560374


In [5]:
#Check data types 
insurance_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 12 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   age                     1000000 non-null  int64  
 1   gender                  1000000 non-null  object 
 2   bmi                     1000000 non-null  float64
 3   children                1000000 non-null  int64  
 4   smoker                  1000000 non-null  object 
 5   region                  1000000 non-null  object 
 6   medical_history         1000000 non-null  object 
 7   family_medical_history  1000000 non-null  object 
 8   exercise_frequency      1000000 non-null  object 
 9   occupation              1000000 non-null  object 
 10  coverage_level          1000000 non-null  object 
 11  charges                 1000000 non-null  float64
dtypes: float64(2), int64(2), object(8)
memory usage: 91.6+ MB


As seen in Above table, the dataframe consists of 4 Numerical fields and 8 the categorial Fields

In [6]:
# Look at medical_history value counts for binning
insurance_df["medical_history"].value_counts()

None                   250762
Heart disease          250121
High blood pressure    249782
Diabetes               249335
Name: medical_history, dtype: int64

In [7]:
# Look at family_medical_history value counts for binning
insurance_df['family_medical_history'].value_counts()

None                   250404
Heart disease          250035
High blood pressure    249824
Diabetes               249737
Name: family_medical_history, dtype: int64

In [8]:
# Look at exercise_frequency value counts for binning
insurance_df['exercise_frequency'].value_counts()

Rarely          250538
Occasionally    250362
Frequently      249746
Never           249354
Name: exercise_frequency, dtype: int64

In [9]:
# Look at occupation value counts for binning
insurance_df['occupation'].value_counts()

Unemployed      250571
Student         250279
Blue collar     249825
White collar    249325
Name: occupation, dtype: int64

In [10]:
# Look at occupation coverage_level value counts for binning
insurance_df['coverage_level'].value_counts()

Basic       333515
Standard    333508
Premium     332977
Name: coverage_level, dtype: int64

In [11]:
insurance_df['charges'].value_counts()

20460.307669    1
18052.397610    1
12177.344390    1
19065.593605    1
11276.371195    1
               ..
21941.737409    1
16723.873993    1
16466.975023    1
16959.139174    1
23429.725030    1
Name: charges, Length: 1000000, dtype: int64

In [12]:
#Check the columns for dataframe
insurance_df.columns


Index(['age', 'gender', 'bmi', 'children', 'smoker', 'region',
       'medical_history', 'family_medical_history', 'exercise_frequency',
       'occupation', 'coverage_level', 'charges'],
      dtype='object')

In [13]:
#Convert the Categorial values to Numeric values and Concatenate
concatenated_df = pd.get_dummies(insurance_df)
concatenated_df.columns

Index(['age', 'bmi', 'children', 'charges', 'gender_female', 'gender_male',
       'smoker_no', 'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest', 'medical_history_Diabetes',
       'medical_history_Heart disease', 'medical_history_High blood pressure',
       'medical_history_None', 'family_medical_history_Diabetes',
       'family_medical_history_Heart disease',
       'family_medical_history_High blood pressure',
       'family_medical_history_None', 'exercise_frequency_Frequently',
       'exercise_frequency_Never', 'exercise_frequency_Occasionally',
       'exercise_frequency_Rarely', 'occupation_Blue collar',
       'occupation_Student', 'occupation_Unemployed',
       'occupation_White collar', 'coverage_level_Basic',
       'coverage_level_Premium', 'coverage_level_Standard'],
      dtype='object')

In [14]:
concatenated_df.head()

Unnamed: 0,age,bmi,children,charges,gender_female,gender_male,smoker_no,smoker_yes,region_northeast,region_northwest,...,exercise_frequency_Never,exercise_frequency_Occasionally,exercise_frequency_Rarely,occupation_Blue collar,occupation_Student,occupation_Unemployed,occupation_White collar,coverage_level_Basic,coverage_level_Premium,coverage_level_Standard
0,46,21.45,5,20460.307669,0,1,0,1,0,0,...,1,0,0,1,0,0,0,0,1,0
1,25,25.38,2,20390.899218,1,0,0,1,0,1,...,0,1,0,0,0,0,1,0,1,0
2,38,44.88,2,20204.476302,0,1,0,1,0,0,...,0,1,0,1,0,0,0,0,1,0
3,25,19.89,0,11789.029843,0,1,1,0,0,1,...,0,0,1,0,0,0,1,0,0,1
4,49,38.21,3,19268.309838,0,1,0,1,0,1,...,0,0,1,0,0,0,1,0,0,1


In [15]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = concatenated_df['charges']

# Separate the X variable, the features
X = concatenated_df.drop(columns="charges")

In [16]:
#Check columns of Features
X.columns

Index(['age', 'bmi', 'children', 'gender_female', 'gender_male', 'smoker_no',
       'smoker_yes', 'region_northeast', 'region_northwest',
       'region_southeast', 'region_southwest', 'medical_history_Diabetes',
       'medical_history_Heart disease', 'medical_history_High blood pressure',
       'medical_history_None', 'family_medical_history_Diabetes',
       'family_medical_history_Heart disease',
       'family_medical_history_High blood pressure',
       'family_medical_history_None', 'exercise_frequency_Frequently',
       'exercise_frequency_Never', 'exercise_frequency_Occasionally',
       'exercise_frequency_Rarely', 'occupation_Blue collar',
       'occupation_Student', 'occupation_Unemployed',
       'occupation_White collar', 'coverage_level_Basic',
       'coverage_level_Premium', 'coverage_level_Standard'],
      dtype='object')

In [17]:
#Review the dataframe 
X.head()

Unnamed: 0,age,bmi,children,gender_female,gender_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,...,exercise_frequency_Never,exercise_frequency_Occasionally,exercise_frequency_Rarely,occupation_Blue collar,occupation_Student,occupation_Unemployed,occupation_White collar,coverage_level_Basic,coverage_level_Premium,coverage_level_Standard
0,46,21.45,5,0,1,0,1,0,0,1,...,1,0,0,1,0,0,0,0,1,0
1,25,25.38,2,1,0,0,1,0,1,0,...,0,1,0,0,0,0,1,0,1,0
2,38,44.88,2,0,1,0,1,0,0,0,...,0,1,0,1,0,0,0,0,1,0
3,25,19.89,0,0,1,1,0,0,1,0,...,0,0,1,0,0,0,1,0,0,1
4,49,38.21,3,0,1,0,1,0,1,0,...,0,0,1,0,0,0,1,0,0,1


In [18]:
# Import the train_test_learn module
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size= 0.2,
                                                    random_state=1, 
                                                    )
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800000, 30), (200000, 30), (800000,), (200000,))

In [19]:
# Create a Linear Regression model with scikit-learn
model = LinearRegression()
# Fit the data into the model
model.fit(X_train, y_train)
# Make predictions using the X set
predicted_y_values = model.predict(X_test)
predicted_y_values_train  = model.predict(X_train)

print(f"Training Data Score for Linear Regression: {model.score(X_train, y_train)}")
print(f"Testing Data Score for Linear Regression: {model.score(X_test, y_test)}")

reg_mse_test = mean_squared_error(y_test, predicted_y_values)
reg_rmse_test = mean_squared_error(y_test, predicted_y_values, squared=False)

reg_mse_train = mean_squared_error(y_train, predicted_y_values_train)
reg_rmse_train = mean_squared_error(y_train, predicted_y_values_train, squared=False)

reg_r2_score = r2_score(y_test, predicted_y_values)

# Make a prediction using the testing data
results_linear = pd.DataFrame({"Prediction for Linear Regression": predicted_y_values, "Actual for Linear Regression": y_test}).reset_index(drop=True)

print(f'Mean Squared Error for Linear Regression for Testing Values: {reg_mse_test}')
print(f'Root Mean Squared Error for Linear Regression for Testing Values: {reg_rmse_test}')

print(f'Mean Squared Error for Linear Regression for Training Values: {reg_mse_train}')
print(f'Root Mean Squared Error for Linear Regression for Training Values: {reg_rmse_train}')

print(f'R2 score for Linear Regression: {reg_r2_score}')
results_linear.head()


Training Data Score for Linear Regression: 0.9957266917943283
Testing Data Score for Linear Regression: 0.9957191270914253
Mean Squared Error for Linear Regression for Testing Values: 83546.37958252191
Root Mean Squared Error for Linear Regression for Testing Values: 289.0439059771403
Mean Squared Error for Linear Regression for Training Values: 83308.53864396818
Root Mean Squared Error for Linear Regression for Training Values: 288.63218573812617
R2 score for Linear Regression: 0.9957191270914253


Unnamed: 0,Prediction for Linear Regression,Actual for Linear Regression
0,12379.27417,12481.068956
1,18784.150635,18299.071994
2,18862.621338,18846.795608
3,21283.642822,21597.663069
4,25182.140869,25596.721389


As seen above,the difference between Root Mean Squared Error and Mean Squared Error for Training and testing values is low ,it can be concluded that the data is very slightly overfitting.

In [20]:
# Create a Decision Tree Regression model with scikit-learn
model_tree = DecisionTreeRegressor()

# Fit the data into the model
model_tree.fit(X_train, y_train)
# Make predictions using the X set
predicted_y_values_tree = model_tree.predict(X_test)
predicted_y_values_tree_train  = model.predict(X_train)

print(f"Training Data Score for Decision Tree Regression: {model_tree.score(X_train, y_train)}")
print(f"Testing Data Score for Decision Tree Regression: {model_tree.score(X_test, y_test)}")

reg_mse_tree = mean_squared_error(y_test, predicted_y_values_tree)
reg_rmse_tree = mean_squared_error(y_test, predicted_y_values_tree, squared=False)

reg_mse_tree_train = mean_squared_error(y_train, predicted_y_values_tree_train)
reg_rmse_tree_train = mean_squared_error(y_train, predicted_y_values_tree_train, squared=False)

reg_r2_score_tree = r2_score(y_test, predicted_y_values_tree)

# Make a prediction using the testing data
results_tree = pd.DataFrame({"Prediction for Decision Tree Regression": predicted_y_values_tree, "Actual for Decision Tree Regression": y_test}).reset_index(drop=True)

print(f'Mean Squared Error for Decision Tree Regression: {reg_mse_tree}')
print(f'Root Mean Squared Error for Decision Tree Regression: {reg_rmse_tree}')

print(f'Mean Squared Error for Decision Tree Regression for Training Values: {reg_mse_tree_train}')
print(f'Root Mean Squared Error for Decision Tree Regression for Training Values: {reg_rmse_tree_train}')

print(f'R2 score for Decision Tree Regression: {reg_r2_score_tree}')
results_tree.head()


Training Data Score for Decision Tree Regression: 0.999999805257673
Testing Data Score for Decision Tree Regression: 0.9867882042653239
Mean Squared Error for Decision Tree Regression: 257844.0717558009
Root Mean Squared Error for Decision Tree Regression: 507.78348905394796
Mean Squared Error for Decision Tree Regression for Training Values: 83308.53864396818
Root Mean Squared Error for Decision Tree Regression for Training Values: 288.63218573812617
R2 score for Decision Tree Regression: 0.9867882042653239


Unnamed: 0,Prediction for Decision Tree Regression,Actual for Decision Tree Regression
0,12608.414913,12481.068956
1,19359.470966,18299.071994
2,19172.322317,18846.795608
3,21409.863225,21597.663069
4,25362.394352,25596.721389


As seen above,the difference between Root Mean Squared Error and Mean Squared Error for Training and testing values is very high ,it can be concluded that the data is overfitting.

In [21]:
# Create a line plot of the predicted vs actual values for Linear Regression
sampled_df = results_linear.sample(n = 200, random_state = 2)
plot_linear = sampled_df.hvplot.line(
    x = "Prediction for Linear Regression",
    y = "Actual for Linear Regression"
)
plot_linear

AttributeError: 'DataFrame' object has no attribute 'hvplot'

In [None]:
# Create a line plot of the predicted vs actual values for Decision Tree Regression
sampled_df1 = results_tree.sample(n = 200, random_state = 2)
plot_tree = sampled_df1.hvplot.line(
    x = "Prediction for Decision Tree Regression",
    y = "Actual for Decision Tree Regression"
)
plot_tree

In [None]:
plot_tree * plot_linear