In [1]:
## Import the necessary libraries
import sqlite3
import pandas as pd
import os
import time
import numpy as np

In [5]:
# sqlite3.connect() silently creates a new, empty database file when the path does not
# exist, which only shows up later as "no such table". Fail fast with a clear message.
DB_PATH = "insurance.db"
if not os.path.exists(DB_PATH):
    raise FileNotFoundError(
        f"SQLite database not found at {os.path.abspath(DB_PATH)!r}; "
        "check the notebook's working directory."
    )

conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

In [6]:
# Smoke test: run the query through the raw cursor to confirm the table is reachable.
# No rows are fetched here.
probe_query = 'SELECT * FROM combined_data'
cursor.execute(probe_query)

OperationalError: no such table: combined_data

In [3]:
# Time the full-table load with a monotonic clock (time.time() can jump if the system
# clock is adjusted) and always release the connection, even if the query fails.
start = time.perf_counter()
try:
    df = pd.read_sql("SELECT * FROM combined_data", conn)
    end = time.perf_counter()
finally:
    conn.close()

DatabaseError: Execution failed on sql 'SELECT * FROM combined_data': no such table: combined_data

In [None]:
# Report the size of the loaded frame and the elapsed query time, then preview the first rows.
n_rows, n_cols = df.shape
elapsed = end - start
print(f"\nQuery returned {n_rows:,} rows x {n_cols} columns in {elapsed:.2f} seconds")
df.head()

In [None]:
# Parse both date columns; values that cannot be parsed become NaT (errors='coerce').
for date_col in ('CUST_ORIG_DATE', 'ACCT_SUSPD_DATE'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

In [None]:
## Logic applied here: if there is a termination date, then we know there is churn happening (1);
## rows without a termination date are encoded as 0 (no churn recorded).
## 0 is used instead of NaN so that 'Churn' is a complete binary column: it is later used as a
## predictor and is never imputed, so NaN here would put missing values into the model matrix.
df['Churn'] = df['ACCT_SUSPD_DATE'].notna().astype(int)

In [None]:
# DataFrame.info() prints its report directly and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()

In [None]:
# Count how often each INDIVIDUAL_ID occurs and keep only the IDs that occur more than once.
quantity = df['INDIVIDUAL_ID'].value_counts()
duplicates_in_name = quantity[quantity > 1]
print("Table of 'Individual_ID' duplicate column:\n", duplicates_in_name)

# keep=False flags every row that shares its ID with another row (not just the repeats).
# Summing the boolean mask counts those rows without copying them into a new frame.
duplicate_rows_count = int(df.duplicated(subset=["INDIVIDUAL_ID"], keep=False).sum())
print("Number of duplicated customers: ", duplicate_rows_count)

The cell above shows that there are no duplicates in the INDIVIDUAL_ID column, so it can safely be used as the unique customer key (the source of truth) for all of our modeling.

In [None]:
# Frequency of each ADDRESS_ID, restricted to the addresses that appear on more than one row.
quantity = df['ADDRESS_ID'].value_counts()
duplicates_in_name = quantity.loc[quantity.gt(1)]
print("Table of 'ADDRESS_ID' duplicate column:\n", duplicates_in_name)

# keep=False marks every row whose address is shared with at least one other row.
shared_address_mask = df.duplicated(subset=["ADDRESS_ID"], keep=False)
duplicate_rows_count = df.loc[shared_address_mask].shape[0]
print("Number of duplicated addresses: ", duplicate_rows_count)

As we can see, there are many duplicated addresses.

In [None]:
# Per-column count and percentage of missing values in df.
na_counts = df.isna().sum()
missing_summary = pd.DataFrame({
    'Missing Values': na_counts,
    'Percentage': (na_counts / len(df)) * 100,
})
print(missing_summary)

Notes on missing data:
- AGE_IN_YEARS would be expected to have 0 missing values since it comes from the customer table (the primary table), but there are nulls in there anyway, despite a COALESCE with the autoinsurance_churn file.
- ACCT_SUSPD_DATE is sparsely populated because the termination file contains little data, but that predictor will not be used in modeling anyway.

In [None]:
# Helper for parsing HOME_MARKET_VALUE entries: a "low - high" range is replaced by its midpoint
def parse_home_value(val):
    """Convert one raw home-value entry into a float.

    Parameters
    ----------
    val : object
        Raw cell value: a number, a numeric string, a range string of the form
        "low - high", the marker "N/A" (any letter case), or a missing value.

    Returns
    -------
    float
        The midpoint ``(low + high) / 2`` for a range, the parsed number for a
        single value, or ``np.nan`` when the value is missing, is "N/A", or
        cannot be parsed as a number or a two-sided numeric range.
    """
    if pd.isna(val):
        return np.nan
    text = str(val).strip()
    if text.upper() == "N/A":
        return np.nan
    try:
        if " - " in text:
            # Unpacking raises ValueError if there are not exactly two parts, and float()
            # raises ValueError for non-numeric bounds; both are treated as unparseable.
            lo, hi = text.split(" - ")
            return (float(lo) + float(hi)) / 2
        return float(text)
    except ValueError:
        return np.nan

In [None]:
## Now let's get into the feature engineering part
# Drop the rows without curr_ann_amt or without individual_id (only 1 row for the latter, for some reason).
# With the default how='any', a row is dropped if either column is missing.
df = df.dropna(subset=['CURR_ANN_AMT', 'INDIVIDUAL_ID'])

# Convert the N/A markers and range strings in home_market_value to numbers,
# then fill the remaining gaps with the overall median.
df["HOME_MARKET_VALUE"] = df["HOME_MARKET_VALUE"].apply(parse_home_value)
overall_home_median = df["HOME_MARKET_VALUE"].median()
# Show the median value itself (the previous version printed only its type).
print("Overall Home Median :", overall_home_median)
df["HOME_MARKET_VALUE"] = df["HOME_MARKET_VALUE"].fillna(overall_home_median)

print(f"\nQuery returned {df.shape[0]:,} rows x {df.shape[1]} columns")
df.head()

In [None]:
# Recompute the per-column missing-value report on the current state of df.
null_mask = df.isna()
row_count = len(df)
missing_summary = pd.DataFrame({
    'Missing Values': null_mask.sum(),
    'Percentage': (null_mask.sum() / row_count) * 100,
})
print(missing_summary)

In [None]:
# Target: the annual amount column.
y = df['CURR_ANN_AMT']

# Predictors, one per line.
features = [
    'age_in_years',
    'income',
    'DAYS_TENURE',
    'LENGTH_OF_RESIDENCE',
    'HOME_MARKET_VALUE',
    'Churn',
    'HAS_CHILDREN',
    'marital_status',
    'HOME_OWNER',
    'COLLEGE_DEGREE',
    'GOOD_CREDIT',
    'STATE',
    'COUNTY',
]

X = df.loc[:, features]
Columns_Before_One_Hot_Encoding = len(X.columns)

# Row count per county, taken before COUNTY is replaced by dummy columns.
uniqueCounties = X['COUNTY'].value_counts()

## Show every column when a frame is displayed
pd.set_option('display.max_columns', None)

## get_dummies one-hot encodes the county column; with drop_first=True the state column
## disappears completely since Texas is the only option
X = pd.get_dummies(X, drop_first=True, dtype=int)

# This yields me 13 columns - the county that was removed was Collin, so a row whose county
# dummies are all 0 is assumed to be Collin county
Columns_After_One_Hot_Encoding = len(X.columns)
New_Columns = Columns_After_One_Hot_Encoding - Columns_Before_One_Hot_Encoding
print("Number of Columns Added: ", New_Columns)
print("Chart of County Counts: ", uniqueCounties)
X.head()

In [None]:
# Missing-value report for the design matrix X.
# The percentage is taken relative to X itself (the frame being summarised), so the report
# stays correct even if df and X ever get out of sync between cell runs.
na_counts = X.isna().sum()
missing_summary = pd.DataFrame({
    'Missing Values': na_counts,
    'Percentage': (na_counts / len(X)) * 100
})
print(missing_summary)

In [None]:
# Per-column missing values in X, as a count and as a percentage of X's own row count
# (len(X) rather than len(df), so the denominator always matches the summarised frame).
x_null_counts = X.isna().sum()
x_row_count = len(X)
missing_summary = pd.DataFrame({
    'Missing Values': x_null_counts,
    'Percentage': (x_null_counts / x_row_count) * 100
})
print(missing_summary)

For missing values, let's add a "<column>_missing" indicator feature for each affected column so the model can learn whether missingness correlates with premiums.

In [None]:
## For the missing and n/a values for 'Has_Children', 'Home_Owner', 'College_Degree', and 'Good_Credit':
## since the missing values only represent 7% of the dataset, add a "<column>_missing" indicator
## to see if missingness correlates with premiums, then fill the gaps with 0.
for col in ['HAS_CHILDREN','HOME_OWNER','COLLEGE_DEGREE','GOOD_CREDIT']:
    flag_col = col + '_missing'
    # Only create the indicator once: if this cell is re-run after the NaNs were filled,
    # recomputing it would overwrite the real flags with all zeros.
    if flag_col not in X.columns:
        X[flag_col] = X[col].isna().astype(int)
    X[col] = X[col].fillna(0)

na_counts = X.isna().sum()
missing_summary = pd.DataFrame({
    'Missing Values': na_counts,
    # X is the frame being summarised, so its own length is the denominator
    'Percentage': (na_counts / len(X)) * 100
})
print(missing_summary)

In [None]:
## For age, income, and length of residence: fill in the missing values with the median values
## and record which rows were imputed in a "<column>_missing" indicator.
# NOTE(review): the medians are computed over all rows, before the train/test split below,
# so test rows influence the imputed values - consider fitting the medians on the training split only.
for col in ['age_in_years','income','LENGTH_OF_RESIDENCE']:
    flag_col = col + '_missing'
    # Only create the indicator once: re-running this cell after imputation would otherwise
    # reset every flag to 0 because no NaNs are left.
    if flag_col not in X.columns:
        X[flag_col] = X[col].isna().astype(int)
    col_median = X[col].median()
    X[col] = X[col].fillna(col_median)

na_counts = X.isna().sum()
missing_summary = pd.DataFrame({
    'Missing Values': na_counts,
    # X is the frame being summarised, so its own length is the denominator
    'Percentage': (na_counts / len(X)) * 100
})
print(missing_summary)

In [None]:
# One-time setup, only if scikit-learn is not installed in this kernel's environment:
# %pip install scikit-learn

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# One-time setup, only if statsmodels is not installed in this kernel's environment:
# %pip install statsmodels

In [None]:
import statsmodels.api as sm

# Add an explicit intercept column to the training predictors.
X_train_const = sm.add_constant(X_train)

# Gamma GLM with a log link for the premium target.
# The link is given as the capitalised class `Log`: the lowercase `log` alias is deprecated
# in recent statsmodels releases and emits a warning.
glm_gamma = sm.GLM(
    y_train,
    X_train_const,
    family=sm.families.Gamma(link=sm.families.links.Log())
)

results = glm_gamma.fit()
print(results.summary())

In [None]:
# Build the test design matrix with exactly the columns the model was fitted on.
# add_constant() defaults to has_constant='skip', which omits the intercept when the test
# split happens to contain a constant non-zero column. Force the intercept, then select the
# fitted model's column names so the layout always matches training.
X_test_const = sm.add_constant(X_test, has_constant='add')[results.model.exog_names]
y_pred = results.predict(X_test_const)

from sklearn.metrics import mean_squared_error, mean_absolute_error
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
mae = mean_absolute_error(y_test,y_pred)

print("RMSE:",rmse)
print("MAE:",mae)

pd.set_option("display.float_format","{:.2f}".format)
y_summary_stats = y.describe()
print(y_summary_stats)

# Express the errors as a percentage of the average premium over all rows.
mean_premium = y.mean()
rel_mae = mae / mean_premium * 100
rel_rmse = rmse / mean_premium * 100
print(f"Relative MAE: {rel_mae:.3f}%")
print(f"Relative RMSE: {rel_rmse:.3f}%")

# Naive baseline: always predict the mean premium of the TRAINING rows, so the baseline sees
# no test data. np.full with an explicit float dtype avoids truncating the mean, which
# np.full_like would do if y_test had an integer dtype.
baseline_pred = np.full(len(y_test), y_train.mean(), dtype=float)
baseline_mae = mean_absolute_error(y_test,baseline_pred)
print("Baseline MAE:",baseline_mae)