In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

# Load files into dataframe and split data

In [None]:
from sklearn.model_selection import train_test_split

# Read the raw CSV and do all further work on a copy of the loaded frame.
file = 'Root_Insurance_Data.csv'
root = pd.read_csv(file)
root_copy = root.copy()

# Predictor columns and the two target columns.
X = root_copy.loc[:, ['Currently Insured', 'Number of Vehicles', 'Number of Drivers', 'Marital Status', 'rank']]
y = root_copy.loc[:, ['click', 'policies_sold']]

# 75/25 shuffled split with a fixed seed, stratified on both target columns.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=614,
    shuffle=True,
    stratify=y,
)

# One-hot encode the 'Currently Insured' and 'Marital Status' columns

In [None]:
# Indicator (dummy) columns for the two categorical predictors, computed once each.
marital_dummies = pd.get_dummies(X_train['Marital Status'])
insured_dummies = pd.get_dummies(X_train['Currently Insured'])

# Attach the 'M', 'Y' and 'N' indicators and keep the six model columns in a fixed order.
X1_train = X_train.assign(
    Married=marital_dummies['M'],
    Insured=insured_dummies['Y'],
    NotInsured=insured_dummies['N'],
)[['Number of Vehicles', 'rank', 'Insured', 'NotInsured', 'Number of Drivers', 'Married']]

# One target series per model.
click_train = y_train['click'].copy()
policy_train = y_train['policies_sold'].copy()

# Logistic regression for clicking 

In [None]:
from sklearn.linear_model import LogisticRegression
# Click model: logistic regression with default settings, fitted on the encoded training features.
log_reg = LogisticRegression().fit(X1_train, click_train)
# Disabled training-set diagnostics for the click logistic regression, kept as a
# bare string literal so that none of it executes. The snippet sweeps cutoffs
# from 0 to 1 in steps of 0.01, scatter-plots training accuracy against the
# cutoff, and then reports the confusion matrix, precision, recall and accuracy
# at a cutoff of 0.5.
"""
The code below is for checking the accuracy,precision and recall:

click_prob = log_reg.predict_proba(X1_train)[:,1]
cutoffs = np.arange(0,1.01,.01)
accs = []
for cutoff in cutoffs:
    click_train_pred = 1*(click_prob > cutoff)
    accs.append(np.sum(click_train_pred == click_train)/len(click_train))
    
plt.figure(figsize=(6,4),dpi=100)
plt.scatter(cutoffs,accs,s=10,c='k')
plt.xlabel("Cutoff",fontsize=10)
plt.ylabel("Training Accuracy",fontsize=10)
plt.show() 

cutoff = 0.5
click_train_pred = 1*(click_prob > cutoff)
click_df = pd.DataFrame({'click_train':click_train,'click_predict':click_train_pred})
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
display(confusion_matrix(click_train, click_train_pred))
print(precision_score(click_train, click_train_pred))
print(recall_score(click_train, click_train_pred))
print(np.sum(click_train_pred == click_train)/len(click_train))
"""  

# Construct the feature combinations for the prediction dataframe (180 rows in total)

In [None]:
# Every row is [Number of Vehicles, rank, Insured, NotInsured, Number of Drivers, Married].
# When Insured is 0 the NotInsured flag takes both 0 and 1; when Insured is 1 it is
# fixed at 0, giving 3 * 5 * 3 * 2 * 2 = 180 rows.
feature = [
    [vehicles, rank, insured, not_insured, drivers, married]
    for vehicles in (1, 2, 3)
    for rank in (1, 2, 3, 4, 5)
    for insured in (0, 1)
    for drivers in (1, 2)
    for married in (0, 1)
    for not_insured in ((0, 1) if insured == 0 else (0,))
]

# Save the predicted click probabilities

In [None]:
# Score every feature combination with the click model and write the table to disk.
# NOTE(review): the output path is absolute and specific to one machine.
feature_columns = ['Number of Vehicles', 'rank', 'Insured', 'NotInsured', 'Number of Drivers', 'Married']
df = pd.DataFrame(np.array(feature), columns=feature_columns)
click_prob = log_reg.predict_proba(df)[:, 1]  # second probability column
df = df.assign(click_prob=click_prob).drop_duplicates()
df.to_csv(r'/Users/yushanyang/Documents/Study/summer camp/project/log')

# Logistic regression for policy buying

In [None]:
# Policy-purchase model: rebinds `log_reg` to a new default logistic regression fitted on the policy target.
log_reg = LogisticRegression().fit(X1_train, policy_train)
# Disabled training-set diagnostics for the policy logistic regression, kept as
# a bare string literal so that none of it executes. The snippet sweeps cutoffs
# from 0 to 1 in steps of 0.01, scatter-plots training accuracy against the
# cutoff, and then reports the confusion matrix, precision, recall and accuracy
# at a cutoff of 0.24.
# NOTE(review): the snippet calls confusion_matrix, precision_score and
# recall_score but does not import them itself - import them from
# sklearn.metrics before enabling it.
"""
The code below is for checking the accuracy,precision and recall:
policy_prob = log_reg.predict_proba(X1_train)[:,1]
cutoffs = np.arange(0,1.01,.01)
accs = []
for cutoff in cutoffs:
    policy_train_pred = 1*(policy_prob > cutoff)
    accs.append(np.sum(policy_train_pred == policy_train)/len(policy_train))
    
plt.figure(figsize=(6,4),dpi=100)
plt.scatter(cutoffs,accs,s=10,c='k')
plt.xlabel("Cutoff",fontsize=10)
plt.ylabel("Training Accuracy",fontsize=10)
plt.show() 

cutoff = 0.24
policy_train_pred = 1*(policy_prob > cutoff)
policy_df = pd.DataFrame({'polcy_train':policy_train,'policy_predict':policy_train_pred})
display(confusion_matrix(policy_train, policy_train_pred))
print(precision_score(policy_train, policy_train_pred))
print(recall_score(policy_train, policy_train_pred))
print(np.sum(policy_train_pred == policy_train)/len(policy_train))
"""

# Save the predicted policy-purchase probabilities

In [None]:
# Score every feature combination with the policy model and write the table to disk.
# NOTE(review): the output path is absolute and specific to one machine.
feature_columns = ['Number of Vehicles', 'rank', 'Insured', 'NotInsured', 'Number of Drivers', 'Married']
df = pd.DataFrame(np.array(feature), columns=feature_columns)
policy_prob = log_reg.predict_proba(df)[:, 1]  # second probability column
df = df.assign(policy_prob=policy_prob).drop_duplicates()
df.to_csv(r'/Users/yushanyang/Documents/Study/summer camp/project/log_policy')

# K-nearest neighbors for clicking

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold
# Shared 5-fold stratified splitter with shuffling and a fixed seed.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=614)
def get_acc(model, X, y):
    """Return the fraction of entries in ``y`` that ``model.predict(X)`` matches.

    Parameters
    ----------
    model : object exposing ``predict(X)``
    X : inputs passed straight through to ``model.predict``
    y : true labels, compared element-wise against the predictions

    Returns
    -------
    The number of matching predictions divided by ``len(y)``.
    """
    hits = model.predict(X) == y
    return np.sum(hits) / len(y)

In [None]:
# Rebuild the encoded training frame from X_train (same six columns, same order) ...
insured_flags = pd.get_dummies(X_train['Currently Insured'])
married_flag = pd.get_dummies(X_train['Marital Status'])['M']
model_columns = ['Number of Vehicles', 'rank', 'Insured', 'NotInsured', 'Number of Drivers', 'Married']
X1_train = X_train.assign(
    Married=married_flag,
    Insured=insured_flags['Y'],
    NotInsured=insured_flags['N'],
)[model_columns]

# ... and expose NumPy versions of the features and both targets for the KNN cells.
X1_train_num = X1_train.to_numpy()
click_train_num = click_train.to_numpy()
policy_train_num = policy_train.to_numpy()

In [None]:
# Disabled model-selection code for the click KNN model, kept as a bare string
# literal so that none of it executes. For each split produced by `kfold` it
# fits KNN classifiers with 1..max_neighbors neighbours, records validation
# accuracy via get_acc, and plots the mean accuracy across folds against the
# number of neighbours. The x-axis label previously read "Features"; it now
# names what is actually plotted.
"""
The code below is for choosing the best number of neighbors:

max_neighbors = 20
accs = np.zeros((5,max_neighbors))
j = 0
for train_index, test_index in kfold.split(X1_train_num,click_train_num):
    X1_train_train, X1_train_test = X1_train_num[train_index], X1_train_num[test_index]
    click_train_train, click_train_test = click_train_num[train_index], click_train_num[test_index]
    for i in range(1,max_neighbors+1):
        knn = KNeighborsClassifier(i)
        knn.fit(X1_train_train, click_train_train.ravel())
        accs[j,i-1] = get_acc(knn, X1_train_test, click_train_test.ravel())
    j=j+1

sns.set_style("whitegrid")
plt.figure(figsize=(6,4),dpi=100)
plt.plot(range(1,max_neighbors+1), 100*np.mean(accs, axis=0))
plt.xlabel("Number of Neighbors", fontsize=10)
plt.ylabel("Average CV Accuracy (%)", fontsize=10)
plt.show()
"""

In [None]:
# Click model: 10-nearest-neighbours classifier fitted on the NumPy feature matrix and labels.
knn = KNeighborsClassifier(n_neighbors=10).fit(X1_train_num, click_train_num.ravel())
# Disabled training-set diagnostics for the click KNN model, kept as a bare
# string literal so that none of it executes. The snippet sweeps cutoffs from
# 0 to 1 in steps of 0.01, scatter-plots training accuracy against the cutoff,
# and then reports the confusion matrix, precision, recall and accuracy at a
# cutoff of 0.65.
# NOTE(review): the snippet calls confusion_matrix, precision_score and
# recall_score but does not import them itself - import them from
# sklearn.metrics before enabling it.
"""
The code below is for finding accuracy, precision and recall:

click_prob = knn.predict_proba(X1_train_num)[:,1]
cutoffs = np.arange(0,1.01,.01)
accs = []
for cutoff in cutoffs:
    click_train_pred = 1*(click_prob > cutoff)
    accs.append(np.sum(click_train_pred == click_train)/len(click_train))
    
plt.figure(figsize=(6,4),dpi=100)
plt.scatter(cutoffs,accs,s=10,c='k')
plt.xlabel("Cutoff",fontsize=10)
plt.ylabel("Training Accuracy",fontsize=10)
plt.show()

cutoff = 0.65
click_train_pred = 1*(click_prob > cutoff)
click_df = pd.DataFrame({'click_train':click_train,'click_predict':click_train_pred})
display(confusion_matrix(click_train, click_train_pred))
print(precision_score(click_train, click_train_pred))
print(recall_score(click_train, click_train_pred))
print(np.sum(click_train_pred == click_train)/len(click_train))
"""

# Save the clicking probability by k-nearest neighbors 

In [None]:
# Score every feature combination with the click KNN model and write the table to disk.
# NOTE(review): the output path is absolute and specific to one machine.
feature_columns = ['Number of Vehicles', 'rank', 'Insured', 'NotInsured', 'Number of Drivers', 'Married']
df = pd.DataFrame(np.array(feature), columns=feature_columns)
click_prob = knn.predict_proba(feature)[:, 1]  # second probability column
df = df.assign(click_prob=click_prob).drop_duplicates()
df.to_csv(r'/Users/yushanyang/Documents/Study/summer camp/project/knn')

In [None]:
# Disabled model-selection code for the policy KNN model, kept as a bare string
# literal so that none of it executes. For each split produced by `kfold` it
# fits KNN classifiers with 1..max_neighbors neighbours, records validation
# accuracy via get_acc, and plots the mean accuracy across folds against the
# number of neighbours. The x-axis label previously read "Features"; it now
# names what is actually plotted.
"""
The code below is for choosing the best number of neighbors:

max_neighbors = 20
accs = np.zeros((5,max_neighbors))
j = 0
for train_index, test_index in kfold.split(X1_train_num,policy_train_num):
    X1_train_train, X1_train_test = X1_train_num[train_index], X1_train_num[test_index]
    policy_train_train, policy_train_test = policy_train_num[train_index], policy_train_num[test_index]
    for i in range(1,max_neighbors+1):
        knn = KNeighborsClassifier(i)
        knn.fit(X1_train_train, policy_train_train.ravel())
        accs[j,i-1] = get_acc(knn, X1_train_test, policy_train_test.ravel())
    j=j+1

sns.set_style("whitegrid")
plt.figure(figsize=(6,4),dpi=100)
plt.plot(range(1,max_neighbors+1), 100*np.mean(accs, axis=0))
plt.xlabel("Number of Neighbors", fontsize=10)
plt.ylabel("Average CV Accuracy (%)", fontsize=10)
plt.show()
"""

In [None]:
# Policy-purchase model: 10-nearest-neighbours classifier.
# It is fitted on the NumPy feature matrix and label array, exactly like the
# click KNN model. Fitting on the DataFrame `X1_train` made sklearn record
# feature names, so the later predict_proba call on the plain `feature` list
# raised a feature-name mismatch warning. `policy_train.ravel()` relied on
# Series.ravel, which is deprecated as of pandas 2.2.
knn = KNeighborsClassifier(10)
knn.fit(X1_train_num, policy_train_num.ravel())
# Disabled training-set diagnostics for this model, kept as a bare string
# literal so that none of it executes: cutoff sweep from 0 to 1 in steps of
# 0.01, accuracy plot, then confusion matrix, precision, recall and accuracy at
# a cutoff of 0.4. Its predict_proba call uses the NumPy matrix to match the
# fit above.
# NOTE(review): the snippet calls confusion_matrix, precision_score and
# recall_score but does not import them itself - import them from
# sklearn.metrics before enabling it.
"""
The code below is for finding accuracy, precision and recall:

policy_prob = knn.predict_proba(X1_train_num)[:,1]
cutoffs = np.arange(0,1.01,.01)
accs = []
for cutoff in cutoffs:
    policy_train_pred = 1*(policy_prob > cutoff)
    accs.append(np.sum(policy_train_pred == policy_train)/len(policy_train))

plt.figure(figsize=(6,4),dpi=100)
plt.scatter(cutoffs,accs,s=10,c='k')
plt.xlabel("Cutoff",fontsize=10)
plt.ylabel("Training Accuracy",fontsize=10)
plt.show()

cutoff = 0.4
policy_train_pred = 1*(policy_prob > cutoff)
policy_df = pd.DataFrame({'polcy_train':policy_train,'policy_predict':policy_train_pred})
display(confusion_matrix(policy_train, policy_train_pred))
print(precision_score(policy_train, policy_train_pred))
print(recall_score(policy_train, policy_train_pred))
print(np.sum(policy_train_pred == policy_train)/len(policy_train))
"""

# Save the policy buying probability by k-nearest neighbors

In [None]:
# Score every feature combination with the policy KNN model and write the table to disk.
# NOTE(review): the output path is absolute and specific to one machine.
feature_columns = ['Number of Vehicles', 'rank', 'Insured', 'NotInsured', 'Number of Drivers', 'Married']
df = pd.DataFrame(np.array(feature), columns=feature_columns)
policy_prob = knn.predict_proba(feature)[:, 1]  # second probability column
df = df.assign(policy_prob=policy_prob).drop_duplicates()
df.to_csv(r'/Users/yushanyang/Documents/Study/summer camp/project/knn_policy')