In [2]:
import pandas as pd
import numpy as np
import sklearn.model_selection as model_selection
import sklearn.cluster as cluster
import sklearn.metrics as metrics
from sklearn.preprocessing import OneHotEncoder
import math 

In [55]:
# Load the raw claim records; the path is resolved relative to the working directory.
history = pd.read_csv(filepath_or_buffer='claim_history.csv')

In [60]:
# Keep only the three predictors and the target, discarding rows with any missing value.
history = history[['CAR_TYPE', 'OCCUPATION', 'EDUCATION', 'CAR_USE']].dropna()

# Ordinal encoding (not one-hot): each EDUCATION level becomes a ranked integer so the
# column can later be split with a single "<= threshold" comparison.
education_mapper = {'Doctors': 4 , 'Masters': 3, 'Bachelors': 2, 'High School': 1, 'Below High School': 0}
# .map() produces the numeric column directly; .replace() on a string column depends on
# implicit downcasting, which recent pandas releases deprecate.
history['MAPPED_EDUCATION'] = history['EDUCATION'].map(education_mapper)
# Fail here, with a clear message, rather than in a later numeric comparison.
assert history['MAPPED_EDUCATION'].notna().all(), 'EDUCATION contains a level missing from education_mapper'

# Predictors (raw and encoded education are both kept) and the target.
X = history[['CAR_TYPE', 'OCCUPATION', 'EDUCATION', 'MAPPED_EDUCATION']]
y = history['CAR_USE']

## QUESTION 1

In [222]:
# Both partitions use the same 75/25 proportion and the same seed.
split_options = {'train_size': 0.75, 'random_state': 60616}

# Feature/target partition (no stratification is requested for this one).
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, **split_options)

# Whole-frame partition, stratified on the CAR_USE target.
train, test = model_selection.train_test_split(
    history,
    stratify=history['CAR_USE'],
    **split_options,
)

In [62]:
# Q1A
# Look the counts up by label. value_counts() orders by frequency, so positional
# unpacking is only correct while 'Private' is the most frequent of exactly two classes.
train_use_counts = y_train.value_counts()
priv_train = train_use_counts['Private']
comm_train = train_use_counts['Commercial']

# Train: count and share of each class
print( 'Private: {} {}'.format(priv_train, (priv_train/(priv_train+comm_train))) )
print( 'Commerc: {} {}'.format(comm_train, (comm_train/(priv_train+comm_train))) )

Private: 4875 0.6309862800931918
Commerc: 2851 0.3690137199068082


In [63]:
# Q1B
# Look the counts up by label. value_counts() orders by frequency, so positional
# unpacking is only correct while 'Private' is the most frequent of exactly two classes.
test_use_counts = y_test.value_counts()
priv_test = test_use_counts['Private']
comm_test = test_use_counts['Commercial']

# Test: count and share of each class
print( 'Private: {} {}'.format(priv_test, (priv_test/(priv_test+comm_test))) )
print( 'Commerc: {} {}'.format(comm_test, (comm_test/(priv_test+comm_test))) )

Private: 1638 0.6358695652173914
Commerc: 938 0.3641304347826087


In [64]:
# Q1C
# Share of Commercial observations inside each partition.
commercial_in_train = y_train.value_counts()['Commercial']
commercial_in_test = y_test.value_counts()['Commercial']
p_comm_train = commercial_in_train / len(y_train)  # P(Commercial | train)
p_comm_test = commercial_in_test / len(y_test)     # P(Commercial | test)

# Bayes' rule using the nominal 0.75 / 0.25 partition weights.
commercial_via_train = p_comm_train * 0.75
commercial_via_test = p_comm_test * 0.25
p_train_comm = commercial_via_train / (commercial_via_train + commercial_via_test)

p_train_comm

0.7524894878726821

In [223]:
# Q1D
# Share of Private observations inside each partition.
private_in_train = y_train.value_counts()['Private']
private_in_test = y_test.value_counts()['Private']
p_priv_train = private_in_train / len(y_train)  # P(Private | train)
p_priv_test = private_in_test / len(y_test)     # P(Private | test)

# Bayes' rule using the nominal 0.75 / 0.25 partition weights.
private_via_train = p_priv_train * 0.75
private_via_test = p_priv_test * 0.25
p_test_priv = private_via_test / (private_via_train + private_via_test)
print(p_test_priv)

0.25144828484020054


## QUESTION 2

In [226]:
# 2A: Root Entropy
# Class proportions of the target over the whole training partition.
n_train_rows = y_train.shape[0]
train_class_counts = y_train.value_counts()
prob_train_priv = train_class_counts['Private'] / n_train_rows
prob_train_comm = train_class_counts['Commercial'] / n_train_rows

# Entropy in bits: negated sum of p * log2(p) over the two classes.
private_term = prob_train_priv * np.log2(prob_train_priv)
commercial_term = prob_train_comm * np.log2(prob_train_comm)
root_entropy = -(private_term + commercial_term)
root_entropy

0.9499117892797907

In [30]:
# 2B: Split Criterion
# NOTE(review): the original note here read "CAR_USE ... 9 branches" -- presumably it
# refers to the nine OCCUPATION levels; confirm.

# One predictor-by-target contingency table (with 'All' margins) per nominal column.
crossTab1, crossTab2, crossTab3 = (
    pd.crosstab(index=X_train[predictor], columns=y_train, margins=True, dropna=True)
    for predictor in ('CAR_TYPE', 'OCCUPATION', 'EDUCATION')
)

# Tables are separated by the same blank lines the individual print calls produced.
print(crossTab1, crossTab2, crossTab3, sep='\n\n\n')

CAR_USE      Commercial  Private   All
CAR_TYPE                              
Minivan             422     1606  2028
Panel Truck         638        0   638
Pickup              792      512  1304
SUV                 419     1742  2161
Sports Car          146      737   883
Van                 434      278   712
All                2851     4875  7726


CAR_USE       Commercial  Private   All
OCCUPATION                             
Blue Collar         1309      413  1722
Clerical             220      962  1182
Doctor                 0      233   233
Home Maker            38      579   617
Lawyer                 0      777   777
Manager              227      708   935
Professional         273      800  1073
Student              338      350   688
Unknown              446       53   499
All                 2851     4875  7726


CAR_USE            Commercial  Private   All
EDUCATION                                   
Bachelors                 895     1231  2126
Below High School         257 

In [111]:
def _weightedSplitEntropy(branchFlag, target):
    """Return the size-weighted average entropy (in bits) of `target` over the branches.

    Parameters
    ----------
    branchFlag : pandas.Series
        Branch membership for every observation (one distinct value per branch).
    target : pandas.Series
        Class label of every observation, sharing `branchFlag`'s index.

    Returns
    -------
    float
        Sum over branches of (branch entropy * branch size), divided by the total count.
    """
    crossTable = pd.crosstab(index = branchFlag, columns = target, margins = True, dropna = True)
    nRows = crossTable.shape[0]
    totalColumn = crossTable.shape[1] - 1   # position of the 'All' margin column

    tableEntropy = 0
    for iRow in range(nRows-1):             # the last row is the 'All' margin row
        rowTotal = crossTable.iloc[iRow, totalColumn]
        rowEntropy = 0
        for iColumn in range(totalColumn):  # class columns only; the margin adds nothing
            proportion = crossTable.iloc[iRow, iColumn] / rowTotal
            if (proportion > 0):            # 0 * log2(0) is taken as 0
                rowEntropy -= proportion * np.log2(proportion)
        tableEntropy += rowEntropy * rowTotal

    return tableEntropy / crossTable.iloc[(nRows-1), totalColumn]

def EntropyIntervalSplit(inData, split):
    """Entropy of a binary split of an ordered predictor at `value <= split`.

    Parameters
    ----------
    inData : pandas.DataFrame
        Column 0 is the predictor, column 1 is the target. The frame is not modified
        (earlier versions added an 'LE_Split' column to the caller's frame).
    split : scalar
        Threshold; observations with predictor <= split form one branch.

    Returns
    -------
    float
        Size-weighted average entropy of the two branches.
    """
    predictor = inData.iloc[:, 0]
    branchFlag = (predictor <= split).rename('LE_Split')
    return _weightedSplitEntropy(branchFlag, inData.iloc[:, 1])

def EntropyNominalSplit(inData, split):
    """Entropy of a binary split of a nominal predictor by membership in `split`.

    Parameters
    ----------
    inData : pandas.DataFrame
        Column 0 is the predictor, column 1 is the target. The frame is not modified
        (earlier versions added an 'LE_Split' column to the caller's frame).
    split : container
        Predictor values that form one branch; all other values form the other branch.

    Returns
    -------
    float
        Size-weighted average entropy of the two branches.
    """
    predictor = inData.iloc[:, 0]
    branchFlag = pd.Series([value in split for value in predictor],
                           index = predictor.index, name = 'LE_Split')
    return _weightedSplitEntropy(branchFlag, inData.iloc[:, 1])

In [123]:
from itertools import combinations 

def getOptimalNominalSplit(inData, splits):
    """Exhaustively search for the two-way grouping of nominal levels with the lowest entropy.

    Every non-empty combination of `splits` (including the full set) is scored with
    EntropyNominalSplit; the first combination reaching the lowest score is kept.

    Parameters
    ----------
    inData : pandas.DataFrame
        Passed unchanged to EntropyNominalSplit.
    splits : sequence
        Candidate predictor levels.

    Returns
    -------
    tuple
        (lowest entropy, levels in the chosen branch, remaining levels). The remaining
        levels keep the order in which they appear in `splits`. When `splits` is empty
        nothing is scored and (inf, [], []) is returned.
    """
    # Start from +inf, not 1.0: entropy exceeds 1 bit when the target has more than
    # two classes, and a 1.0 sentinel would then reject every candidate.
    minEntropy = float('inf')
    minCombination = []

    for size in range(1, len(splits) + 1):
        for comb in combinations(splits, size):
            currCombination = list(comb)
            currEntropy = EntropyNominalSplit(inData, currCombination)

            if currEntropy < minEntropy:
                minEntropy = currEntropy
                minCombination = currCombination

    # Order-preserving, de-duplicated complement, so the result is reproducible
    # (a set difference returns its elements in arbitrary order).
    otherBranch = list(dict.fromkeys(level for level in splits if level not in minCombination))
    return (minEntropy, minCombination, otherBranch)

def getOptimalIntervalSplit(inData, splits):
    """Pick the threshold from `splits` whose `<=` split has the lowest entropy.

    Parameters
    ----------
    inData : pandas.DataFrame
        Passed unchanged to EntropyIntervalSplit.
    splits : iterable
        Candidate thresholds, tried in order; ties keep the earliest candidate.

    Returns
    -------
    tuple
        (lowest entropy, chosen threshold). When `splits` is empty nothing is scored
        and (inf, -1.0) is returned.
    """
    # Start from +inf, not 1.0: entropy exceeds 1 bit when the target has more than
    # two classes, and a 1.0 sentinel would then reject every candidate.
    minEntropy = float('inf')
    minSplit = -1.0

    for candidate in splits:
        currEntropy = EntropyIntervalSplit(inData, candidate)

        if currEntropy < minEntropy:
            minEntropy = currEntropy
            minSplit = candidate

    return (minEntropy, minSplit)


In [155]:
# Candidate levels of the two nominal predictors.
car_types = ['Minivan', 'Panel Truck', 'Pickup', 'SUV', 'Sports Car', 'Van']
occupations = [
    'Blue Collar', 'Clerical', 'Doctor', 'Home Maker', 'Lawyer',
    'Manager', 'Professional', 'Student', 'Unknown',
]
# Thresholds halfway past each education code 0..4 (0.5, 1.5, ..., 4.5).
mapped_education_splits = [code + 0.5 for code in range(5)]

# Two-column frames (predictor, target) in the layout the split-search helpers expect.
occupation_data, carType_data, education_data = (
    pd.concat([X_train[predictor], y_train], axis=1)
    for predictor in ('OCCUPATION', 'CAR_TYPE', 'MAPPED_EDUCATION')
)

# Best root split per predictor.
print( getOptimalNominalSplit(carType_data, car_types) )
print( getOptimalNominalSplit(occupation_data, occupations) )
print( getOptimalIntervalSplit(education_data, mapped_education_splits) )

(0.7685043713026927, ['Minivan', 'SUV', 'Sports Car'], ['Panel Truck', 'Van', 'Pickup'])
(0.7138723890228704, ['Blue Collar', 'Student', 'Unknown'], ['Home Maker', 'Lawyer', 'Doctor', 'Professional', 'Clerical', 'Manager'])
(0.9382318787108813, 0.5)


In [167]:
# Occupation groups that define the two first-level branches.
occupations_left = ['Blue Collar', 'Student', 'Unknown']
occupations_right = ['Home Maker', 'Lawyer', 'Doctor', 'Professional', 'Clerical', 'Manager']

# Route each training row by whether its occupation belongs to the left group.
goes_left = X_train['OCCUPATION'].isin(occupations_left)
left_data = X_train[goes_left]
right_data = X_train[~goes_left]

def _attach_target(feature):
    """Pair one predictor column with y_train and drop rows where either side is missing."""
    return pd.concat([feature, y_train], axis=1).dropna()

# LEFT BRANCH first, then RIGHT BRANCH, separated by blank lines in the output.
branches = [(left_data, occupations_left), (right_data, occupations_right)]
for position, (branch_rows, branch_occupations) in enumerate(branches):
    if position > 0:
        print('\n')
    print( getOptimalNominalSplit(_attach_target(branch_rows['OCCUPATION']), branch_occupations) )
    print( getOptimalNominalSplit(_attach_target(branch_rows['CAR_TYPE']), car_types) )
    print( getOptimalIntervalSplit(_attach_target(branch_rows['MAPPED_EDUCATION']), mapped_education_splits) )

(0.8023025606364643, ['Student'], ['Blue Collar', 'Unknown'])
(0.7696653845258998, ['Minivan', 'SUV', 'Sports Car'], ['Panel Truck', 'Van', 'Pickup'])
(0.6736439321725546, 0.5)


(0.5642252500375471, ['Home Maker', 'Lawyer', 'Doctor'], ['Clerical', 'Professional', 'Manager'])
(0.32808686969710504, ['Minivan', 'SUV', 'Sports Car'], ['Panel Truck', 'Van', 'Pickup'])
(0.6178221369120133, 2.5)


In [218]:
def printCounts(data):
    """Print the Private/Commercial counts, their total, and the entropy of CAR_USE.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'CAR_USE' column. A class that does not occur is counted as 0
        instead of raising KeyError, so pure nodes can be reported.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    frequency_table = data['CAR_USE'].value_counts()
    num_p = frequency_table.get('Private', 0)
    num_c = frequency_table.get('Commercial', 0)
    nums = num_p + num_c

    # Entropy in bits; empty classes are skipped (0 * log2(0) is taken as 0) and a node
    # with no Private/Commercial rows at all is reported with entropy 0.0.
    entropy = 0.0
    if nums > 0:
        for count in (num_p, num_c):
            proportion = count / nums
            if proportion > 0:
                entropy -= proportion * np.log2(proportion)

    print('Private Count: {}\nCommercial Count: {}\nTotal Count: {}\nEntropy: {}\n'.format(num_p, num_c, nums, entropy))
    

# Car-type groups used to split the right occupation branch.
carTypes_left = ['Minivan', 'SUV', 'Sports Car']
carTypes_right = ['Panel Truck', 'Van', 'Pickup']

# Left occupation branch: split on the education code at 0.5, then attach the target.
at_or_below_cut = left_data['MAPPED_EDUCATION'] <= 0.5
above_cut = left_data['MAPPED_EDUCATION'] > 0.5
left_left_data = pd.concat([left_data[at_or_below_cut], y_train], axis=1).dropna()
left_right_data = pd.concat([left_data[above_cut], y_train], axis=1).dropna()

# Right occupation branch: split on car-type group, then attach the target.
rl_data = right_data[right_data['CAR_TYPE'].isin(carTypes_left)]
rr_data = right_data[right_data['CAR_TYPE'].isin(carTypes_right)]
right_left_data = pd.concat([rl_data, y_train], axis=1).dropna()
right_right_data = pd.concat([rr_data, y_train], axis=1).dropna()

# Report class counts and entropy for the four leaves, left to right.
for leaf in (left_left_data, left_right_data, right_left_data, right_right_data):
    printCounts(leaf)

Private Count: 460
Commercial Count: 173
Total Count: 633
Entropy: 0.8461626265285531

Private Count: 356
Commercial Count: 1920
Total Count: 2276
Entropy: 0.6256631177932281

Private Count: 3409
Commercial Count: 23
Total Count: 3432
Entropy: 0.05803024570980552

Private Count: 650
Commercial Count: 735
Total Count: 1385
Entropy: 0.9972813343356697



In [219]:
# Display the predictor rows of the right-right leaf: rows outside the left
# occupation group whose CAR_TYPE is in carTypes_right.
rr_data

Unnamed: 0,CAR_TYPE,OCCUPATION,EDUCATION,MAPPED_EDUCATION
5793,Pickup,Professional,Masters,3
1925,Pickup,Professional,High School,1
6797,Van,Clerical,High School,1
7092,Pickup,Home Maker,Doctors,4
4846,Panel Truck,Manager,Bachelors,2
...,...,...,...,...
7023,Panel Truck,Manager,Bachelors,2
9379,Pickup,Manager,Doctors,4
4814,Pickup,Professional,Bachelors,2
1019,Panel Truck,Manager,Doctors,4
