In [2]:
import pandas as pd
import numpy as np
import sklearn.model_selection as model_selection
import sklearn.cluster as cluster
import sklearn.metrics as metrics
from sklearn.preprocessing import OneHotEncoder
import math 

In [55]:
# Load the raw claim-history table; the path is relative to the notebook's working directory.
history = pd.read_csv('claim_history.csv')

In [60]:
# Keep only the three predictors and the target, dropping rows with any missing value.
history = history[['CAR_TYPE', 'OCCUPATION', 'EDUCATION', 'CAR_USE']].dropna()

# Ordinal encoding of EDUCATION (this is NOT one-hot encoding): each label becomes
# its rank, so that threshold splits of the form "MAPPED_EDUCATION <= c" are meaningful.
education_mapper = {'Doctors': 4 , 'Masters': 3, 'Bachelors': 2, 'High School': 1, 'Below High School': 0}

# Series.map performs a plain dictionary lookup and yields a numeric column directly.
# Series.replace with a str -> int mapping relies on implicit object-dtype downcasting,
# which pandas 2.2 deprecates.
history['MAPPED_EDUCATION'] = history['EDUCATION'].map(education_mapper)

# map() turns any label missing from the dictionary into NaN; fail fast instead of
# letting such rows silently land on the wrong side of every threshold split.
assert history['MAPPED_EDUCATION'].notna().all(), 'EDUCATION contains a label missing from education_mapper'

X = history[['CAR_TYPE', 'OCCUPATION', 'EDUCATION', 'MAPPED_EDUCATION']]
y = history['CAR_USE']

## QUESTION 1

In [61]:
# 75% / 25% train/test partition; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=60616,
)

In [62]:
# Q1A
# Look the counts up by label. value_counts() orders its result by frequency, so
# positional unpacking would silently swap the two classes whenever Commercial
# happened to outnumber Private in the partition.
train_counts = y_train.value_counts()
priv_train = int(train_counts['Private'])
comm_train = int(train_counts['Commercial'])

# Train
print( 'Private: {} {}'.format(priv_train, (priv_train/(priv_train+comm_train))) )
print( 'Commerc: {} {}'.format(comm_train, (comm_train/(priv_train+comm_train))) )

Private: 4875 0.6309862800931918
Commerc: 2851 0.3690137199068082


In [63]:
# Q1B
# Look the counts up by label. value_counts() orders its result by frequency, so
# positional unpacking would silently swap the two classes whenever Commercial
# happened to outnumber Private in the partition.
test_counts = y_test.value_counts()
priv_test = int(test_counts['Private'])
comm_test = int(test_counts['Commercial'])

# Test
print( 'Private: {} {}'.format(priv_test, (priv_test/(priv_test+comm_test))) )
print( 'Commerc: {} {}'.format(comm_test, (comm_test/(priv_test+comm_test))) )

Private: 1638 0.6358695652173914
Commerc: 938 0.3641304347826087


In [64]:
# Q1C
# Bayes' rule for P(train | Commercial), using partition priors
# P(train) = 0.75 and P(test) = 0.25.
train_use_counts = y_train.value_counts()
test_use_counts = y_test.value_counts()

p_comm_train = train_use_counts['Commercial'] / len(y_train)  # P(Commercial | train)
p_comm_test = test_use_counts['Commercial'] / len(y_test)     # P(Commercial | test)

joint_comm_train = p_comm_train * 0.75   # P(Commercial, train)
joint_comm_test = p_comm_test * 0.25     # P(Commercial, test)
p_train_comm = joint_comm_train / (joint_comm_train + joint_comm_test)

p_train_comm

0.7524894878726821

In [65]:
# Q1D
# Bayes' rule for P(test | Private), using partition priors
# P(train) = 0.75 and P(test) = 0.25.
train_use_counts = y_train.value_counts()
test_use_counts = y_test.value_counts()

p_priv_train = train_use_counts['Private'] / len(y_train)  # P(Private | train)
p_priv_test = test_use_counts['Private'] / len(y_test)     # P(Private | test)

# Marginal P(Private) over both partitions (the denominator of Bayes' rule).
p_priv_marginal = p_priv_train * 0.75 + p_priv_test * 0.25
p_test_priv = (p_priv_test * 0.25) / p_priv_marginal

# Show the two conditionals, the marginal, and finally the posterior.
for shown_value in (p_priv_train, p_priv_test, p_priv_marginal, p_test_priv):
    print(shown_value)

0.6309862800931918
0.6358695652173914
0.6322071013742416
0.25144828484020054


## QUESTION 2

In [18]:
def EntropySplit(inData, split):
    """Weighted entropy of the binary split ``predictor <= split``.

    Parameters
    ----------
    inData : pandas.DataFrame
        Predictor in column 0 and target in column 1 (accessed by position).
    split : scalar
        Threshold; rows with ``predictor <= split`` form one branch, the
        remaining rows form the other.

    Returns
    -------
    float
        Sum over branches of (branch size / total size) * branch entropy,
        with entropy measured in bits (log base 2).

    Notes
    -----
    Prints the cross-tabulation and the entropy of every branch.
    ``inData`` is left untouched: the branch indicator is built as a
    standalone Series rather than being written back as an ``LE_Split``
    column on the caller's frame.
    """
    predictor = inData.iloc[:, 0]
    target = inData.iloc[:, 1]

    # The name is kept so the printed cross-tabulation is labelled 'LE_Split'.
    leSplit = (predictor <= split).rename('LE_Split')

    crossTable = pd.crosstab(index=leSplit, columns=target, margins=True, dropna=True)
    print(crossTable)

    # margins=True appends an 'All' row and an 'All' column; exclude both from
    # the loops so only real branches and real classes are visited.
    nBranches = crossTable.shape[0] - 1
    nClasses = crossTable.shape[1] - 1

    tableEntropy = 0
    for iRow in range(nBranches):
        branchSize = crossTable.iloc[iRow, nClasses]
        rowEntropy = 0
        for iColumn in range(nClasses):
            proportion = crossTable.iloc[iRow, iColumn] / branchSize
            # An empty class contributes nothing (0 * log2(0) is taken as 0).
            if proportion > 0:
                rowEntropy -= proportion * np.log2(proportion)
        print('Row = ', iRow, 'Entropy =', rowEntropy)
        print(' ')
        tableEntropy += rowEntropy * branchSize

    # Normalise by the grand total held in the bottom-right margin cell.
    return tableEntropy / crossTable.iloc[nBranches, nClasses]

In [19]:
# 2A: Root Entropy
# Entropy (in bits) of CAR_USE over the whole training partition, before any split.
train_row_count = y_train.shape[0]
train_class_counts = y_train.value_counts()

prob_train_priv = train_class_counts['Private'] / train_row_count
prob_train_comm = train_class_counts['Commercial'] / train_row_count

root_entropy = -sum(
    class_prob * np.log2(class_prob)
    for class_prob in (prob_train_priv, prob_train_comm)
)
root_entropy

0.9499117892797907

In [30]:
# 2B: Split Criterion
# Cross-tabulate each nominal predictor against the target on the training
# partition; margins=True appends the 'All' totals row and column.
crossTab1, crossTab2, crossTab3 = (
    pd.crosstab(index=X_train[predictor_name], columns=y_train, margins=True, dropna=True)
    for predictor_name in ('CAR_TYPE', 'OCCUPATION', 'EDUCATION')
)

# Three newlines between tables reproduces the spacing of print(table) followed by print('\n').
print(crossTab1, crossTab2, crossTab3, sep='\n\n\n')

CAR_USE      Commercial  Private   All
CAR_TYPE                              
Minivan             422     1606  2028
Panel Truck         638        0   638
Pickup              792      512  1304
SUV                 419     1742  2161
Sports Car          146      737   883
Van                 434      278   712
All                2851     4875  7726


CAR_USE       Commercial  Private   All
OCCUPATION                             
Blue Collar         1309      413  1722
Clerical             220      962  1182
Doctor                 0      233   233
Home Maker            38      579   617
Lawyer                 0      777   777
Manager              227      708   935
Professional         273      800  1073
Student              338      350   688
Unknown              446       53   499
All                 2851     4875  7726


CAR_USE            Commercial  Private   All
EDUCATION                                   
Bachelors                 895     1231  2126
Below High School         257 

In [27]:
def _classEntropy(counts, total):
    """Entropy in bits of a vector of class counts that sum to ``total``."""
    if total == 0:
        # An empty group carries no information.
        return 0
    entropy = 0
    for count in counts:
        proportion = count / total
        # Skip empty classes: 0 * log2(0) is taken as 0, and evaluating it
        # would emit numpy RuntimeWarnings and produce NaN.
        if proportion > 0:
            entropy -= proportion * np.log2(proportion)
    return entropy


def computeSplitEntropy(crossTable):
    """Weighted entropy of the split described by a cross-tabulation.

    Parameters
    ----------
    crossTable : pandas.DataFrame
        Counts with one row per branch and one column per target class,
        followed by a totals row and a totals column in the last positions
        (the layout produced by ``pd.crosstab(..., margins=True)``).

    Returns
    -------
    float
        Sum over branches of (branch total / grand total) * branch entropy.
        Any number of target classes is supported.
    """
    totalsRow = crossTable.shape[0] - 1
    totalsCol = crossTable.shape[1] - 1
    grandTotal = crossTable.iloc[totalsRow, totalsCol]

    splitEntropy = 0
    for r in range(totalsRow):
        branchTotal = crossTable.iloc[r, totalsCol]
        branchEntropy = _classEntropy(crossTable.iloc[r, :totalsCol], branchTotal)
        splitEntropy += (branchTotal / grandTotal) * branchEntropy

    return splitEntropy


def computeEntropyReduction(crossTable):
    """Entropy reduction (information gain) achieved by the split.

    This is the root entropy minus the split entropy. The root entropy is
    computed from the totals row of ``crossTable``; it is not assumed to be
    1.0, which only holds for a perfectly balanced two-class target.

    Parameters
    ----------
    crossTable : pandas.DataFrame
        Same layout as expected by ``computeSplitEntropy``.

    Returns
    -------
    float
        Root entropy minus weighted split entropy, in bits.
    """
    totalsRow = crossTable.shape[0] - 1
    totalsCol = crossTable.shape[1] - 1
    rootEntropy = _classEntropy(
        crossTable.iloc[totalsRow, :totalsCol],
        crossTable.iloc[totalsRow, totalsCol],
    )
    return rootEntropy - computeSplitEntropy(crossTable)

In [77]:
def _binarySplitEntropy(inBranch, target):
    """Print the branch/target cross-tabulation and return its weighted entropy.

    ``inBranch`` is a boolean Series marking the rows of one branch and
    ``target`` is the target Series on the same index.
    """
    crossTable = pd.crosstab(index=inBranch, columns=target, margins=True, dropna=True)
    print(crossTable)

    # margins=True appends an 'All' row and an 'All' column; exclude both from
    # the loops so only real branches and real classes are visited.
    nBranches = crossTable.shape[0] - 1
    nClasses = crossTable.shape[1] - 1

    tableEntropy = 0
    for iRow in range(nBranches):
        branchSize = crossTable.iloc[iRow, nClasses]
        rowEntropy = 0
        for iColumn in range(nClasses):
            proportion = crossTable.iloc[iRow, iColumn] / branchSize
            # An empty class contributes nothing (0 * log2(0) is taken as 0).
            if proportion > 0:
                rowEntropy -= proportion * np.log2(proportion)
        print('Row = ', iRow, 'Entropy =', rowEntropy)
        print(' ')
        tableEntropy += rowEntropy * branchSize

    # Normalise by the grand total held in the bottom-right margin cell.
    return tableEntropy / crossTable.iloc[nBranches, nClasses]


def EntropyIntervalSplit(inData, split):
    """Weighted entropy of the threshold split ``predictor <= split``.

    Parameters
    ----------
    inData : pandas.DataFrame
        Predictor in column 0 and target in column 1 (accessed by position).
    split : scalar
        Threshold separating the two branches.

    Returns
    -------
    float
        Size-weighted average of the branch entropies, in bits.

    Notes
    -----
    Prints the cross-tabulation and per-branch entropies. ``inData`` is not
    modified; no ``LE_Split`` column is written back to the caller's frame.
    """
    predictor = inData.iloc[:, 0]
    # The name is kept so the printed cross-tabulation is labelled 'LE_Split'.
    leSplit = (predictor <= split).rename('LE_Split')
    return _binarySplitEntropy(leSplit, inData.iloc[:, 1])


def EntropyNominalSplit(inData, split):
    """Weighted entropy of the membership split ``predictor in split``.

    Parameters
    ----------
    inData : pandas.DataFrame
        Predictor in column 0 and target in column 1 (accessed by position).
    split : container
        Category values forming one branch; every other value goes to the
        other branch. Membership is tested with Python's ``in`` operator.

    Returns
    -------
    float
        Size-weighted average of the branch entropies, in bits.

    Notes
    -----
    Prints the cross-tabulation and per-branch entropies. ``inData`` is not
    modified; no ``LE_Split`` column is written back to the caller's frame.
    """
    predictor = inData.iloc[:, 0]
    inSplit = pd.Series(
        [value in split for value in predictor],
        index=predictor.index,
        name='LE_Split',
    )
    return _binarySplitEntropy(inSplit, inData.iloc[:, 1])

In [78]:
# Candidate thresholds sit between consecutive MAPPED_EDUCATION codes (0..4);
# the last one, 4.5, lies above the highest code.
for threshold in (0.5, 1.5, 2.5, 3.5, 4.5):
    education_vs_use = pd.concat([X_train['MAPPED_EDUCATION'], y_train], axis=1)
    print(EntropyIntervalSplit(education_vs_use, threshold))

CAR_USE   Commercial  Private   All
LE_Split                           
False           2594     3994  6588
True             257      881  1138
All             2851     4875  7726
Row =  0 Entropy = 0.9671746016358668
 
Row =  1 Entropy = 0.770679454607363
 
0.9382318787108813
CAR_USE   Commercial  Private   All
LE_Split                           
False           1520     2855  4375
True            1331     2020  3351
All             2851     4875  7726
Row =  0 Entropy = 0.9317505619436531
 
Row =  1 Entropy = 0.969286027889095
 
0.9480308294019983
CAR_USE   Commercial  Private   All
LE_Split                           
False            625     1624  2249
True            2226     3251  5477
All             2851     4875  7726
Row =  0 Entropy = 0.8525753860947878
 
Row =  1 Entropy = 0.9745861473094304
 
0.9390694242998872
CAR_USE   Commercial  Private   All
LE_Split                           
False            226      471   697
True            2625     4404  7029
All             2851 

In [66]:
# Preview the two-column frame (predictor first, target second) that is passed to EntropyIntervalSplit.
pd.concat([X_train['MAPPED_EDUCATION'], y_train], axis=1)

Unnamed: 0,MAPPED_EDUCATION,CAR_USE
724,1,Commercial
5793,3,Private
1939,2,Commercial
796,1,Commercial
6048,0,Private
...,...,...
4814,2,Private
1019,4,Commercial
4806,1,Private
3476,4,Commercial


In [95]:
from itertools import combinations

l = ['hey', 'ho', 'hoe']

# Enumerate every non-empty subset of `l`, smallest subsets first.
for subset_size in range(1, len(l) + 1):
    for subset in combinations(l, subset_size):
        print(subset)

('hey',)
('ho',)
('hoe',)
('hey', 'ho')
('hey', 'hoe')
('ho', 'hoe')
('hey', 'ho', 'hoe')
