In [1]:
import pandas as pd
import numpy as np
import os
import sys


In [2]:
# Location of the cleaned motor-insurance fraud claims CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook
# only runs where this file exists at exactly this location.
datafile = '/data/kelleher/MotorInsuranceFraudClaim_clean.csv'
# Read the CSV into the DataFrame that every later cell works from.
df = pd.read_csv(datafile)
# Show the first rows as a quick sanity check of the load.
df.head()

Unnamed: 0,ID,InsuranceType,IncomeofPolicyHolder,MaritalStatus,NumClaimants,InjuryType,OvernightHospitalStay,ClaimAmount,TotalClaimed,NumClaims,NumSoftTissue,PercentSoftTissue,ClaimAmountReceived,FraudFlag
0,1,CI,0,,2,Soft Tissue,No,1625,3250,2,2.0,1.0,0,1
1,2,CI,0,,2,Back,Yes,15028,60112,1,0.0,0.0,15028,0
2,3,CI,54613,Married,1,Broken Limb,No,-99999,0,0,0.0,0.0,572,0
3,4,CI,0,,3,Serious,Yes,270200,0,0,0.0,0.0,270200,0
4,5,CI,0,,4,Soft Tissue,No,8869,0,0,0.0,0.0,0,1


In [5]:
# Summary statistics for every column (numeric and non-numeric alike),
# transposed so that each feature occupies one row.
df.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
ID,500.0,,,,250.5,144.481833,1.0,125.75,250.5,375.25,500.0
InsuranceType,500.0,1.0,CI,500.0,,,,,,,
IncomeofPolicyHolder,500.0,,,,13739.994,20081.535489,0.0,0.0,0.0,33918.5,71284.0
MaritalStatus,170.0,3.0,Married,99.0,,,,,,,
NumClaimants,500.0,,,,1.908,1.012713,1.0,1.0,2.0,3.0,4.0
InjuryType,500.0,4.0,Broken Limb,177.0,,,,,,,
OvernightHospitalStay,500.0,2.0,No,354.0,,,,,,,
ClaimAmount,500.0,,,,16373.204,29426.27696,-99999.0,3322.25,5663.0,12245.5,270200.0
TotalClaimed,500.0,,,,9597.186,35655.68622,0.0,0.0,0.0,11282.75,729792.0
NumClaims,500.0,,,,0.798,2.666724,0.0,0.0,0.0,1.0,56.0


In [6]:
# List the column labels of the loaded frame.
df.columns

Index(['ID', 'InsuranceType', 'IncomeofPolicyHolder', 'MaritalStatus',
       'NumClaimants', 'InjuryType', 'OvernightHospitalStay', 'ClaimAmount',
       'TotalClaimed', 'NumClaims', 'NumSoftTissue', 'PercentSoftTissue',
       'ClaimAmountReceived', 'FraudFlag'],
      dtype='object')

In [9]:
# Names of the categorical columns examined in the cells below.
cat_features = ['MaritalStatus', 'InjuryType', 'OvernightHospitalStay']
# Name of the label column.
target = 'FraudFlag'
# Display only the categorical columns.
df.loc[:, cat_features]

Unnamed: 0,MaritalStatus,InjuryType,OvernightHospitalStay
0,,Soft Tissue,No
1,,Back,Yes
2,Married,Broken Limb,No
3,,Serious,Yes
4,,Soft Tissue,No
...,...,...,...
495,,Soft Tissue,No
496,Married,Broken Limb,Yes
497,,Broken Limb,Yes
498,Married,Broken Limb,No


In [11]:
# Distinct MaritalStatus values in order of first appearance
# (missing values show up as nan).
df['MaritalStatus'].drop_duplicates().values

array([nan, 'Married', 'Single', 'Divorced'], dtype=object)

In [19]:
# Frequency table of MaritalStatus with a column 'P' holding each level's
# share of the total count in the table.
prob_ms = (
    df['MaritalStatus']
    .value_counts()
    .reset_index()
    .assign(P=lambda tab: tab['count'] / tab['count'].sum())
)
prob_ms

Unnamed: 0,MaritalStatus,count,P
0,Married,99,0.582353
1,Single,48,0.282353
2,Divorced,23,0.135294


In [22]:
# Split the claims by the fraud label.
df_fraud = df[df.FraudFlag == 1]
df_no_fraud = df[df.FraudFlag == 0]

# Per-subset frequency tables of MaritalStatus. Each table is sorted by the
# counts of its own subset, so its row order is NOT guaranteed to match the
# row order of prob_ms.
prob_ms_fraud = df_fraud.MaritalStatus.value_counts().reset_index()
prob_ms_no_fraud = df_no_fraud.MaritalStatus.value_counts().reset_index()

# Attach the conditional probabilities to prob_ms by matching on the
# MaritalStatus label rather than on row position. Positional assignment
# silently puts a probability on the wrong level whenever the subset's count
# ordering differs from the overall ordering.
for prob_col, subset_counts in (('P_fraud', prob_ms_fraud),
                                ('P_no_fraud', prob_ms_no_fraud)):
    level_prob = (
        subset_counts.set_index('MaritalStatus')['count']
        / subset_counts['count'].sum()
    )
    # A level that never occurs in the subset has conditional probability 0.
    prob_ms[prob_col] = prob_ms['MaritalStatus'].map(level_prob).fillna(0.0)

prob_ms

Unnamed: 0,MaritalStatus,count,P,P_fraud,P_no_fraud
0,Married,99,0.582353,0.535714,0.605263
1,Single,48,0.282353,0.232143,0.307018
2,Divorced,23,0.135294,0.232143,0.087719


In [24]:
# Preview the first rows of the categorical columns.
# (The original line ended in a dangling '.', which is a syntax error.)
df[cat_features].head()

Unnamed: 0,MaritalStatus,InjuryType,OvernightHospitalStay
0,,Soft Tissue,No
1,,Back,Yes
2,Married,Broken Limb,No
3,,Serious,Yes
4,,Soft Tissue,No


In [32]:
# Build one long table of level probabilities conditioned on the fraud label:
# one row per (FraudFlag, Feature, level), with the raw count and the share
# of that level within the FraudFlag subset.
prob_tabs = []
for fraud in (0, 1):
    # Rows belonging to the current class label.
    subdf = df.loc[df['FraudFlag'] == fraud]
    for col in cat_features:
        print(f"Fraud: {fraud} Feature: {col}")
        prob = (
            subdf[col]
            .value_counts()
            .reset_index()
            .rename(columns={col: 'level'})
            .assign(
                P_cond=lambda tab: tab['count'] / tab['count'].sum(),
                Feature=col,
                FraudFlag=fraud,
            )
        )
        prob_tabs.append(prob)

# Stack the per-feature, per-class tables; each piece keeps its own 0..k-1 index.
cond_prob_df = pd.concat(prob_tabs)
cond_prob_df

Fraud: 0 Feature: MaritalStatus
Fraud: 0 Feature: InjuryType
Fraud: 0 Feature: OvernightHospitalStay
Fraud: 1 Feature: MaritalStatus
Fraud: 1 Feature: InjuryType
Fraud: 1 Feature: OvernightHospitalStay


Unnamed: 0,level,count,P_cond,Feature,FraudFlag
0,Married,69,0.605263,MaritalStatus,0
1,Single,35,0.307018,MaritalStatus,0
2,Divorced,10,0.087719,MaritalStatus,0
0,Broken Limb,138,0.415663,InjuryType,0
1,Back,82,0.246988,InjuryType,0
2,Soft Tissue,79,0.237952,InjuryType,0
3,Serious,33,0.099398,InjuryType,0
0,No,225,0.677711,OvernightHospitalStay,0
1,Yes,107,0.322289,OvernightHospitalStay,0
0,Married,30,0.535714,MaritalStatus,1


In [37]:
# Reshape the long table to wide form: one row per (level, Feature) pair and
# one column per FraudFlag value, holding P_cond.
prob_tab = cond_prob_df.pivot_table(
    index=['level', 'Feature'],
    columns='FraudFlag',
    values='P_cond',
)

In [39]:
# Show the first rows of the pivoted conditional-probability table.
prob_tab.head()

Unnamed: 0_level_0,FraudFlag,0,1
level,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1
Back,InjuryType,0.246988,0.202381
Broken Limb,InjuryType,0.415663,0.232143
Divorced,MaritalStatus,0.087719,0.232143
Married,MaritalStatus,0.605263,0.535714
No,OvernightHospitalStay,0.677711,0.767857


In [36]:
# Show the first rows of the categorical columns.
df[cat_features].head()

Unnamed: 0,MaritalStatus,InjuryType,OvernightHospitalStay
0,,Soft Tissue,No
1,,Back,Yes
2,Married,Broken Limb,No
3,,Serious,Yes
4,,Soft Tissue,No


In [41]:
# Look up the rows for MaritalStatus == 'Married' (one per FraudFlag value).
cond_prob_df.loc[
    cond_prob_df['Feature'].eq('MaritalStatus')
    & cond_prob_df['level'].eq('Married')
]

Unnamed: 0,level,count,P_cond,Feature,FraudFlag
0,Married,69,0.605263,MaritalStatus,0
0,Married,30,0.535714,MaritalStatus,1
