In [None]:
# import utility modules
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# load raw data
policies_raw = pd.read_csv('data/policies.csv')
print(policies_raw.info())

Variable Identification:
+ should variables be classified further (i.e., discrete, continuous, ordinal, nominal)?

In [None]:
policies_raw_categoric = policies_raw.select_dtypes(exclude=[np.number])
categoric_cols = policies_raw_categoric.columns.values
policies_raw_numeric = policies_raw.select_dtypes(include=[np.number])
numeric_cols = policies_raw_numeric.columns.values

# Some intervention is required for predictors that were not flagged correctly

print("Categoric variables: ", categoric_cols)
print("Numeric variables: ", numeric_cols)

Display unique values for each class:

In [6]:
print("--Categoric--")
ignore = ['county_name']
for col in categoric_cols:
    if col not in ignore:
        print("{}: ".format(col), policies_raw[col].unique())

--Categoric--
Quote_dt:  ['2015-01-28' '2015-02-12' '2018-09-03' ... '2017-05-13' '2017-09-13'
 '2017-02-08']
discount:  ['Yes' 'No']
Home_policy_ind:  ['Y' 'N']
state_id:  ['NY' 'FL' 'NJ' 'CT' 'MN' 'WI' 'AL' 'GA']
quoted_amt:  ['$5,153' '$9,870' '$3,090' ... '$34,048' '$12,780' '$271']
Prior_carrier_grp:  ['Carrier_1' 'Carrier_4' 'Carrier_7' 'Carrier_3' 'Carrier_5' 'Carrier_6'
 nan 'Carrier_2' 'Carrier_8' 'Other']
Cov_package_type:  ['High' 'Low' 'Medium' nan]
policy_id:  ['policy_87209' 'policy_89288' 'policy_91413' ... 'policy_30163'
 'policy_63982' 'policy_54019']
split:  ['Train' 'Test']
primary_parking:  ['home/driveway' 'unknown' 'parking garage' 'street']


In [8]:
print("--Numeric--")
ignore = ['credit_score']
for col in numeric_cols:
    if col not in ignore:
        print("{}: ".format(col), policies_raw[col].unique())

--Numeric--
Unnamed: 0:  [    1     2     3 ... 49160 49161 49162]
zip:  [10465. 12801. 11548. ... 10011. 10022. 10128.]
Agent_cd:  [15973623. 46978147. 32759856. ... 60001931. 63517701. 20775178.]
CAT_zone:  [ 2.  4.  3.  1.  5. nan]
number_drivers:  [2 1 4 3 5 6]
num_loaned_veh:  [1 2 0 3]
num_owned_veh:  [2 1 3]
num_leased_veh:  [0 2 1]
total_number_veh:  [3 4 5 6 2 7 1 8]
convert_ind:  [ 0. nan  1.]


In [None]:
print("--Categoric--")
for col in categoric_cols:
    print("{}: ".format(col), train_raw[col].unique())

print("--Numeric--")
large = ['zip.code', 'ni.age', 'len.at.res', 'premium']
for col in numeric_cols:
    if col not in large:
        print("{}: ".format(col), train_raw[col].unique())

From the above, one can observe that there are many missing values in the dataset. Additionally, some of the constraints implied in the dataset description are not being followed. A list of proposed constraints is shown below.

Membership Constraints:
- state id must be within the set of ratified US states

Range Constraints:
- credit score must be within standard range (i.e., [300, 850])

Cross-validation Constraints:
- county name must be valid with resepect to state id

Regular Expressions:
- zip code must follow standard US format
- policy ID may follow a certain format
- date must follow a valid format that is consistent
- agent code must follow an 8-digit format
- quoted amount must follow conventional monetary format
    + data type of quote amount may be altered to more useable format

Uniqueness Constraints:
- customer identifiers must be unique
- all observations must be unique

Data-Type Constraints:
- all features should have consistent data-types
- Y/N values are preferred to be consistent across dataset

Analysis of Missing Values:

In [9]:
# missing data percentage list
print("Missing training data:")

total_pct_missing, colnum = 0, 0
for col in policies_raw.columns:
    mean_missing = np.mean(policies_raw[col].isnull())
    pct_missing = round(mean_missing * 100, 5) # round to 5 decimal places
    total_pct_missing += pct_missing
    
    print("{}. {} - {}%".format(colnum, col, pct_missing))
    colnum += 1

print("\n{}% missing in total".format(total_pct_missing))

Missing training data:
0. Unnamed: 0 - 0.0%
1. Quote_dt - 0.0%
2. discount - 0.0%
3. Home_policy_ind - 0.0%
4. zip - 0.96009%
5. state_id - 0.0%
6. county_name - 0.0%
7. Agent_cd - 11.04512%
8. quoted_amt - 0.22782%
9. Prior_carrier_grp - 10.17046%
10. credit_score - 0.61023%
11. Cov_package_type - 1.56625%
12. CAT_zone - 0.50852%
13. policy_id - 0.0%
14. number_drivers - 0.0%
15. num_loaned_veh - 0.0%
16. num_owned_veh - 0.0%
17. num_leased_veh - 0.0%
18. total_number_veh - 0.0%
19. convert_ind - 25.00102%
20. split - 0.0%
21. primary_parking - 0.0%

50.089510000000004% missing in total
