In [None]:
# import utility modules
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# load raw data
drivers_raw = pd.read_csv('data/drivers.csv')
policies_raw = pd.read_csv('data/policies.csv')
vehicles_raw = pd.read_csv('data/vehicles.csv')

# Summarise each table (column names, non-null counts, dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly: wrapping it in print() would add a stray "None" line after
# every summary.
raw_tables = [
    ("drivers", drivers_raw),
    ("policies", policies_raw),
    ("vehicles", vehicles_raw),
]
for table_name, table in raw_tables:
    print("--{}--".format(table_name))
    table.info()

In [None]:
# Preview the first rows of the drivers table.
# TODO: what is the purpose of the "Unnamed: 0" column? Presumably it is a row
# index that was written into the CSV by an earlier export — confirm before
# dropping it or re-loading the file with index_col=0.
drivers_raw.head()

In [23]:
# Distribution of prior-carrier groups in the policies table.
#
# 'Prior_carrier_grp' holds category labels ('Carrier_1', ..., 'Other', NaN),
# not column names, so its unique values cannot be used to index policies_raw
# (policies_raw[['Carrier_1']] raises KeyError). Count how often each label
# occurs and draw the counts as a bar chart instead.
pol = policies_raw['Prior_carrier_grp'].unique()
carrier_counts = policies_raw['Prior_carrier_grp'].value_counts(dropna=False) # keep NaN as its own bar

# savefig does not create missing folders, so make sure the target exists
hist_dir = Path('figs/histograms')
hist_dir.mkdir(parents=True, exist_ok=True)

fig, ax = plt.subplots()
carrier_counts.plot.bar(ax=ax)
ax.set_title("{}_hist".format('Prior_carrier_grp'))
ax.set_xlabel('Prior_carrier_grp')
ax.set_ylabel('count')
fig.tight_layout()
fig.savefig(hist_dir / '{}_hist.png'.format('Prior_carrier_grp'))
plt.show()
plt.close(fig) # release the figure once it has been saved and shown

pol # display the unique labels for reference

array(['Carrier_1', 'Carrier_4', 'Carrier_7', 'Carrier_3', 'Carrier_5',
       'Carrier_6', nan, 'Carrier_2', 'Carrier_8', 'Other'], dtype=object)

In [None]:
# Preview the first rows of the vehicles table.
vehicles_raw.head()

Variable Identification:
+ Should the variables be classified further (i.e., as discrete, continuous, ordinal, or nominal)?

In [None]:
# separate variables by numeric or categorical
train_raw_categorical = train_raw.select_dtypes(exclude=[np.number])
train_raw_numerical = train_raw.select_dtypes(include=[np.number])

categoric_cols = train_raw_categorical.columns.values
categoric_cols = np.insert(categoric_cols, 0, 'cancel') # move 'cancel' to categorical
categoric_cols = np.append(categoric_cols, 'ni.marital.status') # move 'marital.status' to categorical

# Remove columns from the numeric list by name instead of by position.
# A [2:] slice only drops 'id' and 'cancel' when they happen to be the first two
# numeric columns, and it leaves 'ni.marital.status' listed as numeric even
# though it was just moved to the categorical list.
# NOTE(review): assumes 'ni.marital.status' should no longer be treated as
# numeric once it is categorical — confirm against downstream cells.
not_numeric = ['id', 'cancel', 'ni.marital.status']
numeric_cols = np.array(
    [name for name in train_raw_numerical.columns.values if name not in not_numeric],
    dtype=object,
)

print("Categoric variables: ", categoric_cols)
print("Numeric variables: ", numeric_cols)

In [None]:
def show_unique_values(column_names):
    """Print each named train_raw column followed by its distinct values."""
    for name in column_names:
        print("{}: ".format(name), train_raw[name].unique())


# numeric columns whose distinct values are not listed below
large = ['zip.code', 'ni.age', 'len.at.res', 'premium']

print("--Categoric--")
show_unique_values(categoric_cols)

print("--Numeric--")
show_unique_values(name for name in numeric_cols if name not in large)

The output above shows that the dataset contains many missing values. In addition, some of the constraints implied by the dataset description are violated (e.g., values of `cancel` outside the expected range, and columns with inconsistent data types). A list of proposed constraints is shown below.

Range Constraints:
- cancel must be within [0, 2]
- age must be within [18, 100] (tentative)
- length at residence cannot be negative
- premium cannot be negative

Cross-field Validation Constraints:
- tenure cannot exceed age
- length at residence cannot exceed age
- age and marital status must be logically consistent (tentative)

Uniqueness Constraints:
- all observations must be unique (tentative)

Data-Type Constraints:
- all features should have consistent data-types

Analysis of Missing Values:

In [None]:
# missing data percentage list
print("Missing training data:")

# Share of null entries in each column, as a percentage rounded to 5 decimal places.
pct_missing_by_col = (train_raw.isnull().mean() * 100).round(5)
for colnum, (col, pct_missing) in enumerate(pct_missing_by_col.items()):
    print("{}. {} - {}%".format(colnum, col, pct_missing))

# Overall figure = null cells / all cells in the frame. Adding up the per-column
# percentages does not yield a percentage of anything and can exceed 100.
total_pct_missing = round(train_raw.isnull().to_numpy().mean() * 100, 5)

print("\n{}% missing in total".format(total_pct_missing))

In [None]:
# generate histograms
# One 100-bin histogram per column listed in `large`, saved as a PNG.

# savefig does not create missing folders, so make sure the target exists
hist_dir = Path('figs/histograms')
hist_dir.mkdir(parents=True, exist_ok=True)

for col in large:
    # draw on an explicit figure/axes pair rather than relying on pyplot's
    # implicit "current axes"
    fig, ax = plt.subplots()
    train_raw[col].hist(bins=100, ax=ax) # keep number of bins at 100
    ax.set_title("{}_hist".format(col))
    ax.set_xlabel(col)
    ax.set_ylabel('count')
    fig.savefig(hist_dir / '{}_hist.png'.format(col))
    plt.show()
    plt.close(fig) # release the figure once it has been saved and shown