In [36]:
# import utility modules
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# load raw data
train_raw = pd.read_csv('data/train.csv')
print(train_raw.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 18 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   id                 1048575 non-null  int64  
 1   cancel             1048575 non-null  int64  
 2   year               1048575 non-null  int64  
 3   zip.code           1047624 non-null  float64
 4   house.color        1047630 non-null  object 
 5   ni.age             1047573 non-null  float64
 6   len.at.res         1047608 non-null  float64
 7   credit             1047668 non-null  object 
 8   coverage.type      1047595 non-null  object 
 9   dwelling.type      1047582 non-null  object 
 10  premium            1047618 non-null  float64
 11  sales.channel      1047550 non-null  object 
 12  ni.gender          1047615 non-null  object 
 13  ni.marital.status  1047581 non-null  float64
 14  n.adults           1047644 non-null  float64
 15  n.children         1047637 non-n

Variable Identification:
+ should variables be classified further (i.e., discrete, continuous, ordinal, nominal)?

In [37]:
# separate variables by numeric or categorical
train_raw_categorical = train_raw.select_dtypes(exclude=[np.number])
categoric_cols = train_raw_categorical.columns.values
train_raw_numerical = train_raw.select_dtypes(include=[np.number])
numeric_cols = train_raw_numerical.columns.values

categoric_cols = np.insert(categoric_cols, 0, 'cancel') # move 'cancel' to categorical
categoric_cols = np.append(categoric_cols, 'ni.marital.status') # move 'marital.status' to categorical
numeric_cols = numeric_cols[2:] # remove 'id' and 'cancel' from numerical

print("Categoric variables: ", categoric_cols)
print("Numeric variables: ", numeric_cols)

Categoric variables:  ['cancel' 'house.color' 'credit' 'coverage.type' 'dwelling.type'
 'sales.channel' 'ni.gender' 'ni.marital.status']
Numeric variables:  ['year' 'zip.code' 'ni.age' 'len.at.res' 'premium' 'ni.marital.status'
 'n.adults' 'n.children' 'tenure' 'claim.ind']


In [46]:
print("--Categoric--")
for col in categoric_cols:
    print("{}: ".format(col), train_raw[col].unique())

print("--Numeric--")
large = ['zip.code', 'ni.age', 'len.at.res', 'premium']
for col in numeric_cols:
    if col not in large:
        print("{}: ".format(col), train_raw[col].unique())

--Categoric--
cancel:  [ 0  2  1 -1]
house.color:  ['blue' 'white' 'red' 'yellow' nan]
credit:  ['high' 'medium' 'low' nan]
coverage.type:  ['C' 'A' 'B' nan]
dwelling.type:  ['Tenant' 'Condo' 'House' nan]
sales.channel:  ['Broker' 'Phone' 'Online' nan]
ni.gender:  ['F' 'M' nan]
ni.marital.status:  [ 0.  1. nan]
--Numeric--
year:  [2013 2014 2015 2016]
ni.marital.status:  [ 0.  1. nan]
n.adults:  [ 2.  5.  1.  4.  3.  6.  7. nan  8.  9. 11. 10. 12.]
n.children:  [ 0.  4.  2.  3.  1.  9.  6.  5. 11.  7.  8. 12. 10. nan]
tenure:  [15. 14. 22.  4. 16.  3.  7.  8. 12. 17.  0. 13.  1. 21.  9. 11. 20.  5.
  6. 18.  2. 10. 19. nan 26. 25. 24. 23. 27. 29. 31. 28. 30. 33. 32. 34.]
claim.ind:  [ 0.  1. nan]


From the above, one can observe that there are many missing values in the dataset. Additionally, some of the constraints implied in the dataset description are not being followed (i.e., values of cancel outside expected range, datatypes that are not consistent). A list of proposed constraints is shown below.

Range Constraints:
- cancel must be within [0, 2]
- age must be within [18, 100] (tentative)
- length at residence cannot be negative
- premium cannot be negative

Cross-validation Constraints:
- tenure cannot exceed age
- length at residence cannot exceed age
- age and marital status must be logically consistent (tentative)

Uniqueness Constraints:
- all observations must be unique (tentative)

Data-Type Constraints:
- all features should have consistent data-types

Analysis of Missing Values:

In [45]:
# missing data percentage list
print("Missing training data:")

total_pct_missing, colnum = 0, 0
for col in train_raw.columns:
    mean_missing = np.mean(train_raw[col].isnull())
    pct_missing = round(mean_missing * 100, 5) # round to 5 decimal places
    total_pct_missing += pct_missing
    
    print("{}. {} - {}%".format(colnum, col, pct_missing))
    colnum += 1

print("\n{}% missing in total".format(total_pct_missing))

Missing training data:
0. id - 0.0%
1. cancel - 0.0%
2. year - 0.0%
3. zip.code - 0.09069%
4. house.color - 0.09012%
5. ni.age - 0.09556%
6. len.at.res - 0.09222%
7. credit - 0.0865%
8. coverage.type - 0.09346%
9. dwelling.type - 0.0947%
10. premium - 0.09127%
11. sales.channel - 0.09775%
12. ni.gender - 0.09155%
13. ni.marital.status - 0.0948%
14. n.adults - 0.08879%
15. n.children - 0.08945%
16. tenure - 0.09346%
17. claim.ind - 0.09403%

1.38435% missing in total


In [None]:
""" # generate histograms
for col in numeric_cols:
    hist = train_clean[[col]].hist(bins=100) # keep number of bins at 100
    plt.title("{}_hist".format(col))
    plt.savefig('figs/histograms/{}_hist.png'.format(col)) """