In [1]:
import pandas as pd
import numpy as np

In [2]:
# Widen the DataFrame display so wide, one-hot encoded frames and long text
# cells are shown instead of being truncated.
# Fully qualified option keys are used on purpose: pandas resolves option names
# by pattern matching, and the short form 'max_columns' is ambiguous on recent
# pandas versions (it also matches 'styler.render.max_columns'), which raises
# OptionError.
pd.set_option('display.max_columns', 120)
pd.set_option('display.max_colwidth', 5000)

In [5]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Default size (width, height in inches) for every figure created afterwards.
plt.rcParams.update({'figure.figsize': (12, 8)})

In [73]:
# Load the raw claim-line export. low_memory=False makes pandas parse the file
# in a single pass instead of in chunks, which avoids mixed-type column inference.
claim_lines = pd.read_csv('~/Work/machine-learning/CLAIM_LINE.csv', low_memory=False)

# Keep only columns that are populated in at least half of the rows.
# dropna(thresh=...) is documented to take an int, so use the integer ceiling of
# len / 2; because non-null counts are integers this keeps exactly the same
# columns as comparing against the fractional value len / 2.
half_count = (len(claim_lines) + 1) // 2

# Drop the sparse columns, then the ID and DESCRIPTION columns.
claim_lines = (
    claim_lines
    .dropna(thresh=half_count, axis=1)
    .drop(['ID', 'DESCRIPTION'], axis=1)
)

In [74]:
# Preview the first five rows of the columns that survived the drops above.
claim_lines.head(n=5)

Unnamed: 0,CLINICAL_CODE,CLAIMED_AMOUNT,AGE_APPLIED,GENDER_CODE
0,E78.9,6187.5,31,F
1,A15,6.71,35,F
2,M54.5,532.62,39,F
3,N76.0,8.46,39,F
4,C68.0,379.13,63,M


In [75]:
# (rows, columns) of the frame at this point in the notebook.
claim_lines.shape

(40479, 4)

In [76]:
# Min-max scale the numeric columns into the [0, 1] range:
#     scaled = (x - min) / (max - min), computed per column.
numeric_cols = ['CLAIMED_AMOUNT', 'AGE_APPLIED']

col_min = claim_lines[numeric_cols].min()
col_range = claim_lines[numeric_cols].max() - col_min

# A constant column has a zero range; dividing by it would turn the whole
# column into NaN (0 / 0). Divide by 1 instead so such a column becomes all 0.
col_range = col_range.replace(0, 1)

# DataFrame-minus-Series arithmetic aligns the Series index with the column
# labels, so each column is shifted and scaled by its own min and range.
claim_lines[numeric_cols] = (claim_lines[numeric_cols] - col_min) / col_range
claim_lines.head()

Unnamed: 0,CLINICAL_CODE,CLAIMED_AMOUNT,AGE_APPLIED,GENDER_CODE
0,E78.9,0.008051,0.319588,F
1,A15,9e-06,0.360825,F
2,M54.5,0.000693,0.402062,F
3,N76.0,1.1e-05,0.402062,F
4,C68.0,0.000493,0.649485,M


In [77]:
# Collapse each clinical code to its first three characters
# (e.g. 'E78.9' -> 'E78') to reduce the number of distinct categories.
# The vectorised .str accessor passes missing codes through as NaN, whereas
# slicing inside apply(lambda x: x[:3]) raises TypeError on a NaN float.
claim_lines['CLINICAL_CODE'] = claim_lines['CLINICAL_CODE'].str[:3]
claim_lines.head()

Unnamed: 0,CLINICAL_CODE,CLAIMED_AMOUNT,AGE_APPLIED,GENDER_CODE
0,E78,0.008051,0.319588,F
1,A15,9e-06,0.360825,F
2,M54,0.000693,0.402062,F
3,N76,1.1e-05,0.402062,F
4,C68,0.000493,0.649485,M


In [82]:
# Put the numeric features first, then let get_dummies expand the categorical
# columns (GENDER_CODE, CLINICAL_CODE) into one indicator column per value.
claim_lines = pd.get_dummies(
    claim_lines.loc[:, ['CLAIMED_AMOUNT', 'AGE_APPLIED', 'GENDER_CODE', 'CLINICAL_CODE']]
)
claim_lines.head(n=5)

Unnamed: 0,CLAIMED_AMOUNT,AGE_APPLIED,GENDER_CODE_F,GENDER_CODE_M,CLINICAL_CODE_A01,CLINICAL_CODE_A04,CLINICAL_CODE_A05,CLINICAL_CODE_A06,CLINICAL_CODE_A08,CLINICAL_CODE_A09,CLINICAL_CODE_A15,CLINICAL_CODE_A17,CLINICAL_CODE_A18,CLINICAL_CODE_A19,CLINICAL_CODE_A25,CLINICAL_CODE_A28,CLINICAL_CODE_A41,CLINICAL_CODE_A49,CLINICAL_CODE_A53,CLINICAL_CODE_A60,CLINICAL_CODE_A64,CLINICAL_CODE_A66,CLINICAL_CODE_A86,CLINICAL_CODE_A88,CLINICAL_CODE_A90,CLINICAL_CODE_A91,CLINICAL_CODE_A92,CLINICAL_CODE_A99,CLINICAL_CODE_B00,CLINICAL_CODE_B01,CLINICAL_CODE_B02,CLINICAL_CODE_B06,CLINICAL_CODE_B07,CLINICAL_CODE_B08,CLINICAL_CODE_B09,CLINICAL_CODE_B15,CLINICAL_CODE_B16,CLINICAL_CODE_B17,CLINICAL_CODE_B18,CLINICAL_CODE_B19,CLINICAL_CODE_B26,CLINICAL_CODE_B30,CLINICAL_CODE_B33,CLINICAL_CODE_B34,CLINICAL_CODE_B35,CLINICAL_CODE_B36,CLINICAL_CODE_B37,CLINICAL_CODE_B38,CLINICAL_CODE_B49,CLINICAL_CODE_B65,CLINICAL_CODE_B82,CLINICAL_CODE_B85,CLINICAL_CODE_B86,CLINICAL_CODE_B95,CLINICAL_CODE_B96,CLINICAL_CODE_B99,CLINICAL_CODE_C11,CLINICAL_CODE_C17,CLINICAL_CODE_C18,CLINICAL_CODE_C20,...,CLINICAL_CODE_S74,CLINICAL_CODE_S76,CLINICAL_CODE_S79,CLINICAL_CODE_S80,CLINICAL_CODE_S81,CLINICAL_CODE_S82,CLINICAL_CODE_S83,CLINICAL_CODE_S84,CLINICAL_CODE_S86,CLINICAL_CODE_S89,CLINICAL_CODE_S90,CLINICAL_CODE_S91,CLINICAL_CODE_S92,CLINICAL_CODE_S93,CLINICAL_CODE_S96,CLINICAL_CODE_S97,CLINICAL_CODE_S99,CLINICAL_CODE_T07,CLINICAL_CODE_T14,CLINICAL_CODE_T15,CLINICAL_CODE_T16,CLINICAL_CODE_T17,CLINICAL_CODE_T18,CLINICAL_CODE_T20,CLINICAL_CODE_T21,CLINICAL_CODE_T22,CLINICAL_CODE_T24,CLINICAL_CODE_T25,CLINICAL_CODE_T30,CLINICAL_CODE_T31,CLINICAL_CODE_T58,CLINICAL_CODE_T61,CLINICAL_CODE_T67,CLINICAL_CODE_T78,CLINICAL_CODE_T79,CLINICAL_CODE_T80,CLINICAL_CODE_T88,CLINICAL_CODE_V19,CLINICAL_CODE_V28,CLINICAL_CODE_V29,CLINICAL_CODE_V49,CLINICAL_CODE_W19,CLINICAL_CODE_W53,CLINICAL_CODE_W54,CLINICAL_CODE_W55,CLINICAL_CODE_W57,CLINICAL_CODE_X00,CLINICAL_CODE_Z00,CLINICAL_CODE_Z01,CLINICAL_CODE_Z04,CLINICAL_CODE_Z11,CLIN
ICAL_CODE_Z13,CLINICAL_CODE_Z30,CLINICAL_CODE_Z31,CLINICAL_CODE_Z32,CLINICAL_CODE_Z39,CLINICAL_CODE_Z88,CLINICAL_CODE_Z90,CLINICAL_CODE_Z96,CLINICAL_CODE_Z98
0,0.008051,0.319588,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,9e-06,0.360825,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.000693,0.402062,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.1e-05,0.402062,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.000493,0.649485,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [85]:
# Persist the cleaned, encoded frame as CSV.
# index=False keeps the row index out of the file.
claim_lines.to_csv(
    path_or_buf='~/Work/machine-learning/CLAIM_LINE_CLEAN.csv',
    index=False,
)

In [79]:
# Map gender M/F to 1/0 (M -> 1, F -> 0)
# No longer needed because pd.get_dummies() now one-hot encodes GENDER_CODE

#claim_lines['GENDER_CODE'] = claim_lines['GENDER_CODE'].map({'M': 1, 'F': 0})
#claim_lines.head()

In [87]:
# Read the saved file back and preview the first five rows as a sanity check.
pd.read_csv(filepath_or_buffer='~/Work/machine-learning/CLAIM_LINE_CLEAN.csv').head(n=5)

Unnamed: 0,CLAIMED_AMOUNT,AGE_APPLIED,GENDER_CODE_F,GENDER_CODE_M,CLINICAL_CODE_A01,CLINICAL_CODE_A04,CLINICAL_CODE_A05,CLINICAL_CODE_A06,CLINICAL_CODE_A08,CLINICAL_CODE_A09,CLINICAL_CODE_A15,CLINICAL_CODE_A17,CLINICAL_CODE_A18,CLINICAL_CODE_A19,CLINICAL_CODE_A25,CLINICAL_CODE_A28,CLINICAL_CODE_A41,CLINICAL_CODE_A49,CLINICAL_CODE_A53,CLINICAL_CODE_A60,CLINICAL_CODE_A64,CLINICAL_CODE_A66,CLINICAL_CODE_A86,CLINICAL_CODE_A88,CLINICAL_CODE_A90,CLINICAL_CODE_A91,CLINICAL_CODE_A92,CLINICAL_CODE_A99,CLINICAL_CODE_B00,CLINICAL_CODE_B01,CLINICAL_CODE_B02,CLINICAL_CODE_B06,CLINICAL_CODE_B07,CLINICAL_CODE_B08,CLINICAL_CODE_B09,CLINICAL_CODE_B15,CLINICAL_CODE_B16,CLINICAL_CODE_B17,CLINICAL_CODE_B18,CLINICAL_CODE_B19,CLINICAL_CODE_B26,CLINICAL_CODE_B30,CLINICAL_CODE_B33,CLINICAL_CODE_B34,CLINICAL_CODE_B35,CLINICAL_CODE_B36,CLINICAL_CODE_B37,CLINICAL_CODE_B38,CLINICAL_CODE_B49,CLINICAL_CODE_B65,CLINICAL_CODE_B82,CLINICAL_CODE_B85,CLINICAL_CODE_B86,CLINICAL_CODE_B95,CLINICAL_CODE_B96,CLINICAL_CODE_B99,CLINICAL_CODE_C11,CLINICAL_CODE_C17,CLINICAL_CODE_C18,CLINICAL_CODE_C20,...,CLINICAL_CODE_S74,CLINICAL_CODE_S76,CLINICAL_CODE_S79,CLINICAL_CODE_S80,CLINICAL_CODE_S81,CLINICAL_CODE_S82,CLINICAL_CODE_S83,CLINICAL_CODE_S84,CLINICAL_CODE_S86,CLINICAL_CODE_S89,CLINICAL_CODE_S90,CLINICAL_CODE_S91,CLINICAL_CODE_S92,CLINICAL_CODE_S93,CLINICAL_CODE_S96,CLINICAL_CODE_S97,CLINICAL_CODE_S99,CLINICAL_CODE_T07,CLINICAL_CODE_T14,CLINICAL_CODE_T15,CLINICAL_CODE_T16,CLINICAL_CODE_T17,CLINICAL_CODE_T18,CLINICAL_CODE_T20,CLINICAL_CODE_T21,CLINICAL_CODE_T22,CLINICAL_CODE_T24,CLINICAL_CODE_T25,CLINICAL_CODE_T30,CLINICAL_CODE_T31,CLINICAL_CODE_T58,CLINICAL_CODE_T61,CLINICAL_CODE_T67,CLINICAL_CODE_T78,CLINICAL_CODE_T79,CLINICAL_CODE_T80,CLINICAL_CODE_T88,CLINICAL_CODE_V19,CLINICAL_CODE_V28,CLINICAL_CODE_V29,CLINICAL_CODE_V49,CLINICAL_CODE_W19,CLINICAL_CODE_W53,CLINICAL_CODE_W54,CLINICAL_CODE_W55,CLINICAL_CODE_W57,CLINICAL_CODE_X00,CLINICAL_CODE_Z00,CLINICAL_CODE_Z01,CLINICAL_CODE_Z04,CLINICAL_CODE_Z11,CLIN
ICAL_CODE_Z13,CLINICAL_CODE_Z30,CLINICAL_CODE_Z31,CLINICAL_CODE_Z32,CLINICAL_CODE_Z39,CLINICAL_CODE_Z88,CLINICAL_CODE_Z90,CLINICAL_CODE_Z96,CLINICAL_CODE_Z98
0,0.008051,0.319588,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,9e-06,0.360825,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0.000693,0.402062,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1.1e-05,0.402062,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0.000493,0.649485,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
