In [8]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
from datetime import datetime

In [9]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

### Load data

In [10]:
train_inpt = pd.read_csv('Train_Inpatientdata.csv', index_col = 0)
train_outpt = pd.read_csv('Train_Outpatientdata.csv', index_col = 0)
train_beneficiary = pd.read_csv('Train_Beneficiarydata.csv', index_col = 0)
train = pd.read_csv('Train_FraudFlag.csv', index_col = 0)

In [11]:
# Create Encounter column to indicate inpatient vs outpatient
train_inpt['Encounter'] = 1
train_outpt['Encounter'] = 0

### Combine all data sets

In [12]:
# Append inpatient and outpatient data set
temp_df = train_inpt.append(train_outpt)

# Combine inpatient and outpatient data with beneficiary data
combined_beneficiaries = pd.merge(temp_df, train_beneficiary, on = 'BeneID', how='outer')

# Combine train provider fraudulent tags with all data
data = pd.merge(combined_beneficiaries, train, on = 'Provider', how='outer')
data.head()


Unnamed: 0,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DischargeDt,DiagnosisGroupCode,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6,Encounter,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,PotentialFraud
0,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,7866,1068.0,2009-04-18,201.0,1970,4019,5853,7843,2768,71590,2724,19889.0,5849,,,,,,,,1,1943-01-01,,1,1,0,39,230,12,12,1,2,1,2,2,1,1,1,2,1,1,36000,3204,60,70,Yes
1,CLM565430,2009-09-06,2009-09-06,PRV55912,50,PHY365867,PHY327147,,,,0.0,,,V7183,53081,78959,4280,E8788,78079,79902,25002.0,71848,,,,,,,,0,1932-05-01,,1,1,0,39,310,12,12,1,1,1,2,2,2,2,1,2,2,2,24000,2136,450,200,Yes
2,CLM34721,2009-01-20,2009-02-01,PRV55912,19000,PHY349293,PHY370861,PHY363291,2009-01-20,45340,1068.0,2009-02-01,987.0,4240,2639,2948,40390,45821,28489,5854,2753.0,E9305,,7769.0,5849.0,,,,,1,1913-12-01,,2,1,0,39,230,12,12,1,1,1,2,1,2,2,1,2,2,2,19000,1068,100,20,Yes
3,CLM72336,2009-10-17,2009-11-04,PRV55912,17000,PHY334706,PHY334706,,2009-10-17,V5789,1068.0,2009-11-04,941.0,V5789,4168,73313,7812,7993,78830,72273,43812.0,4019,,9338.0,,,,,,1,1922-10-01,,1,1,0,39,600,12,12,2,2,2,2,2,2,2,1,1,2,2,17000,1068,1050,540,Yes
4,CLM73394,2009-10-25,2009-10-29,PRV55912,13000,PHY390614,PHY323689,PHY363291,2009-10-25,71946,1068.0,2009-10-29,506.0,71535,71960,4019,V1202,4240,2449,2768,,,,8154.0,,,,,,1,1930-07-01,,2,1,0,39,280,12,12,2,1,2,2,1,2,1,1,1,1,2,27000,2136,450,160,Yes


In [83]:
chron_cond = ['ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
       'ChronicCond_stroke']
for x in chron_cond:
    data[x].replace({2:0}, inplace=True)

## Examine frequencies of diagnosis codes

In [13]:
diag_data = data[['ClmDiagnosisCode_1','ClmDiagnosisCode_2','ClmDiagnosisCode_3','ClmDiagnosisCode_4','ClmDiagnosisCode_5','ClmDiagnosisCode_6','ClmDiagnosisCode_7','ClmDiagnosisCode_8','ClmDiagnosisCode_9','ClmDiagnosisCode_10']]

In [20]:
diag_data.head()

Unnamed: 0,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10
0,1970,4019,5853,7843,2768,71590,2724,19889.0,5849,
1,V7183,53081,78959,4280,E8788,78079,79902,25002.0,71848,
2,4240,2639,2948,40390,45821,28489,5854,2753.0,E9305,
3,V5789,4168,73313,7812,7993,78830,72273,43812.0,4019,
4,71535,71960,4019,V1202,4240,2449,2768,,,


In [22]:
diag_code_freq = diag_data.apply(pd.value_counts).reset_index()

In [30]:
diag_code_freq.head()

Unnamed: 0,index,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,sum_codes
0,10,7.0,,,,,,,,,,7.0
1,11,1.0,,,,,,,,,,1.0
2,19,3.0,,,,,,,,,,3.0
3,20,4.0,,,,,,,,,,4.0
4,21,5.0,,,,,,,,,,5.0


In [24]:
diag_code_freq['sum_codes'] = diag_code_freq.apply(lambda x: x[['ClmDiagnosisCode_1','ClmDiagnosisCode_2','ClmDiagnosisCode_3','ClmDiagnosisCode_4','ClmDiagnosisCode_5','ClmDiagnosisCode_6','ClmDiagnosisCode_7','ClmDiagnosisCode_8','ClmDiagnosisCode_9','ClmDiagnosisCode_10']].sum(), axis=1)


In [25]:
sort_diag_code = diag_code_freq.sort_values('sum_codes', ascending =False).reset_index(drop=True)

In [26]:
sort_diag_code

Unnamed: 0,index,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,sum_codes
0,4019,13886.0,22378.0,14408.0,9188.0,6005.0,4170.0,3014.0,2257.0,1581.0,169.0,77056.0
1,25000,2013.0,11744.0,7946.0,5250.0,3451.0,2506.0,1822.0,1399.0,1100.0,125.0,37356.0
2,2724,3606.0,11272.0,7368.0,4764.0,2945.0,2072.0,1536.0,1178.0,922.0,100.0,35763.0
3,V5869,1030.0,9669.0,6069.0,3367.0,1921.0,1172.0,780.0,528.0,346.0,22.0,24904.0
4,4011,12512.0,4615.0,2884.0,1671.0,915.0,496.0,341.0,214.0,120.0,5.0,23773.0
5,42731,2619.0,5845.0,3738.0,2438.0,1632.0,1312.0,1026.0,809.0,652.0,67.0,20138.0
6,V5861,1218.0,9716.0,4180.0,1893.0,1093.0,704.0,500.0,403.0,276.0,18.0,20001.0
7,2720,3210.0,5117.0,3469.0,2274.0,1377.0,983.0,787.0,554.0,443.0,54.0,18268.0
8,2449,416.0,5687.0,3832.0,2523.0,1647.0,1202.0,912.0,751.0,559.0,71.0,17600.0
9,4280,1337.0,4073.0,2891.0,2043.0,1426.0,1224.0,965.0,795.0,665.0,88.0,15507.0


## Create final list of diagnosis codes that appear more than 10% of the time

In [32]:
final_diag_codes = sort_diag_code[sort_diag_code['sum_codes'] > len(data)*.01]
final_diag_codes = final_diag_codes.rename(columns={'index': 'code'})
final_diag_codes.head()

Unnamed: 0,code,ClmDiagnosisCode_1,ClmDiagnosisCode_2,ClmDiagnosisCode_3,ClmDiagnosisCode_4,ClmDiagnosisCode_5,ClmDiagnosisCode_6,ClmDiagnosisCode_7,ClmDiagnosisCode_8,ClmDiagnosisCode_9,ClmDiagnosisCode_10,sum_codes
0,4019,13886.0,22378.0,14408.0,9188.0,6005.0,4170.0,3014.0,2257.0,1581.0,169.0,77056.0
1,25000,2013.0,11744.0,7946.0,5250.0,3451.0,2506.0,1822.0,1399.0,1100.0,125.0,37356.0
2,2724,3606.0,11272.0,7368.0,4764.0,2945.0,2072.0,1536.0,1178.0,922.0,100.0,35763.0
3,V5869,1030.0,9669.0,6069.0,3367.0,1921.0,1172.0,780.0,528.0,346.0,22.0,24904.0
4,4011,12512.0,4615.0,2884.0,1671.0,915.0,496.0,341.0,214.0,120.0,5.0,23773.0


## Examine frequencies of procedure codes

In [33]:
Proc_data = data[['ClmProcedureCode_1','ClmProcedureCode_2','ClmProcedureCode_3','ClmProcedureCode_4','ClmProcedureCode_5','ClmProcedureCode_6']]


In [34]:
proc_code_freq = Proc_data.apply(pd.value_counts).reset_index()

In [35]:
proc_code_freq.head()

Unnamed: 0,index,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6
0,11.0,1.0,,,,,
1,13.0,8.0,,,,,
2,14.0,6.0,,,,,
3,15.0,1.0,,,,,
4,17.0,27.0,,,,,


In [36]:
proc_code_freq['sum_codes'] = proc_code_freq.apply(lambda x: x[['ClmProcedureCode_1','ClmProcedureCode_2','ClmProcedureCode_3','ClmProcedureCode_4','ClmProcedureCode_5','ClmProcedureCode_6']].sum(), axis=1)


In [37]:
sort_proc_code = proc_code_freq.sort_values('sum_codes', ascending =False).reset_index(drop=True)

In [257]:
sort_proc_code

Unnamed: 0,index,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6,sum_codes
0,4019.0,4.0,1736.0,201.0,18.0,,,1959.0
1,9904.0,1152.0,,,,,,1152.0
2,2724.0,1.0,843.0,192.0,16.0,2.0,,1054.0
3,8154.0,1022.0,,,,,,1022.0
4,66.0,900.0,1.0,,,,,901.0
5,3893.0,854.0,,,,,,854.0
6,3995.0,809.0,,,,,,809.0
7,4516.0,651.0,,,,,,651.0
8,3722.0,589.0,,,,,,589.0
9,8151.0,463.0,,,,,,463.0


In [39]:
print((len(data[data['Encounter'] == 1])*.01))


404.74


In [334]:
len(sort_proc_code[sort_proc_code['sum_codes']>404])

11

## Create final list of top procedure codes that appear more than 10% of the time

In [40]:
final_proc_codes = sort_proc_code[sort_proc_code['sum_codes'] > 404]
final_proc_codes = final_proc_codes.rename(columns={'index': 'code'})
final_proc_codes

Unnamed: 0,code,ClmProcedureCode_1,ClmProcedureCode_2,ClmProcedureCode_3,ClmProcedureCode_4,ClmProcedureCode_5,ClmProcedureCode_6,sum_codes
0,4019.0,4.0,1736.0,201.0,18.0,,,1959.0
1,9904.0,1152.0,,,,,,1152.0
2,2724.0,1.0,843.0,192.0,16.0,2.0,,1054.0
3,8154.0,1022.0,,,,,,1022.0
4,66.0,900.0,1.0,,,,,901.0
5,3893.0,854.0,,,,,,854.0
6,3995.0,809.0,,,,,,809.0
7,4516.0,651.0,,,,,,651.0
8,3722.0,589.0,,,,,,589.0
9,8151.0,463.0,,,,,,463.0


In [42]:
final_proc_codes['code'] = final_proc_codes.apply(lambda x: int(x.code), axis=1)


### Create Columns of Top Claim Codes and Indicate Presence of Code with Binary Codes

### Diagnosis Code Columns

In [43]:
diag_codes = ['4019','25000','2724','V5869','4011','42731','V5861','2720','2449','4280','53081','41401',
                   '496','2859','41400','78079','5990','28521','3051','2809','311','73300','58881','71590',
                   '5859','V4581','2722']
diag_cols = [x for x in data.columns if "ClmDiagnosisCode" in x]

for col in diag_codes:
    data[str(col)+'_diag'] = np.where((data[diag_cols].eq(col)).any(1, skipna=True), 1, 0)


In [44]:
# Drop ClmDiagnosisCode columns
data.drop(data[diag_cols], inplace=True, axis=1)

### Procedure Code Columns

In [47]:
proc_codes = [4019, 9904, 2724, 8154, 66, 3893, 3995, 4516, 3722, 8151, 8872]
proc_cols = [x for x in data.columns if "ClmProcedureCode" in x]

for col in proc_codes:
    data[str(col)+'_proc'] = np.where((data[proc_cols].eq(col)).any(1, skipna=True), 1, 0)

In [48]:
# Drop ClmProcedureCode columns
data.drop(data[proc_cols], inplace=True, axis=1)

In [49]:
data.head()

Unnamed: 0,ClaimID,ClaimStartDt,ClaimEndDt,Provider,InscClaimAmtReimbursed,AttendingPhysician,OperatingPhysician,OtherPhysician,AdmissionDt,ClmAdmitDiagnosisCode,DeductibleAmtPaid,DischargeDt,DiagnosisGroupCode,Encounter,DOB,DOD,Gender,Race,RenalDiseaseIndicator,State,County,NoOfMonths_PartACov,NoOfMonths_PartBCov,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,IPAnnualReimbursementAmt,IPAnnualDeductibleAmt,OPAnnualReimbursementAmt,OPAnnualDeductibleAmt,PotentialFraud,4019_diag,25000_diag,2724_diag,V5869_diag,4011_diag,42731_diag,V5861_diag,2720_diag,2449_diag,4280_diag,53081_diag,41401_diag,496_diag,2859_diag,41400_diag,78079_diag,5990_diag,28521_diag,3051_diag,2809_diag,311_diag,73300_diag,58881_diag,71590_diag,5859_diag,V4581_diag,2722_diag,4019_proc,9904_proc,2724_proc,8154_proc,66_proc,3893_proc,3995_proc,4516_proc,3722_proc,8151_proc,8872_proc
0,CLM46614,2009-04-12,2009-04-18,PRV55912,26000,PHY390922,,,2009-04-12,7866,1068.0,2009-04-18,201.0,1,1943-01-01,,1,1,0,39,230,12,12,1,2,1,2,2,1,1,1,2,1,1,36000,3204,60,70,Yes,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,CLM565430,2009-09-06,2009-09-06,PRV55912,50,PHY365867,PHY327147,,,,0.0,,,0,1932-05-01,,1,1,0,39,310,12,12,1,1,1,2,2,2,2,1,2,2,2,24000,2136,450,200,Yes,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,CLM34721,2009-01-20,2009-02-01,PRV55912,19000,PHY349293,PHY370861,PHY363291,2009-01-20,45340,1068.0,2009-02-01,987.0,1,1913-12-01,,2,1,0,39,230,12,12,1,1,1,2,1,2,2,1,2,2,2,19000,1068,100,20,Yes,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,CLM72336,2009-10-17,2009-11-04,PRV55912,17000,PHY334706,PHY334706,,2009-10-17,V5789,1068.0,2009-11-04,941.0,1,1922-10-01,,1,1,0,39,600,12,12,2,2,2,2,2,2,2,1,1,2,2,17000,1068,1050,540,Yes,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,CLM73394,2009-10-25,2009-10-29,PRV55912,13000,PHY390614,PHY323689,PHY363291,2009-10-25,71946,1068.0,2009-10-29,506.0,1,1930-07-01,,2,1,0,39,280,12,12,2,1,2,2,1,2,1,1,1,1,2,27000,2136,450,160,Yes,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


### Prepare data for Latent Dirichlet Allocation

In [347]:
data.columns

Index(['ClaimID', 'ClaimStartDt', 'ClaimEndDt', 'Provider',
       'InscClaimAmtReimbursed', 'AttendingPhysician', 'OperatingPhysician',
       'OtherPhysician', 'AdmissionDt', 'ClmAdmitDiagnosisCode',
       'DeductibleAmtPaid', 'DischargeDt', 'DiagnosisGroupCode', 'Encounter',
       'DOB', 'DOD', 'Gender', 'Race', 'RenalDiseaseIndicator', 'State',
       'County', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'ChronicCond_Alzheimer', 'ChronicCond_Heartfailure',
       'ChronicCond_KidneyDisease', 'ChronicCond_Cancer',
       'ChronicCond_ObstrPulmonary', 'ChronicCond_Depression',
       'ChronicCond_Diabetes', 'ChronicCond_IschemicHeart',
       'ChronicCond_Osteoporasis', 'ChronicCond_rheumatoidarthritis',
       'ChronicCond_stroke', 'IPAnnualReimbursementAmt',
       'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
       'OPAnnualDeductibleAmt', 'PotentialFraud', '4019_diag', '25000_diag',
       '2724_diag', 'V5869_diag', '4011_diag', '42731_diag', 'V5861_diag',
    

In [87]:
del_col = ['Provider','ClaimID', 'ClaimStartDt', 'ClaimEndDt',
       'InscClaimAmtReimbursed', 'AttendingPhysician', 'OperatingPhysician',
       'OtherPhysician', 'AdmissionDt', 'ClmAdmitDiagnosisCode',
       'DeductibleAmtPaid', 'DischargeDt', 'DiagnosisGroupCode',
       'Encounter', 'DOB', 'DOD', 'Gender', 'Race', 'RenalDiseaseIndicator',
       'State', 'County', 'NoOfMonths_PartACov', 'NoOfMonths_PartBCov',
       'IPAnnualReimbursementAmt',
       'IPAnnualDeductibleAmt', 'OPAnnualReimbursementAmt',
       'OPAnnualDeductibleAmt', 'PotentialFraud']
df_toLDA = data.drop(del_col, axis=1)

In [70]:
df_toLDA.head()

Unnamed: 0,ChronicCond_Alzheimer,ChronicCond_Heartfailure,ChronicCond_KidneyDisease,ChronicCond_Cancer,ChronicCond_ObstrPulmonary,ChronicCond_Depression,ChronicCond_Diabetes,ChronicCond_IschemicHeart,ChronicCond_Osteoporasis,ChronicCond_rheumatoidarthritis,ChronicCond_stroke,4019_diag,25000_diag,2724_diag,V5869_diag,4011_diag,42731_diag,V5861_diag,2720_diag,2449_diag,4280_diag,53081_diag,41401_diag,496_diag,2859_diag,41400_diag,78079_diag,5990_diag,28521_diag,3051_diag,2809_diag,311_diag,73300_diag,58881_diag,71590_diag,5859_diag,V4581_diag,2722_diag,4019_proc,9904_proc,2724_proc,8154_proc,66_proc,3893_proc,3995_proc,4516_proc,3722_proc,8151_proc,8872_proc
0,1,2,1,2,2,1,1,1,2,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,2,2,2,2,1,2,2,2,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,2,1,2,2,1,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,2,2,2,2,2,2,1,1,2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2,1,2,2,1,2,1,1,1,1,2,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [71]:
df_toLDA.shape

(558211, 49)

In [96]:
import gensim
import gensim.corpora as corpora


In [99]:
# Function to create dictionary of code and word
def my_dict(df):
    my_dictionary = {}
    i = 0
    for col in df.columns:
        my_dictionary.update({i:col})
        i += 1
    return my_dictionary

my_dictionary = my_dict(df_toLDA)

In [100]:
my_dictionary

{0: 'ChronicCond_Alzheimer',
 1: 'ChronicCond_Heartfailure',
 2: 'ChronicCond_KidneyDisease',
 3: 'ChronicCond_Cancer',
 4: 'ChronicCond_ObstrPulmonary',
 5: 'ChronicCond_Depression',
 6: 'ChronicCond_Diabetes',
 7: 'ChronicCond_IschemicHeart',
 8: 'ChronicCond_Osteoporasis',
 9: 'ChronicCond_rheumatoidarthritis',
 10: 'ChronicCond_stroke',
 11: '4019_diag',
 12: '25000_diag',
 13: '2724_diag',
 14: 'V5869_diag',
 15: '4011_diag',
 16: '42731_diag',
 17: 'V5861_diag',
 18: '2720_diag',
 19: '2449_diag',
 20: '4280_diag',
 21: '53081_diag',
 22: '41401_diag',
 23: '496_diag',
 24: '2859_diag',
 25: '41400_diag',
 26: '78079_diag',
 27: '5990_diag',
 28: '28521_diag',
 29: '3051_diag',
 30: '2809_diag',
 31: '311_diag',
 32: '73300_diag',
 33: '58881_diag',
 34: '71590_diag',
 35: '5859_diag',
 36: 'V4581_diag',
 37: '2722_diag',
 38: '4019_proc',
 39: '9904_proc',
 40: '2724_proc',
 41: '8154_proc',
 42: '66_proc',
 43: '3893_proc',
 44: '3995_proc',
 45: '4516_proc',
 46: '3722_proc',


In [103]:
# list_tuples = tuple(df_toLDA.itertuples(index=False))
df_toLDA_copy = df_toLDA.copy()
df_toLDA.columns = range(df_toLDA.shape[1])


In [75]:
df_toLDA.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48
0,1,2,1,2,2,1,1,1,2,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,1,1,2,2,2,2,1,2,2,2,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,1,1,2,1,2,2,1,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,2,2,2,2,2,2,2,1,1,2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,2,1,2,2,1,2,1,1,1,1,2,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [77]:
def my_listTuples(df):
    list_tuples = []
    temp_tuples = []
    for i in range(len(df)):
        for col in df.columns:
            temp_tuples.append((col, df[col][i]))
        list_tuples.append(temp_tuples)
        temp_tuples = []
    return list_tuples


In [78]:
my_tuples = my_listTuples(df_toLDA)
my_tuples[0]

[(0, 1),
 (1, 2),
 (2, 1),
 (3, 2),
 (4, 2),
 (5, 1),
 (6, 1),
 (7, 1),
 (8, 2),
 (9, 1),
 (10, 1),
 (11, 1),
 (12, 0),
 (13, 1),
 (14, 0),
 (15, 0),
 (16, 0),
 (17, 0),
 (18, 0),
 (19, 0),
 (20, 0),
 (21, 0),
 (22, 0),
 (23, 0),
 (24, 0),
 (25, 0),
 (26, 0),
 (27, 0),
 (28, 0),
 (29, 0),
 (30, 0),
 (31, 0),
 (32, 0),
 (33, 0),
 (34, 1),
 (35, 0),
 (36, 0),
 (37, 0),
 (38, 0),
 (39, 0),
 (40, 0),
 (41, 0),
 (42, 0),
 (43, 0),
 (44, 0),
 (45, 0),
 (46, 0),
 (47, 0),
 (48, 0)]

In [101]:
%%time
lda_model = gensim.models.LdaMulticore(my_tuples, num_topics=2, id2word = my_dictionary, passes=1)


CPU times: user 6min 57s, sys: 52.2 s, total: 7min 49s
Wall time: 9min 44s


In [421]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} Word: {}\n'.format(idx, topic))

Topic: 0 Word: 0.198*"ChronicCond_IschemicHeart" + 0.100*"ChronicCond_Diabetes" + 0.100*"ChronicCond_Heartfailure" + 0.082*"ChronicCond_Osteoporasis" + 0.071*"ChronicCond_Depression" + 0.062*"ChronicCond_rheumatoidarthritis" + 0.060*"ChronicCond_Alzheimer" + 0.054*"ChronicCond_KidneyDisease" + 0.052*"ChronicCond_ObstrPulmonary" + 0.023*"ChronicCond_Cancer"

Topic: 1 Word: 0.137*"ChronicCond_IschemicHeart" + 0.135*"ChronicCond_Diabetes" + 0.112*"ChronicCond_Heartfailure" + 0.083*"ChronicCond_Depression" + 0.079*"ChronicCond_KidneyDisease" + 0.077*"ChronicCond_Alzheimer" + 0.059*"ChronicCond_ObstrPulmonary" + 0.058*"ChronicCond_rheumatoidarthritis" + 0.058*"ChronicCond_Osteoporasis" + 0.029*"ChronicCond_Cancer"



### Applying LDA to Claim Codes Only

In [104]:
df_toLDA_copy.drop(chron_cond, axis = 1, inplace=True)
df_toLDA_copy.columns
new_dict = my_dict(df_toLDA_copy)
new_dict

{0: '4019_diag',
 1: '25000_diag',
 2: '2724_diag',
 3: 'V5869_diag',
 4: '4011_diag',
 5: '42731_diag',
 6: 'V5861_diag',
 7: '2720_diag',
 8: '2449_diag',
 9: '4280_diag',
 10: '53081_diag',
 11: '41401_diag',
 12: '496_diag',
 13: '2859_diag',
 14: '41400_diag',
 15: '78079_diag',
 16: '5990_diag',
 17: '28521_diag',
 18: '3051_diag',
 19: '2809_diag',
 20: '311_diag',
 21: '73300_diag',
 22: '58881_diag',
 23: '71590_diag',
 24: '5859_diag',
 25: 'V4581_diag',
 26: '2722_diag',
 27: '4019_proc',
 28: '9904_proc',
 29: '2724_proc',
 30: '8154_proc',
 31: '66_proc',
 32: '3893_proc',
 33: '3995_proc',
 34: '4516_proc',
 35: '3722_proc',
 36: '8151_proc',
 37: '8872_proc'}

In [105]:
df_toLDA_copy.columns = range(df_toLDA_copy.shape[1])
df_toLDA_copy.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37
0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [106]:
new_tuples = my_listTuples(df_toLDA_copy)
new_tuples[0]

[(0, 1),
 (1, 0),
 (2, 1),
 (3, 0),
 (4, 0),
 (5, 0),
 (6, 0),
 (7, 0),
 (8, 0),
 (9, 0),
 (10, 0),
 (11, 0),
 (12, 0),
 (13, 0),
 (14, 0),
 (15, 0),
 (16, 0),
 (17, 0),
 (18, 0),
 (19, 0),
 (20, 0),
 (21, 0),
 (22, 0),
 (23, 1),
 (24, 0),
 (25, 0),
 (26, 0),
 (27, 0),
 (28, 0),
 (29, 0),
 (30, 0),
 (31, 0),
 (32, 0),
 (33, 0),
 (34, 0),
 (35, 0),
 (36, 0),
 (37, 0)]

In [90]:
print(len(new_dict))
print(len(new_tuples))

38
558211


In [435]:
new_ldaModel2 = gensim.models.LdaMulticore(new_tuples, num_topics=2, id2word = new_dict, passes=1)


In [436]:
for idx, topic in new_ldaModel2.print_topics(-1):
    print('Topic: {} Word: {}\n'.format(idx, topic))

Topic: 0 Word: 0.116*"4019_diag" + 0.089*"V5861_diag" + 0.087*"4011_diag" + 0.063*"2724_diag" + 0.059*"25000_diag" + 0.054*"V5869_diag" + 0.047*"2720_diag" + 0.044*"2449_diag" + 0.044*"42731_diag" + 0.033*"4280_diag"

Topic: 1 Word: 0.167*"4019_diag" + 0.081*"25000_diag" + 0.077*"2724_diag" + 0.054*"V5869_diag" + 0.050*"4011_diag" + 0.043*"42731_diag" + 0.042*"V5861_diag" + 0.039*"2720_diag" + 0.038*"2449_diag" + 0.033*"4280_diag"



In [None]:
# import pyLDAvis.gensim
# pyLDAvis.enable_notebook()
# pyLDAvis.gensim.prepare(new_ldaModel, new_tuples, new_dict)

### LDA with TF_IDF Penalization

In [91]:
from gensim import models
import warnings
warnings.filterwarnings('ignore')

In [92]:
tfidf = models.TfidfModel(new_tuples)
tfidf_corpus = tfidf[new_tuples]

In [93]:
%%time
lda_model_tfidf = gensim.models.LdaMulticore(tfidf_corpus, num_topics=2, id2word=new_dict, passes=1)
lda_model_tfidf.save('lda_tfidf.model')

CPU times: user 34.9 s, sys: 3.75 s, total: 38.6 s
Wall time: 40.4 s


In [94]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

Topic: 0 Word: 0.026*"9904_proc" + 0.026*"8154_proc" + 0.026*"58881_diag" + 0.026*"71590_diag" + 0.026*"5859_diag" + 0.026*"V4581_diag" + 0.026*"2722_diag" + 0.026*"4019_proc" + 0.026*"311_diag" + 0.026*"2724_proc"
Topic: 1 Word: 0.026*"9904_proc" + 0.026*"8154_proc" + 0.026*"58881_diag" + 0.026*"71590_diag" + 0.026*"5859_diag" + 0.026*"V4581_diag" + 0.026*"2722_diag" + 0.026*"4019_proc" + 0.026*"311_diag" + 0.026*"2724_proc"


### LDA with ClmProcedureCode Only

In [118]:
proc_only = df_toLDA[['4019_proc', '9904_proc', '2724_proc', '8154_proc', '66_proc', '3893_proc',
       '3995_proc', '4516_proc', '3722_proc', '8151_proc', '8872_proc']]
proc_only.head()

Unnamed: 0,4019_proc,9904_proc,2724_proc,8154_proc,66_proc,3893_proc,3995_proc,4516_proc,3722_proc,8151_proc,8872_proc
0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0


In [119]:
proc_dict = my_dict(proc_only)
proc_dict

{0: '4019_proc',
 1: '9904_proc',
 2: '2724_proc',
 3: '8154_proc',
 4: '66_proc',
 5: '3893_proc',
 6: '3995_proc',
 7: '4516_proc',
 8: '3722_proc',
 9: '8151_proc',
 10: '8872_proc'}

In [120]:
proc_only.columns = range(proc_only.shape[1])
proc_only.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,0,0,0,0,0,0,0


In [121]:
proc_tuples = my_listTuples(proc_only)
proc_tuples[0]

[(0, 0),
 (1, 0),
 (2, 0),
 (3, 0),
 (4, 0),
 (5, 0),
 (6, 0),
 (7, 0),
 (8, 0),
 (9, 0),
 (10, 0)]

In [122]:
len(proc_tuples)

558211

In [123]:
new_ldaModel3 = gensim.models.LdaMulticore(proc_tuples, num_topics=2, id2word = proc_dict, passes=1)


In [124]:
for idx, topic in new_ldaModel3.print_topics(-1):
    print('Topic: {} Word: {}\n'.format(idx, topic))

Topic: 0 Word: 0.257*"9904_proc" + 0.222*"2724_proc" + 0.198*"66_proc" + 0.185*"3893_proc" + 0.137*"4516_proc" + 0.001*"4019_proc" + 0.000*"8154_proc" + 0.000*"3722_proc" + 0.000*"8872_proc" + 0.000*"8151_proc"

Topic: 1 Word: 0.381*"4019_proc" + 0.195*"8154_proc" + 0.146*"3995_proc" + 0.117*"3722_proc" + 0.084*"8151_proc" + 0.077*"8872_proc" + 0.001*"66_proc" + 0.000*"2724_proc" + 0.000*"3893_proc" + 0.000*"4516_proc"

