In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from pandas.plotting import scatter_matrix
from matplotlib import pyplot as plt
from sklearn import tree
#import pydotplus
import collections
from IPython.display import Image  
#import pydotplus
from sklearn.tree import export_graphviz

In [2]:
import copy

In [3]:
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [47]:
df = pd.read_stata("Econ_484_data_final.dta")

In [5]:
# df.isin(["''"]).sum(axis=1).sum()

In [48]:
df = df.drop("DAYSWAIT_CHRON", axis=1)

In [49]:
df = df.dropna(axis=0, how='any')

In [50]:
df['year'] = pd.DatetimeIndex(df["TX_DATE"]).year
df['month'] = pd.DatetimeIndex(df["TX_DATE"]).month
df = df.drop("TX_DATE", axis=1)

### Making dummy variables

- gender
- abo{9}
- EXH_PERIT_ACCCESS{3}
- EXH_VASC_ACCESS{3}
- PREV_TX{2}
- PREV_KI_TX{2}
- MALIG_TRR{3}
- PERM_STATE_TRR{51}
- txkid{3}
- ABO_DON{9}
- DON_TY{3}
- GENDER_DON{2}
- HOME_STATE_DON{51}
- ABO_MAT{3}
- GRF_STAT_KI{2}
- DWFG_KI{2}
- TX_PROCEDUR_TY_KI{[did you get one or two? See data_sheet]}
- PRV_TX_ANY{2}
- PX_STAT{4}
- SHARE_TY{4}
- AGE_GROUP{2}
- malig{3} (previous malignancies)
- LT_ONE_WEEK_DON {2} (7 days old or less?)
- RECOV_OUT_US{2}

In [51]:
dummy_columns = ['gender', 'abo', 'EXH_PERIT_ACCESS', 'EXH_VASC_ACCESS', 'PREV_TX', 'PREV_KI_TX',
                 'MALIG_TRR', 'txkid', 'ABO_DON', 'DON_TY', 'GENDER_DON', 'DON_TY', 'GENDER_DON',
                 'HOME_STATE_DON', 'ABO_MAT', 'GRF_STAT_KI', 'DWFG_KI', 'PREV_TX_ANY', 'PX_STAT',
                 'SHARE_TY', 'AGE_GROUP', 'malig', 'LT_ONE_WEEK_DON', 'RECOV_OUT_US', 'year', 'month','ETHCAT_DON','ethcat', 
                 'PERM_STATE']
funky_columns = ['TX_PROCEDUR_TY_KI']

In [52]:
df_non_dummy = df.drop(dummy_columns, axis=1)
df_non_dummy.columns

Index(['NUM_PREV_TX', 'A1', 'A2', 'B1', 'B2', 'DR1', 'DR2', 'REM_CD',
       'END_STAT', 'INIT_AGE', 'ethnicity', 'region', 'RB1', 'npkid',
       'AGE_DON', 'END_STAT_KI', 'age', 'GTIME_KI', 'GSTATUS_KI',
       'DAYSWAIT_CHRON_KI', 'TX_PROCEDUR_TY_KI', 'pstatus', 'ptime'],
      dtype='object')

In [54]:
df[dummy_columns].head()

Unnamed: 0,gender,abo,EXH_PERIT_ACCESS,EXH_VASC_ACCESS,PREV_TX,PREV_KI_TX,MALIG_TRR,txkid,ABO_DON,DON_TY,...,SHARE_TY,AGE_GROUP,malig,LT_ONE_WEEK_DON,RECOV_OUT_US,year,month,ETHCAT_DON,ethcat,PERM_STATE
0,M,O,U,U,N,N,U,R,O,C,...,3,A,U,N,N,1994,4,1,1,MT
1,F,A,U,U,Y,Y,U,R,A1,C,...,3,A,U,N,N,1994,6,1,1,WA
2,F,A,U,U,N,N,U,L,O,L,...,3,A,U,N,N,1994,11,1,1,MI
3,F,A,U,U,N,N,U,R,A1,C,...,3,P,U,N,N,1994,4,1,4,IL
4,F,O,U,U,N,N,U,L,O,C,...,5,A,U,N,N,1997,1,1,1,GA


In [55]:
df_dummified = pd.get_dummies(df[dummy_columns])
df_dummified.head(12)

Unnamed: 0,SHARE_TY,year,month,ETHCAT_DON,ethcat,gender_F,gender_M,abo_A,abo_A1,abo_A1B,...,PERM_STATE_TX,PERM_STATE_UT,PERM_STATE_VA,PERM_STATE_VI,PERM_STATE_VT,PERM_STATE_WA,PERM_STATE_WI,PERM_STATE_WV,PERM_STATE_WY,PERM_STATE_ZZ
0,3,1994,4,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1994,6,1,1,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,1994,11,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1994,4,1,4,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1997,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,1995,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,3,1995,9,2,2,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,3,1994,4,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,3,1994,6,1,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9,3,1995,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
df = df_non_dummy.merge(df_dummified, left_index=True, right_index=True)
df.head(12)

Unnamed: 0,NUM_PREV_TX,A1,A2,B1,B2,DR1,DR2,REM_CD,END_STAT,INIT_AGE,...,PERM_STATE_TX,PERM_STATE_UT,PERM_STATE_VA,PERM_STATE_VI,PERM_STATE_VT,PERM_STATE_WA,PERM_STATE_WI,PERM_STATE_WV,PERM_STATE_WY,PERM_STATE_ZZ
0,0,1,26,38,44,1,4,4,4010,38.0,...,0,0,0,0,0,0,0,0,0,0
1,1,2,23,44,62,7,11,4,4010,43.0,...,0,0,0,0,0,1,0,0,0,0
2,0,3,25,7,27,1,13,15,4010,42.0,...,0,0,0,0,0,0,0,0,0,0
3,0,2,68,35,39,4,0,4,4010,9.0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,2,62,27,3,4,4,4010,25.0,...,0,0,0,0,0,0,0,0,0,0
5,0,2,11,35,62,1,2,4,4010,25.0,...,0,0,0,0,0,0,0,0,0,0
6,0,3,19,17,70,11,7,4,4010,43.0,...,0,0,0,0,0,0,0,0,0,0
7,0,2,11,62,0,4,13,15,4010,45.0,...,0,0,0,0,0,0,0,0,0,0
8,0,3,0,7,14,1,2,4,4010,75.0,...,0,0,0,0,0,0,0,0,0,0
9,0,1,31,8,51,11,17,4,4010,65.0,...,0,0,0,0,0,0,0,0,0,0


In [65]:
df.shape

(401748, 221)

In [61]:
y_variable = 'DAYSWAIT_CHRON_KI'
X_variables = list(df.columns[df.columns != y_variable])
X = df[X_variables].to_numpy()
y = df[y_variable].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y)

forest = RandomForestRegressor(n_estimators=5)
model = forest.fit(X_train, y_train)

In [62]:
model.score(X_test,y_test)

0.8251377565327479

In [67]:
num_21 =  X[21,:].copy().reshape(1, -1)
num_21.shape
print(model.predict(num_21))

[56.]


In [68]:
y[21]

56.0

In [73]:
num_21  = pd.DataFrame(num_21, columns = df[X_variables].columns)


In [None]:
to_excel

In [74]:
j = num_21.to_excel(r'C:\Users\jordan79\Desktop\num_21.xlsx', index= None, header=True)

In [71]:

#num_21 =  X[21,:172].copy().reshape(1, -1)
List = []
for i in range(58):
    A = np.zeros((58,), dtype=int)
    A[i]=1
    num_10000 = np.concatenate((X[64224,:172].copy(),A), axis = None).reshape(1,-1)
    List.append(model.predict(num_10000))
    print(model.predict(num_10000))
    

[530.]
[530.]
[530.]
[530.]
[530.]
[533.2]
[530.]
[530.]
[530.]
[530.]
[411.4]
[530.]
[530.]
[530.]
[530.]
[530.]
[682.6]
[530.]
[530.]
[419.4]
[530.]
[530.]
[613.4]
[530.]
[907.6]
[530.]
[530.]
[530.]
[530.]
[530.]
[530.]
[530.]
[530.]
[530.]
[530.]
[527.]
[530.]
[530.]
[584.2]
[530.]
[530.]
[530.]
[530.]
[530.]
[530.]
[417.]
[530.]
[530.]
[654.2]
[530.]
[530.]
[530.]
[530.]
[530.]
[752.]
[530.]
[530.]
[645.8]


In [35]:
state_names = ['AK','AL','AR','AS','AZ','CA','CO','CT','DC','DE','FL','GA',
               'GU','HI','IA','ID','IL','IN','KS','KY','LA','MA','MD','ME',
               'MI','MN','MO','MP','MS','MT','NA','NC','ND','NE','NH','NJ',
               'NM','NV','NY','OH','OK','OR','PA','PR','RI','SC','SD','TN',
               'TX','UT','VA','VI','VT','WA','WI','WV','WY','ZZ']


In [36]:
# Person 1
List = np.asarray(List)
#List.reshape(-1,1)

wl_times = pd.DataFrame()
wl_times['state']= state_names
wl_times['68592']= List


In [38]:
# Person 2
List = np.asarray(List)
wl_times['360018']=List


In [41]:
# Person 3
List = np.asarray(List)
wl_times['300328']=List

In [43]:
# Person 4
List = np.asarray(List)
wl_times['173781']=List

In [45]:
# Person 5
List = np.asarray(List)
wl_times['311525']=List

In [47]:
# Person 6
List = np.asarray(List)
wl_times['336194']=List

In [49]:
# Person 7
List = np.asarray(List)
wl_times['298122']=List

In [51]:
# Person 8
List = np.asarray(List)
wl_times['343924']=List

In [53]:
# Person 9
List = np.asarray(List)
wl_times['22757']=List

In [55]:
# Person 10
List = np.asarray(List)
wl_times['18917']=List