In [1]:
import pandas as pd
import numpy as np

In [2]:
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [3]:
df = pd.read_stata("Econ_484_data_final.dta")

In [4]:
# df.isin(["''"]).sum(axis=1).sum()

In [5]:
df = df.dropna(axis=0, how='any')

In [6]:
df['year'] = pd.DatetimeIndex(df["TX_DATE"]).year
df['month'] = pd.DatetimeIndex(df["TX_DATE"]).month
df = df.drop("TX_DATE", axis=1)

### Making dummy variables

- gender
- abo{9}
- EXH_PERIT_ACCCESS{3}
- EXH_VASC_ACCESS{3}
- PREV_TX{2}
- PREV_KI_TX{2}
- MALIG_TRR{3}
- PERM_STATE_TRR{51}
- txkid{3}
- ABO_DON{9}
- DON_TY{3}
- GENDER_DON{2}
- HOME_STATE_DON{51}
- ABO_MAT{3}
- GRF_STAT_KI{2}
- DWFG_KI{2}
- TX_PROCEDUR_TY_KI{[did you get one or two? See data_sheet]}
- PRV_TX_ANY{2}
- PX_STAT{4}
- SHARE_TY{4}
- AGE_GROUP{2}
- malig{3} (previous malignancies)
- LT_ONE_WEEK_DON {2} (7 days old or less?)
- RECOV_OUT_US{2}

In [7]:
dummy_columns = ['gender', 'abo', 'EXH_PERIT_ACCESS', 'EXH_VASC_ACCESS', 'PREV_TX', 'PREV_KI_TX',
                 'MALIG_TRR', 'txkid', 'ABO_DON', 'DON_TY', 'GENDER_DON', 'DON_TY', 'GENDER_DON',
                 'HOME_STATE_DON', 'ABO_MAT', 'GRF_STAT_KI', 'DWFG_KI', 'PREV_TX_ANY', 'PX_STAT',
                 'SHARE_TY', 'AGE_GROUP', 'malig', 'LT_ONE_WEEK_DON', 'RECOV_OUT_US', 'year', 'month',
                 'PERM_STATE']
funky_columns = ['TX_PROCEDUR_TY_KI']

In [8]:
df_non_dummy = df.drop(dummy_columns, axis=1)
df_non_dummy

Unnamed: 0,NUM_PREV_TX,A1,A2,B1,B2,DR1,DR2,REM_CD,DAYSWAIT_CHRON,END_STAT,...,AGE_DON,ETHCAT_DON,END_STAT_KI,age,GTIME_KI,GSTATUS_KI,DAYSWAIT_CHRON_KI,TX_PROCEDUR_TY_KI,pstatus,ptime
0,0,1,26,38,44,1,4,4,34,4010,...,35.0,1,4010.0,38,3678.0,1.0,34.0,102.0,0,4746.0
1,1,2,23,44,62,7,11,4,97,4010,...,17.0,1,4010.0,43,8634.0,0.0,97.0,102.0,0,8634.0
2,0,3,25,7,27,1,13,15,247,4010,...,44.0,1,4010.0,43,5219.0,1.0,247.0,101.0,0,5219.0
3,0,2,68,35,39,4,0,4,35,4010,...,16.0,1,4010.0,9,2098.0,1.0,35.0,102.0,0,2206.0
4,0,1,2,62,27,3,4,4,1047,4010,...,16.0,1,4010.0,28,2248.0,1.0,1047.0,101.0,1,5141.0
5,0,2,11,35,62,1,2,4,335,4010,...,29.0,1,4010.0,26,5049.0,1.0,335.0,101.0,1,6459.0
6,0,3,19,17,70,11,7,4,575,4010,...,44.0,2,4010.0,45,237.0,1.0,575.0,101.0,1,1710.0
7,0,2,11,62,0,4,13,15,37,4010,...,66.0,1,4010.0,45,1459.0,0.0,37.0,101.0,1,5091.0
8,0,3,0,7,14,1,2,4,101,4010,...,64.0,1,4010.0,76,18.0,1.0,101.0,102.0,1,18.0
9,0,1,31,8,51,11,17,4,321,4010,...,27.0,1,4010.0,66,2635.0,1.0,321.0,101.0,1,2635.0


In [9]:
df_dummified = pd.get_dummies(df[dummy_columns])
df_dummified

Unnamed: 0,SHARE_TY,year,month,gender_F,gender_M,abo_A,abo_A1,abo_A1B,abo_A2,abo_A2B,...,PERM_STATE_TX,PERM_STATE_UT,PERM_STATE_VA,PERM_STATE_VI,PERM_STATE_VT,PERM_STATE_WA,PERM_STATE_WI,PERM_STATE_WV,PERM_STATE_WY,PERM_STATE_ZZ
0,3,1994,4,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3,1994,6,1,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,3,1994,11,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,1994,4,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,1997,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,1995,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,3,1995,9,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,3,1994,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,3,1994,6,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,3,1995,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
df = df_non_dummy.merge(df_dummified, left_index=True, right_index=True)
df

Unnamed: 0,NUM_PREV_TX,A1,A2,B1,B2,DR1,DR2,REM_CD,DAYSWAIT_CHRON,END_STAT,...,PERM_STATE_TX,PERM_STATE_UT,PERM_STATE_VA,PERM_STATE_VI,PERM_STATE_VT,PERM_STATE_WA,PERM_STATE_WI,PERM_STATE_WV,PERM_STATE_WY,PERM_STATE_ZZ
0,0,1,26,38,44,1,4,4,34,4010,...,0,0,0,0,0,0,0,0,0,0
1,1,2,23,44,62,7,11,4,97,4010,...,0,0,0,0,0,1,0,0,0,0
2,0,3,25,7,27,1,13,15,247,4010,...,0,0,0,0,0,0,0,0,0,0
3,0,2,68,35,39,4,0,4,35,4010,...,0,0,0,0,0,0,0,0,0,0
4,0,1,2,62,27,3,4,4,1047,4010,...,0,0,0,0,0,0,0,0,0,0
5,0,2,11,35,62,1,2,4,335,4010,...,0,0,0,0,0,0,0,0,0,0
6,0,3,19,17,70,11,7,4,575,4010,...,0,0,0,0,0,0,0,0,0,0
7,0,2,11,62,0,4,13,15,37,4010,...,0,0,0,0,0,0,0,0,0,0
8,0,3,0,7,14,1,2,4,101,4010,...,0,0,0,0,0,0,0,0,0,0
9,0,1,31,8,51,11,17,4,321,4010,...,0,0,0,0,0,0,0,0,0,0


In [11]:
y_variable = 'DAYSWAIT_CHRON_KI'
X_variables = list(df.columns[df.columns != y_variable])
X = df[X_variables].to_numpy()
y = df[y_variable].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y)

forest = RandomForestRegressor(n_estimators=5)
model = forest.fit(X_train, y_train)

In [12]:
model.score(X_test,y_test)

0.9999972899367783

In [13]:
model.get_params

<bound method BaseEstimator.get_params of RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)>

In [19]:
people_data = X[1,:]
people_data = pd.DataFrame(people_data)


In [31]:
people_data = X[1,:]


In [32]:
people_data.reshape(1,231)

array([[1.000e+00, 2.000e+00, 2.300e+01, 4.400e+01, 6.200e+01, 7.000e+00,
        1.100e+01, 4.000e+00, 9.700e+01, 4.010e+03, 4.300e+01, 0.000e+00,
        1.000e+00, 6.000e+00, 4.400e+01, 1.000e+00, 1.700e+01, 1.000e+00,
        4.010e+03, 4.300e+01, 8.634e+03, 0.000e+00, 1.020e+02, 0.000e+00,
        8.634e+03, 3.000e+00, 1.994e+03, 6.000e+00, 1.000e+00, 0.000e+00,
        1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
        1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00, 1.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
        1.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
        0.000e+00, 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 0.000e+00,
        1.000e+00, 1.000e+00, 0.000e+0

In [33]:
people_data = pd.DataFrame(people_data)

In [34]:
people_data

Unnamed: 0,0
0,1.0
1,2.0
2,23.0
3,44.0
4,62.0
5,7.0
6,11.0
7,4.0
8,97.0
9,4010.0


In [35]:
model.predict(X[:,1])

ValueError: Expected 2D array, got 1D array instead:
array=[ 1.  2.  3. ... 23. 36.  2.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
array.reshape(1, -1)