## Modeling

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score
%matplotlib inline

In [None]:
# Split a DataFrame into training and testing sets by a given fraction

def split_data(data, perc, random_state=0):
    """Split ``data`` into a random training part and the remaining test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. It is not modified.
    perc : float
        Fraction of rows (0-1) drawn at random into the training set.
    random_state : int, optional
        Seed forwarded to ``DataFrame.sample`` so the split is reproducible.
        Defaults to 0.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        ``train_df`` holds the sampled rows; ``test_df`` holds every row of
        ``data`` whose index label was not sampled.
    """
    train_df = data.sample(frac=perc, random_state=random_state)
    # The remainder is selected by index label, so this relies on ``data``
    # having a unique index; duplicated labels would all be dropped together.
    test_df = data.drop(train_df.index)

    return train_df, test_df

In [None]:
# creates and fits a random forest

def create_random_forest(train, test, target, est, sel, random_state=None):
    """Fit a random forest on ``train`` and return one selected result.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding the feature columns plus the ``target`` column.
        ``test`` is only read for selectors 3 and 4.
    target : str
        Name of the label column; every other column is used as a feature.
    est : int
        Number of trees (``n_estimators``).
    sel : int or str
        Which result to return:

        * ``1`` / ``'feature'`` - ``feature_importances_`` of the fitted forest
        * ``2`` / ``'oob'``     - ``oob_decision_function_`` (one row per
          training sample)
        * ``3`` / ``'pred'``    - ``predict`` on the test features
        * ``4`` / ``'proba'``   - ``predict_proba`` on the test features
    random_state : int or None, optional
        Forwarded to ``RandomForestClassifier``. The default ``None`` leaves
        the fit unseeded.

    Returns
    -------
    The array produced by the selected attribute or method.

    Raises
    ------
    ValueError
        If ``sel`` is not one of the selectors listed above. The check runs
        before fitting so a bad selector does not cost a full training run.
    """
    aliases = {'feature': 1, 'oob': 2, 'pred': 3, 'proba': 4}
    choice = aliases.get(sel) if isinstance(sel, str) else sel
    if choice not in (1, 2, 3, 4):
        raise ValueError(
            "sel must be 1 ('feature'), 2 ('oob'), 3 ('pred') or 4 ('proba'); "
            "got {!r}".format(sel)
        )

    rf = RandomForestClassifier(criterion='entropy', n_estimators=est,
                                oob_score=True, random_state=random_state)

    # training split
    x_train = train.drop([target], axis=1)
    y_train = train[target]

    rf.fit(x_train, y_train)

    if choice == 1:
        return rf.feature_importances_
    if choice == 2:
        return rf.oob_decision_function_

    # testing split - only needed for the prediction selectors
    x_test = test.drop([target], axis=1)
    if choice == 3:
        return rf.predict(x_test)
    return rf.predict_proba(x_test)
    

In [None]:
# graphing feature importance

def graph_feature_importance(train, test, target, est):
    """Fit a random forest and plot its feature importances as horizontal bars.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames passed straight through to ``create_random_forest``.
    target : str
        Name of the label column; excluded from the plotted features.
    est : int
        Number of trees passed to ``create_random_forest``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Selector 1 is the feature-importance branch of create_random_forest.
    feature_mi = create_random_forest(train, test, target, est, 1)

    # The forest is fitted on ``train`` without the target column, so the
    # importances line up, in order, with these column names.
    feature_names = train.drop([target], axis=1).columns
    feat_importances = pd.Series(feature_mi, index=feature_names)

    fig, ax = plt.subplots(figsize=(10, 10))
    # Horizontal bars put importance on the x-axis and feature names on the
    # y-axis, which is what the axis labels below say.
    feat_importances.sort_values().plot(kind="barh", ax=ax)
    ax.set_title("Feature Importance", fontsize=14)
    ax.set_xlabel('importance', fontsize=12)
    ax.set_ylabel('feature', fontsize=12)
    plt.show()

In [None]:
# testing different estimators

# Label column scored in this sweep.
EARNINGS_TARGET = 'MD_EARN_WNE_P6'

# Tree counts to try.
# NOTE(review): this list is empty, so the loop below never runs and both AUC
# lists stay empty - fill in the values to compare, e.g. [50, 100, 200].
estimators = []

auc_oob = []    # out-of-bag AUC on the training rows, one entry per tree count
auc_test = []   # AUC on the held-out test rows, one entry per tree count

for n in estimators:
    # create_random_forest fits a new forest on every call, so the OOB and the
    # test probabilities below come from two separate fits of the same size.
    oob = create_random_forest(train_df, test_df, EARNINGS_TARGET, n, 2)
    proba = create_random_forest(train_df, test_df, EARNINGS_TARGET, n, 4)

    # With few trees some training rows are never out-of-bag and their OOB
    # row is NaN; roc_auc_score rejects NaN, so score only the rows that
    # actually received an OOB estimate.
    has_oob = ~np.isnan(oob[:, 1])
    y_train = train_df[EARNINGS_TARGET].to_numpy()

    # Column 1 is taken as the positive-class probability (binary target
    # assumed - TODO confirm against how MD_EARN_WNE_P6 was encoded).
    auc_oob.append(roc_auc_score(y_train[has_oob], oob[has_oob, 1]))
    auc_test.append(roc_auc_score(test_df[EARNINGS_TARGET], proba[:, 1]))
