In [70]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import power_transform
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import Lasso, ElasticNet, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import auc, roc_auc_score, precision_score, average_precision_score, \
    accuracy_score, balanced_accuracy_score, recall_score, confusion_matrix
from scipy.stats import pearsonr, spearmanr
from matplotlib import pyplot as plt
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC
%matplotlib inline

In [77]:
# INPUTS
target = 'Outcome'
data = pd.read_csv('../data/external/synthetic-data-sanity-check.csv')
data.drop([col for col in data.columns if 'Unnamed' in col], axis=1, inplace=True)
#------------------------------------------------------

# Define Scenario Data (only Demo/Clinical)
x = data.drop(['ID','Outcome'], axis=1)
y = (data[[target]].copy() + 1) % 2  # Dani defined 1 as majority class, 0 as event class



# Only keep records with non-missing target value
# -- NOTE: Dani's synthetic data does not have any missing y values,
#      so this step is technically unnecessary; left here to be consistent
#      with SMOTE/MIM/L1-LogReg code from previous notebook
valid_target_index = y[target].replace(-999999, np.nan).dropna().index
x = x.loc[valid_target_index,:]
y = y.loc[valid_target_index,:]


# Employ Missing Indicator Method
# -- NOTE: Dani has missing values in EVERY column, whereas real data set
#      only has missing data in most cols
missing_cols = [
    item for item in 
    x.replace(-999999,np.nan).isna().sum().to_frame('missing').query('missing > 0').index.tolist()
]
for col in missing_cols:
    x[col+'_missing'] = [1  if item==-999999 else 0 for item in x[col]]
    x[col] = x[col].replace(-999999, 0)
    
    
# Split Data
x_trn, x_val, y_trn, y_val = train_test_split(x, y, train_size=0.7, stratify=y, random_state=23)


# Scale non-cats (nots)
# -- NOTE:  Dani's original synthetic data set has no categoricals/ordinals,
#      so this code is not technically needed here, however I've emailed her
#      to generate a second synthetic data set w/ such var types in it; keeping
#      this here for consistency
nots = [col for col in x_trn.columns if len(x[col].unique()) >= 10]
for col in nots:
    min_trn = x_trn[col].min()
    max_trn = x_trn[col].max()
    scale = lambda z: (z - min_trn) / (max_trn - min_trn)
    x_trn[col] = x_trn[col].map(scale)
    ##### NOTE2SELF:  when using SAGA solver, you have to scale...which I
    #####   did in project, but forgot to apply to x_val
    x_val[col] = x_val[col].map(scale)

    
# SMOTE the training data
# -- NOTE: since no categoricals in Dani's original data set, it might be
#      argued that we should be using regular SMOTE here (not SMOTENC)
cats = [idx for idx,col in enumerate(x_trn.columns) if len(x[col].unique()) < 10]
#sm = SMOTENC(cats, random_state=37)
sm = SMOTE(random_state=37)
x_trn, y_trn = sm.fit_resample(x_trn.values, y_trn.values.ravel())


# Fit Model
model = LogisticRegression(penalty='l1', solver='liblinear', tol=0.01, C=0.5)#, l1_ratio=1)#'liblinear')
model.fit(x_trn, y_trn)

# Make Predictions
yp_trn = model.predict(x_trn)
yp_val = model.predict(x_val)


#=======================================
print('Classication Metrics')
print('Trn Accuracy:',accuracy_score(y_trn, yp_trn))
print('Val Accuracy:',accuracy_score(y_val, yp_val))
print('-------------------')
print('Trn Bal Accuracy:',balanced_accuracy_score(y_trn, yp_trn))
print('Val Bal Accuracy:',balanced_accuracy_score(y_val, yp_val))
print('-------------------')
print('Trn AUROC:',roc_auc_score(y_trn, yp_trn))
print('Val AUROC:',roc_auc_score(y_val, yp_val))
print('-------------------')
print('Trn AUPRC:',average_precision_score(y_trn, yp_trn))
print('Val AUPRC:',average_precision_score(y_val, yp_val))
print('-------------------')
print('Trn Precision Score:',precision_score(y_trn, yp_trn, average='weighted'), )
print('Val Precision Score:',precision_score(y_val, yp_val, average='weighted'))
print('-------------------')
print('Trn Recall Score:',recall_score(y_trn, yp_trn, average='weighted'), )
print('Val Recall Score:',recall_score(y_val, yp_val, average='weighted'))
print('-------------------')
print('Trn Confusion:\n',confusion_matrix(y_trn, yp_trn), )
print('Val Confusion:\n',confusion_matrix(y_val, yp_val))
print('-------------------')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Classication Metrics
Trn Accuracy: 0.9989231873653984
Val Accuracy: 0.9916666666666667
-------------------
Trn Bal Accuracy: 0.9989231873653984
Val Bal Accuracy: 0.6641541038525963
-------------------
Trn AUROC: 0.9989231873653984
Val AUROC: 0.6641541038525963
-------------------
Trn AUPRC: 0.9978510028653295
Val AUPRC: 0.08666666666666667
-------------------
Trn Precision Score: 0.9989255014326648
Val Precision Score: 0.9929110738255033
-------------------
Trn Recall Score: 0.9989231873653984
Val Recall Score: 0.9916666666666667
-------------------
Trn Confusion:
 [[1390    3]
 [   0 1393]]
Val Confusion:
 [[594   3]
 [  2   1]]
-------------------


# RFs
These do amazing!!!!!!!!

In [79]:
# INPUTS
target = 'Outcome'
data = pd.read_csv('../data/external/synthetic-data-sanity-check.csv')
data.drop([col for col in data.columns if 'Unnamed' in col], axis=1, inplace=True)
#------------------------------------------------------

# Define Scenario Data (only Demo/Clinical)
x = data.drop(['ID','Outcome'], axis=1)
y = (data[[target]].copy() + 1) % 2  # Dani defined 1 as majority class, 0 as event class



# Only keep records with non-missing target value
# -- NOTE: Dani's synthetic data does not have any missing y values,
#      so this step is technically unnecessary; left here to be consistent
#      with SMOTE/MIM/L1-LogReg code from previous notebook
valid_target_index = y[target].replace(-999999, np.nan).dropna().index
x = x.loc[valid_target_index,:]
y = y.loc[valid_target_index,:]


# Employ Missing Indicator Method
# -- NOTE: Dani has missing values in EVERY column, whereas real data set
#      only has missing data in most cols
missing_cols = [
    item for item in 
    x.replace(-999999,np.nan).isna().sum().to_frame('missing').query('missing > 0').index.tolist()
]
for col in missing_cols:
    x[col+'_missing'] = [1  if item==-999999 else 0 for item in x[col]]
    x[col] = x[col].replace(-999999, 0)
    
    
# Split Data
x_trn, x_val, y_trn, y_val = train_test_split(x, y, train_size=0.7, stratify=y, random_state=23)


# Scale non-cats (nots)
# -- NOTE:  Dani's original synthetic data set has no categoricals/ordinals,
#      so this code is not technically needed here, however I've emailed her
#      to generate a second synthetic data set w/ such var types in it; keeping
#      this here for consistency
nots = [col for col in x_trn.columns if len(x[col].unique()) >= 10]
for col in nots:
    min_trn = x_trn[col].min()
    max_trn = x_trn[col].max()
    scale = lambda z: (z - min_trn) / (max_trn - min_trn)
    x_trn[col] = x_trn[col].map(scale)
    ##### NOTE2SELF:  when using SAGA solver, you have to scale...which I
    #####   did in project, but forgot to apply to x_val
    x_val[col] = x_val[col].map(scale)
    
# SMOTE the training data
# -- NOTE: since no categoricals in Dani's original data set, it might be
#      argued that we should be using regular SMOTE here (not SMOTENC)
cats = [idx for idx,col in enumerate(x_trn.columns) if len(x[col].unique()) < 10]
#sm = SMOTENC(cats, random_state=37)
sm = SMOTE(random_state=37)
x_trn, y_trn = sm.fit_resample(x_trn.values, y_trn.values.ravel())


# Fit Model
model = RandomForestClassifier(n_estimators=10000, max_depth=1, n_jobs=-1)
model.fit(x_trn, y_trn)

# Make Predictions
yp_trn = model.predict(x_trn)
yp_val = model.predict(x_val)


#=======================================
print('Classication Metrics')
print('Trn Accuracy:',accuracy_score(y_trn, yp_trn))
print('Val Accuracy:',accuracy_score(y_val, yp_val))
print('-------------------')
print('Trn Bal Accuracy:',balanced_accuracy_score(y_trn, yp_trn))
print('Val Bal Accuracy:',balanced_accuracy_score(y_val, yp_val))
print('-------------------')
print('Trn AUROC:',roc_auc_score(y_trn, yp_trn))
print('Val AUROC:',roc_auc_score(y_val, yp_val))
print('-------------------')
print('Trn AUPRC:',average_precision_score(y_trn, yp_trn))
print('Val AUPRC:',average_precision_score(y_val, yp_val))
print('-------------------')
print('Trn Precision Score:',precision_score(y_trn, yp_trn, average='weighted'), )
print('Val Precision Score:',precision_score(y_val, yp_val, average='weighted'))
print('-------------------')
print('Trn Recall Score:',recall_score(y_trn, yp_trn, average='weighted'), )
print('Val Recall Score:',recall_score(y_val, yp_val, average='weighted'))
print('-------------------')
print('Trn Confusion:\n',confusion_matrix(y_trn, yp_trn), )
print('Val Confusion:\n',confusion_matrix(y_val, yp_val))
print('-------------------')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Classication Metrics
Trn Accuracy: 0.9813352476669059
Val Accuracy: 0.9616666666666667
-------------------
Trn Bal Accuracy: 0.9813352476669059
Val Bal Accuracy: 0.9807370184254607
-------------------
Trn AUROC: 0.9813352476669059
Val AUROC: 0.9807370184254606
-------------------
Trn AUPRC: 0.9640138408304498
Val AUPRC: 0.11538461538461539
-------------------
Trn Precision Score: 0.982006920415225
Val Precision Score: 0.995576923076923
-------------------
Trn Recall Score: 0.9813352476669059
Val Recall Score: 0.9616666666666667
-------------------
Trn Confusion:
 [[1341   52]
 [   0 1393]]
Val Confusion:
 [[574  23]
 [  0   3]]
-------------------
