In [7]:
import random

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import average_precision_score, roc_auc_score

import graphviz
from snorkel.labeling import LFAnalysis, filter_unlabeled_dataframe
from snorkel.labeling.model import LabelModel, MajorityLabelVoter
import xgboost as xgb

import dmnet.util as u

# Read in data

In [2]:
CAMPAIGN = '43.prod'

tr = pd.read_parquet(f"gs://dmnet/heloc/campaign/{CAMPAIGN}/dmatrix/tr_gbm.parquet")
te = pd.read_parquet(f"gs://dmnet/heloc/campaign/{CAMPAIGN}/dmatrix/te_gbm.parquet")

srv_data = u.read_parquet_ray("gs://jdh-bucket/projects/snorkel_pos/data/serve/2109A.parquet")
snkl_res = u.read_parquet_ray("gs://jdh-bucket/projects/snorkel_pos/data/serve_lbls/2109A")

2022-02-25 17:13:15,339	INFO services.py:1270 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


[2m[36m(_read_parquet_wrapper pid=56918)[0m 


In [3]:
tr = tr.set_index(['record_nb', 'encrypted_nb'])
te = te.set_index(['record_nb', 'encrypted_nb'])
srv_data = srv_data.reset_index().set_index(['record_nb', 'encrypted_nb'])

## Create some Decision Trees for a Business Rule Proxy

In [4]:
#Filter down to some key features (per ds-utils)
key_cols = [
    "PREMIER_V1_2_IQB9410", "FICECLV9_SCORE", "PREMIER_V1_2_FIP5020", "PREMIER_V1_2_ALL7120", 
    "PREMIER_V1_2_FIP8120", "PREMIER_V1_2_BCA6220", "PREMIER_V1_2_BCC3422", "PREMIER_V1_2_IQM9417",
    "PREMIER_V1_2_IQF9510","PREMIER_V1_2_IQM9540", "PREMIER_V1_2_IQT9526", "PREMIER_V1_2_REV7434",
    "PREMIER_V1_2_PIL0438", "PREMIER_V1_2_BCC7482", "CLTV", "PREMIER_V1_2_MTA7432", "PREMIER_V1_2_ALL5835",
    "PREMIER_V1_2_FIP0437", "PREMIER_V1_2_MTJ5030", "PREMIER_V1_2_MTA0437", "PREMIER_V1_2_REV7800"
]
key_cols = [col.lower() for col in key_cols]

In [7]:
key_cols = set(key_cols).intersection(set(tr.columns.to_list()))
X = tr[key_cols]
y = tr['y']

In [11]:
random_state_dict = {0: 123, 1: 234, 2: 345, 3: 456, 4: 567, 5: 678, 6: 789, 7: 890, 8:901, 9: 101112}

for x in range(10):
    mdl = tree.DecisionTreeRegressor(
        max_depth=3,
        random_state= random_state_dict[x],
        max_features=10
    )
    mdl.fit(X, y)
    
    dot_data = tree.export_graphviz(
        mdl, 
        out_file=None, 
        feature_names=X.columns,  
        class_names=True,  
        filled=True, 
        rounded=True,  
        special_characters=True,
        rotate=True,
        proportion=True
    )  
    graph = graphviz.Source(dot_data)
    
    png_b = graph.pipe(format='png')
    with open(f'./images/rule_{str(x)}_022422.png', 'wb') as f:
        f.write(png_b)
    print (f'done with tree number {x}')

done with tree number 0
done with tree number 1
done with tree number 2
done with tree number 3
done with tree number 4
done with tree number 5
done with tree number 6
done with tree number 7
done with tree number 8
done with tree number 9


# Transition Weak Learner Results to Label Matrix

In [8]:
RANGE_OF_ONES = [6000, 9000]
RANGE_OF_ZEROS = [10_000_000, 15_000_000]

def process_lbl_mtx(df: pd.DataFrame) -> pd.DataFrame:
    res = df.copy()
    for col in ['gbm', 'rf', 'logit']:
        sorted_series = res[col].sort_values(ascending=False)
        num_ones = random.randrange(RANGE_OF_ONES[0], RANGE_OF_ONES[1])
        min_val_for_ones = sorted_series[:num_ones].min()
        
        num_zeros = random.randrange(RANGE_OF_ZEROS[0], RANGE_OF_ZEROS[1])
        max_val_for_zs = sorted_series[-num_zeros:].max()

        conditions = [
            res[col] >= min_val_for_ones,
            res[col] <= max_val_for_zs,
        ]
        choices = [1, 0]
        res['%s_adj' % col] = np.select(conditions, choices, default=-1)
    return res

snkl_res = process_lbl_mtx(snkl_res)

In [15]:
lbl_mtx_for_snkl = snkl_res.drop(['gbm', 'rf', 'rf_pred', 'logit', 'logit_pred'], axis=1)
lbl_mtx_for_snkl = np.array(lbl_mtx_for_snkl.astype(int))
LFAnalysis(lbl_mtx_for_snkl).lf_summary()

Unnamed: 0,Polarity,Coverage,Overlaps,Conflicts
0,[],0.0,0.0,0.0
1,"[0, 1]",1.0,1.0,0.120328
2,[0],0.944419,0.944419,0.064747
3,[],0.0,0.0,0.0
4,"[0, 1]",1.0,1.0,0.120328
5,[0],1.0,1.0,0.120328
6,[1],0.005393,0.005393,0.005393
7,[0],1.0,1.0,0.120328
8,[0],0.937574,0.937574,0.061746
9,"[0, 1]",1.0,1.0,0.120328


# Run Snorkel Generative Model

In [16]:
lbl_mdl = LabelModel(verbose=True)
lbl_mdl.fit(
    L_train=lbl_mtx_for_snkl, 
    class_balance=[0.9988, 0.0012], 
    n_epochs=500, 
    log_freq=100, 
    seed=123
)
snrkl_results = lbl_mdl.predict_proba(L=lbl_mtx_for_snkl)

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|                                                | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=84.360]
INFO:root:[100 epochs]: TRAIN:[loss=0.203]
 24%|████████▊                           | 122/500 [00:00<00:00, 1218.19epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.203]
 49%|█████████████████▊                  | 247/500 [00:00<00:00, 1233.20epoch/s]INFO:root:[300 epochs]: TRAIN:[loss=0.203]
 74%|██████████████████████████▋         | 371/500 [00:00<00:00, 1234.86epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.203]
100%|████████████████████████████████████| 500/500 [00:00<00:00, 1228.66epoch/s]
INFO:root:Finished Training


In [17]:
snrkl_results = pd.Series(snrkl_results[:, 1], index=snkl_res.index)

In [18]:
BPS_THRESH = 0.0004
NUM_ROWS = int(len(snrkl_results) * BPS_THRESH)

topx_snrkl_results = pd.Series(snrkl_results, index=snkl_res.index).head(NUM_ROWS).copy()
topx_idx = topx_snrkl_results.index

In [None]:
len(snkl_res)

In [20]:
addl_ones = srv_data.reindex(topx_idx)

In [21]:
addl_ones['y'] = 1

In [22]:
addl_ones.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,premier_v1_2_iqb9410,ficeclv9_score,premier_v1_2_fip5020,premier_v1_2_all7120,premier_v1_2_fip8120,premier_v1_2_bca6220,premier_v1_2_bcc3422,premier_v1_2_iqm9417,premier_v1_2_iqf9510,premier_v1_2_iqm9540,...,premier_v1_2_mtf0300,premier_v1_2_iln8320,premier_v1_2_mta1380,premier_v1_2_all9125,building_code_ord,pandemic,sin_month,cos_month,applied,y
record_nb,encrypted_nb,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
100274831,BKAPY0107724534C7,0.0,836.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,1.0,0.0,1.0,1,-0.989821,-0.142315,0.0,1
100453487,WYFND0179687866E4,1.0,751.0,0.0,100.0,0.0,1.0,0.0,0.0,0.0,607.0,...,5.0,56.0,2.0,0.0,1.0,1,-0.989821,-0.142315,0.0,1
100763218,XAWTU0125616585G1,0.0,850.0,0.0,66.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,11.0,1.0,0.0,0.0,1,-0.989821,-0.142315,0.0,1
100940666,KGYUM0168964193F5,2.0,797.0,0.0,97.0,0.0,1.0,0.0,1.0,0.0,302.0,...,4.0,54.0,2.0,0.0,0.0,1,-0.989821,-0.142315,0.0,1
100953210,LZCVY0141767009H3,0.0,831.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,119.0,0.0,0.0,2.0,1,-0.989821,-0.142315,0.0,1


In [23]:
len(addl_ones)

9400

# Test With and Without Add'l Ones from Snorkel

In [24]:
def rf_fit(data):
    params = {'n_estimators': 500, 
              'max_depth': 8,
              'n_jobs': -1,
              'class_weight': 'balanced_subsample'
             }
    
    X, y = data.drop('y', axis=1).fillna(0), data['y']
    
    rf = RandomForestClassifier(**params)
    rf = rf.fit(X, y)
    return rf

def xgb_fit(data):
    params = {'eta': 0.1, 
              'max_leaves': 25,
              'min_child_weight': 850,
              'eval_metric': 'auc'
             }
    
    dtrain = xgb.DMatrix(data.drop('y', axis=1), data['y'])
    train_kwargs = {
        "params": params,
        "dtrain": dtrain,
        "evals_result": {},
        "evals": [(dtrain, "train")]
    }
    
    train_params = {
        "num_boost_round": 200,
        "early_stopping_rounds": 50, 
        "verbose_eval": 100
    }
    
    clf = xgb.train(**train_kwargs, **train_params)
    return clf


## First Generate Consistent Set of Columns

In [25]:
const_cols = list(set(tr.columns).intersection(set(addl_ones.columns.to_list())))

Xy_tr = tr[const_cols].copy()
Xy_te = te[const_cols].copy()
Xy_tr_plus = pd.concat([Xy_tr, addl_ones[const_cols]], axis=0)

### Now Try a Simple RF 

In [26]:
rf = rf_fit(Xy_tr)
print ('done with first')
rf_plus = rf_fit(Xy_tr_plus)

done with first


In [27]:
u.pickle_dump(rf, '/home/josephhurley/projects/snorkel_poc/snorkel_poc/model/rf_base.mdl')
u.pickle_dump(rf_plus, '/home/josephhurley/projects/snorkel_poc/snorkel_poc/model/rf_plus.mdl')

In [28]:
rf_preds = pd.Series(rf.predict_proba(Xy_tr.drop('y', axis=1).fillna(0))[:, 1], index=Xy_tr.index)
rf_plus_preds = pd.Series(rf_plus.predict_proba(Xy_tr.drop('y', axis=1).fillna(0))[:, 1], index=Xy_tr.index)

roc = roc_auc_score(y_score=rf_preds, y_true=Xy_tr['y'])
aps = average_precision_score(y_score=rf_preds, y_true=Xy_tr['y'])
print (f'training stats for base rf are roc: {roc} and avg prec: {aps}')


roc = roc_auc_score(y_score=rf_plus_preds, y_true=Xy_tr['y'])
aps = average_precision_score(y_score=rf_plus_preds, y_true=Xy_tr['y'])
print (f'training stats for addl ones rf are roc: {roc} and avg prec: {aps}')

print ('##############')

rf_preds = pd.Series(rf.predict_proba(Xy_te.drop('y', axis=1).fillna(0))[:, 1], index=Xy_te.index)
rf_plus_preds = pd.Series(rf_plus.predict_proba(Xy_te.drop('y', axis=1).fillna(0))[:, 1], index=Xy_te.index)

roc = roc_auc_score(y_score=rf_preds, y_true=Xy_te['y'])
aps = average_precision_score(y_score=rf_preds, y_true=Xy_te['y'])
print (f'testing stats for base rf are roc: {roc} and avg prec: {aps}')

roc = roc_auc_score(y_score=rf_plus_preds, y_true=Xy_te['y'])
aps = average_precision_score(y_score=rf_plus_preds, y_true=Xy_te['y'])
print (f'testing stats for addl ones rf are roc: {roc} and avg prec: {aps}')


training stats for base rf are roc: 0.7170464889201145 and avg prec: 0.03840416049647319
training stats for addl ones rf are roc: 0.7070366640308703 and avg prec: 0.03537693793662562
##############
testing stats for base rf are roc: 0.6832085953565563 and avg prec: 0.0038690606018947403
testing stats for addl ones rf are roc: 0.6767891900676622 and avg prec: 0.0037741579311580953


### Now Do a XGB Classifer

In [29]:
xgb_clf = xgb_fit(Xy_tr)
xgb_plus = xgb_fit(Xy_tr_plus)

[0]	train-auc:0.66467
[100]	train-auc:0.75482
[199]	train-auc:0.76872
[0]	train-auc:0.69411
[92]	train-auc:0.73840


In [30]:
u.pickle_dump(xgb_clf, '/home/josephhurley/projects/snorkel_poc/snorkel_poc/model/xgbm_base.mdl')
u.pickle_dump(xgb_plus, '/home/josephhurley/projects/snorkel_poc/snorkel_poc/model/xgbm_plus.mdl')

In [31]:
xgb_preds = pd.Series(xgb_clf.predict(xgb.DMatrix(Xy_tr.drop('y', axis=1))), index=Xy_tr.index)
xgb_plus_preds = pd.Series(xgb_plus.predict(xgb.DMatrix(Xy_tr.drop('y', axis=1))), index=Xy_tr.index)

roc = roc_auc_score(y_score=xgb_preds, y_true=Xy_tr['y'])
aps = average_precision_score(y_score=xgb_preds, y_true=Xy_tr['y'])
print (f'training stats for base rf are roc: {roc} and avg prec: {aps}')

roc = roc_auc_score(y_score=xgb_plus_preds, y_true=Xy_tr['y'])
aps = average_precision_score(y_score=xgb_plus_preds, y_true=Xy_tr['y'])
print (f'training stats for addl ones rf are roc: {roc} and avg prec: {aps}')

print ("#####")

xgb_preds = pd.Series(xgb_clf.predict(xgb.DMatrix(Xy_te.drop('y', axis=1))), index=Xy_te.index)
xgb_plus_preds = pd.Series(xgb_plus.predict(xgb.DMatrix(Xy_te.drop('y', axis=1))), index=Xy_te.index)

roc = roc_auc_score(y_score=xgb_preds, y_true=Xy_te['y'])
aps = average_precision_score(y_score=xgb_preds, y_true=Xy_te['y'])
print (f'testing stats for base rf are roc: {roc} and avg prec: {aps}')

roc = roc_auc_score(y_score=xgb_plus_preds, y_true=Xy_te['y'])
aps = average_precision_score(y_score=xgb_plus_preds, y_true=Xy_te['y'])
print (f'testing stats for addl ones rf are roc: {roc} and avg prec: {aps}')

training stats for base rf are roc: 0.7597738382199718 and avg prec: 0.05814162512236522
training stats for addl ones rf are roc: 0.7395973123592751 and avg prec: 0.044944597392257125
#####
testing stats for base rf are roc: 0.7156126564488273 and avg prec: 0.004940828723240559
testing stats for addl ones rf are roc: 0.70759773369513 and avg prec: 0.004482254616007313


In [110]:
xgb_preds = pd.Series(xgb_clf.predict(xgb.DMatrix(tr.drop('y', axis=1))), index=tr.index)

In [112]:
roc_auc_score(y_score=xgb_preds, y_true=y_tr)

0.7458396476432586

In [113]:
xgb_preds_te = pd.Series(xgb_clf.predict(xgb.DMatrix(te.drop('y', axis=1))), index=te.index)

In [114]:
roc_auc_score(y_score=xgb_preds_te, y_true=y_te)

0.7157362780480941

In [115]:
tr_plus = pd.concat([tr, more_ones], axis=0)
xgb_clf2 = xgb_fit(tr_plus)

[0]	train-auc:0.67910
[100]	train-auc:0.74856
[199]	train-auc:0.75379


In [116]:
xgb2_preds_te = pd.Series(xgb_clf2.predict(xgb.DMatrix(te.drop('y', axis=1))), index=te.index)
roc_auc_score(y_score=xgb2_preds_te, y_true=y_te)

0.7135025803527602

In [None]:
print ('joe')

In [70]:
rf = RandomForestClassifier(n_estimators=500, max_depth=8, n_jobs=-1)

In [35]:
rf

RandomForestClassifier(max_depth=8, n_estimators=500)

In [47]:
tr = pd.read_parquet("gs://dmnet/heloc/campaign/42.prod/dmatrix/tr_gbm.parquet")

In [61]:
te = pd.read_parquet("gs://dmnet/heloc/campaign/42.prod/dmatrix/te_gbm.parquet")

In [50]:
tr = tr.set_index(['record_nb', 'encrypted_nb'])
te = te.set_index(['record_nb', 'encrypted_nb'])

In [63]:
te = te.set_index(['record_nb', 'encrypted_nb'])

In [55]:
cols_to_use = list(set(tr.columns).intersection(set(srv_ones.columns)))

In [57]:
tr = tr[cols_to_use + ['y']] 

In [64]:
te = te[cols_to_use + ['y']] 

In [65]:
te.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,premier_v1_2_bca5020,premier_v1_2_iqb9510,premier_v1_2_cru8320,ficeclv9_score,premier_v1_2_hlc0438,premier_v1_2_mta1380,2nd_position_mtg_term_code_ord,premier_v1_2_cru0416,premier_v1_2_all9125,premier_v1_2_bcc7117,...,premier_v1_2_mta7438,premier_v1_2_bcc3422,premier_v1_2_bcc7228,fips_ord,premier_v1_2_reh3410,premier_v1_2_rej8120,premier_v1_2_all5835,premier_v1_2_iqf9415,premier_v1_2_mta6210,y
record_nb,encrypted_nb,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
4175,1,1269.0,1.0,66.0,667.0,0.0,1.0,0.001197,0.0,0.0,0.0,...,100.0,0.0,55.0,0.001025,7.0,14.0,3008.0,0.0,0.0,0
21448,1,26373.0,24.0,89.0,752.0,0.0,2.0,0.001197,0.0,0.0,0.0,...,22.0,0.0,0.0,0.001025,3.0,77.0,3680.0,0.0,1.0,0
70946,1,39507.0,22.0,44.0,672.0,0.0,1.0,0.001197,4.0,0.0,0.0,...,100.0,8.0,50.0,0.001025,11.0,330.0,3739.0,0.0,0.0,0
137690,1,15295.0,0.0,191.0,776.0,0.0,1.0,0.001197,0.0,0.0,0.0,...,100.0,0.0,50.0,0.001025,4.0,0.0,3961.0,0.0,0.0,0
167714,1,16848.0,16.0,0.0,652.0,0.0,1.0,0.001197,0.0,0.0,75.0,...,50.0,3.0,67.0,0.001396,7.0,0.0,2584.0,0.0,0.0,0


In [95]:
more_ones = srv_ones[cols_to_use].copy()
more_ones['y'] = 1

In [71]:
X_tr, y_tr = tr.drop('y', axis=1), tr['y']
X_tr = X_tr.fillna(0)
rf.fit(X_tr, y_tr)

RandomForestClassifier(max_depth=8, n_estimators=500, n_jobs=-1)

In [72]:
tr_preds = rf.predict_proba(X_tr)[:, 1]

In [76]:
roc_auc_score(y_score=tr_preds, y_true=y_tr)

0.6930789416753094

In [79]:
X_te, y_te = te.drop('y', axis=1), te['y']
X_te = X_te.fillna(0)

In [80]:
te_preds = rf.predict_proba(X_te)[:, 1]

In [81]:
roc_auc_score(y_score=te_preds, y_true=y_te)

0.6849188471706258

In [97]:
rf2 = RandomForestClassifier(n_estimators=500, max_depth=8, n_jobs=-1)
X_tr_plus = pd.concat([tr.drop('y', axis=1), more_ones.drop('y', axis=1)], axis=0)
y_tr_plus = pd.concat([tr['y'], more_ones['y']], axis=0)
X_tr_plus = X_tr_plus.fillna(0)
rf2.fit(X_tr_plus, y_tr_plus)

RandomForestClassifier(max_depth=8, n_estimators=500, n_jobs=-1)

In [98]:
te_preds2 = rf2.predict_proba(X_te)[:, 1]

In [99]:
roc_auc_score(y_score=te_preds2, y_true=y_te)

0.6778944215975258