# Random Forest Classifier

In [4]:
import os
import pickle
import random
import re
import scipy
import torch
import wandb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from label_processor import LabelProcessorSimplified

from lime import lime_text
from lime.lime_text import LimeTextExplainer

from sklearn import metrics
from sklearn.base import clone
from sklearn.compose import make_column_transformer
from sklearn.decomposition import  PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.inspection import permutation_importance
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import make_scorer, matthews_corrcoef
from sklearn.pipeline import make_pipeline
from sklearn.utils.class_weight import compute_class_weight

from tqdm.auto import tqdm, trange

In [5]:
save_directory = "saved/tfidf/"

In [6]:
def intersection_df(df1, df2, col1=None, col2=None):
    col1 = df1.index if col1 is None else df1[col1]
    col2 = df2.index if col2 is None else df2[col2]
    assert col1.name == col2.name
    ids = set(col1)
    ids.intersection_update(col2)
    df1 = df1[col1.isin(ids)]
    df2 = df2[col2.isin(ids)]
    return df1, df2

def predict_decision_threshold(classifier, X_train, X_test, y_train, y_test):
    y_pred_prob_train = classifier.predict_proba(X_train)[:,1]
    decision_threshold = np.quantile(y_pred_prob_train, 1 - y_train.mean())
    y_pred_prob = classifier.predict_proba(X_test)[:,1]
    y_pred = y_pred_prob > decision_threshold
    return y_pred, y_pred_prob


def evaluate(classifier, X_train, X_test, y_train, y_test):
    y_pred, y_pred_prob = predict_decision_threshold(classifier, X_train, X_test, y_train, y_test)
    
    m = {
        "precision": metrics.precision_score(y_test, y_pred),
        "recall" : metrics.recall_score(y_test, y_pred),
        "f1" : metrics.f1_score(y_test, y_pred),
        "roc_auc" : metrics.roc_auc_score(y_test, y_pred_prob),
        "average_precision": metrics.average_precision_score(y_test, y_pred_prob),
        "matthews": metrics.matthews_corrcoef(y_test,y_pred),
        "spearman":scipy.stats.spearmanr(y_pred_prob, y_test)[0]
    }
    return m

def prepend_keys(d, prepend):
    return {prepend + k:v for k,v in d.items()}

def dropna_both(ds, l):
    indxs = l.dropna().index
    return ds[indxs],  l[indxs]

### Do TF-IDF

In [4]:
## gets message dataset
messages = pd.read_pickle("saved/selected_messages.pickle")

In [5]:
## gets texter, counselor and bot messages by conversation ID
conversations_tfidf = messages.replace('bot', np.nan).dropna().groupby('conversation_id').message.agg(" MESSAGESEP ".join)
conversations_tfidf = conversations_tfidf.fillna("")

In [8]:
messages = pd.read_pickle("saved/selected_messages.pickle")
texter_messages = messages[messages.interaction == 'texter'].groupby('conversation_id').message.agg(" MESSAGESEP ".join)
counselor_messages = messages[messages.interaction == 'counselor'].groupby('conversation_id').message.agg(" MESSAGESEP ".join)
bot_messages = messages[messages.interaction == 'bot'].groupby('conversation_id').message.agg(" MESSAGESEP ".join)
conversations_tfidf = pd.DataFrame({"texter": texter_messages,"counselor":counselor_messages, "bot": bot_messages})
conversations_tfidf = conversations_tfidf.fillna("")

In [9]:
## saving the train conversations for lime explanations
train_ids = torch.load(save_directory+"/train_convos.torch")
train_convos = conversations_tfidf.loc[train_ids]
pickle.dump(train_convos, open(save_directory+"train_set_messages.pickle", "wb"))

OSError: [Errno 28] No space left on device

In [13]:
## saving the test conversations for lime explanations
test_ids = torch.load(save_directory+"/test_convos.torch")
test_convos = conversations_tfidf.loc[test_ids]
pickle.dump(test_convos, open(save_directory+"test_set_messages.pickle", "wb"))

In [10]:
del messages

In [11]:
## TFIDF model
tfidf_transformer = make_pipeline( 
#     TfidfVectorizer(min_df=0.01, max_df=0.4),
    make_column_transformer(
        (TfidfVectorizer(min_df=0.01,max_df=0.4),"texter"),
        (TfidfVectorizer(min_df=0.01,max_df=0.4),"counselor"),
        (TfidfVectorizer(min_df=0.05,max_df=0.4),"bot"),
    ),
    TruncatedSVD(1000)
)

In [12]:
## Fitting TF-IDF
print("Fitting TF-IDF")
tfidf_features = tfidf_transformer.fit_transform(conversations_tfidf)

Fitting TF-IDF


In [13]:
## Saving the TF-IDF features and conversation IDs(?)
pickle.dump(tfidf_features, open(save_directory + "dataset_tfidf.pickle","wb"))
conversations_tfidf.index.to_series().to_pickle(save_directory + "conversation_ids.pickle")

In [11]:
pickle.dump(tfidf_transformer, open(save_directory+"tfidf_transformer.pickle","wb"))

## Attach Labels

In [4]:
## get datasets
dataset_tfidf = pickle.load(open(save_directory+"dataset_tfidf.pickle","rb"))
conversation_ids = pd.read_pickle(save_directory+"conversation_ids.pickle")

In [5]:
## get ids
train_ids = torch.load(save_directory+"/train_convos.torch")
test_ids = torch.load(save_directory+"/test_convos.torch")

In [6]:
## maps numerical position in the dataset to the ids so they can be used to extract corresponding data from arrays
conversation_ids_to_idxs = pd.Series(np.arange(len(conversation_ids)), index=conversation_ids.values)
conversation_idxs_to_ids = pd.Series(conversation_ids.values, index=np.arange(len(conversation_ids)))

In [7]:
## gets labelizer df 
labelizer = torch.load("saved/labelizer.torch")
l_df = labelizer.df

In [8]:
## saves the index 
l_index = l_df.index
# with open(save_directory+'/l_index.pickle', 'wb') as f:
#     pickle.dump(l_index, f)

In [9]:
## gets numeric (not hashed) indexes, for retrieval of the feature data in the next step
# l_index = pickle.load(open(save_directory+"/l_index.pickle", "rb"))
train_indxs = conversation_ids_to_idxs.loc[sorted(train_ids.intersection(conversation_ids).intersection(l_index))].values
test_indxs = conversation_ids_to_idxs.loc[sorted(test_ids.intersection(conversation_ids).intersection(l_index))].values

In [10]:
## gets tf-idf feature data for train and test sets 
ds_train = dataset_tfidf[train_indxs]
ds_test = dataset_tfidf[test_indxs]

## train and test conversation IDs
train_convo_ids = conversation_idxs_to_ids.loc[sorted(train_indxs)].values
test_convo_ids = conversation_idxs_to_ids.loc[sorted(test_indxs)].values

In [11]:
## gets the labels of the train and test sets 
l_train = l_df.loc[train_convo_ids].copy()
l_test = l_df.loc[test_convo_ids].copy()

l_train.set_index(np.arange(len(l_train)), inplace=True)
l_test.set_index(np.arange(len(l_test)), inplace=True)

In [12]:
## save data 
## for some reason, can't torch.save ds_train

with open(save_directory+'ds_train.pickle', 'wb') as f:
    pickle.dump(ds_train, f)
with open(save_directory+'ds_test.pickle', 'wb') as f:
    pickle.dump(ds_test, f)

In [13]:
## save labels 
torch.save(l_train, save_directory+"l_train.torch")
torch.save(l_test, save_directory+"l_test.torch")

# Train TF-IDF Model

In [7]:
## loading dataset and labels for the random forest classifier
ds_train = pickle.load(open(save_directory+"/ds_train.pickle","rb"))
ds_test = pickle.load(open(save_directory+"/ds_test.pickle", "rb"))
l_train = torch.load(save_directory+"l_train.torch")
l_test = torch.load(save_directory+"l_test.torch")

In [8]:
l_train = l_train.rename(columns={"69>13 or younger" : "69_13 or younger",
                                  "69>14-24" : "69_14-24",
                                  "69>24 or younger" : "69_24 or younger"})
l_test = l_test.rename(columns={"69>13 or younger" : "69_13 or younger",
                                "69>14-24" : "69_14-24",
                                "69>24 or younger" : "69_24 or younger"})

In [9]:
def fit_with_cv(X, y, X_cv, y_cv, model, min_estimators, max_estimators, step_size, patience, log_wandb=False):
    if log_wandb:
        import wandb

    best_score = None
    no_improvement_steps = 0
    for i in trange(min_estimators, max_estimators + 1, step_size):
        if no_improvement_steps > patience:
            return
        model.n_estimators = i
        model.fit(X, y)
        prob_cv = model.predict_proba(X_cv)
        score = scipy.stats.spearmanr(y_cv, prob_cv[:, 1])
        if log_wandb:
            wandb.log({"score": score, "n_estimators": model.n_estimators})
        if best_score is None or score > best_score:
            no_improvement_steps = 0
            best_score = score
        else:
            no_improvement_steps += 1

## DeepHelp Impl. 

In [10]:
config={"min_estimators":5,
"max_estimators":5000,
"max_depth":5,
"class_weight":"balanced",
"step_size":5,
"patience":3}

## list of labels to build model for
for label in tqdm([
    "18_anxiety", 
    "19_desire_x",
    "19_intent_x",
    "19_capability_x",
    "19_timeframe_x",
#     "69_13 or younger",
#     "69_14-24",
#     "69_24 or younger",
    "18_substance", 
    "18_depressed", 
    "18_self_harm",
#     "18_suicide",
    "64_Yes",
# #     "65>1 (slightly helpful)",
# #     "65>2",
# #     "65>3",
# #     "65>4",
# #     "241_I see a therapist or doctor in person",
#     "74_Heterosexual or Straight",
#     "75_White",
#     "73_Male",
# #     "33_good"
]):

    wandb.init(project="tfidf_baseline",reinit=True,
               config={
                   **config,
                "label":label
               },tags=['tfidf'])
    
    min_estimators = wandb.config.min_estimators
    max_estimators = wandb.config.max_estimators
    max_depth = wandb.config.max_depth
    class_weight = wandb.config.class_weight
    step_size = wandb.config.step_size
    patience = wandb.config.patience

    X, y = dropna_both(ds_train, l_train[label].astype('float'))
    X_test, y_test = dropna_both(ds_test, l_test[label].astype('float'))

    print(X.shape, y.shape)
    print(X_test.shape, y_test.shape)

    X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.1)

    u = np.unique(y_train)
    class_weight = {i:j for i,j in zip(u,compute_class_weight('balanced',u, y_train))}
    
    classifier = RandomForestClassifier(
        n_estimators = min_estimators,
        max_depth = max_depth,
        class_weight = class_weight,
        warm_start=True,
        n_jobs=2)
    
    fit_with_cv(X_train,y_train, X_val, y_val,
                model=classifier, 
                min_estimators = min_estimators, 
                max_estimators = max_estimators, 
                step_size = step_size, 
                patience = patience, 
                log_wandb=True)

    wandb.log({
        **prepend_keys(evaluate(classifier, X_train, X_val, y_train, y_val), "val "),
        **prepend_keys(evaluate(classifier, X_train, X_test, y_train, y_test), "test ")
    })
    print(label)
    print(pd.Series(evaluate(classifier, X_train, X_test, y_train, y_test)))
    wandb.join()

#     with open(f"models/rf/{label}_model.pickle", 'wb') as f:
#         pickle.dump(classifier, f)       
        

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

[34m[1mwandb[0m: Currently logged in as: [33mkiatann[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(341955, 1000) (341955,)
(17997, 1000) (17997,)


99894     0.0
233564    1.0
121034    0.0
90967     1.0
         ... 
176932    1.0
57622     0.0
132151    0.0
168208    0.0
340571    0.0
Name: 18_anxiety, Length: 307759, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


18_anxiety
precision            0.545761
recall               0.544058
f1                   0.544908
roc_auc              0.784782
average_precision    0.582128
matthews             0.408153
spearman             0.416057
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,95.0
_runtime,317.0
_timestamp,1629260321.0
_step,19.0
val precision,0.53734
val recall,0.52487
val f1,0.53103
val roc_auc,0.77676
val average_precision,0.56938
val matthews,0.3946


0,1
n_estimators,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
_runtime,▁▁▂▂▃▃▃▄▄▄▅▅▆▆▆▇▇▇██
_timestamp,▁▁▂▂▃▃▃▄▄▄▅▅▆▆▆▇▇▇██
_step,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


275816    0.0
138556    0.0
304867    0.0
69098     0.0
         ... 
249199    1.0
294995    1.0
115520    1.0
209279    0.0
77944     1.0
Name: 19_desire_x, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


19_desire_x
precision            0.689363
recall               0.687665
f1                   0.688513
roc_auc              0.862074
average_precision    0.739343
matthews             0.546859
spearman             0.581615
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,85.0
_runtime,276.0
_timestamp,1629260604.0
_step,17.0
val precision,0.68492
val recall,0.68898
val f1,0.68695
val roc_auc,0.85652
val average_precision,0.73875
val matthews,0.54079


0,1
n_estimators,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇██
_runtime,▁▁▂▂▃▃▄▄▄▅▅▆▆▇▇▇██
_timestamp,▁▁▂▂▃▃▄▄▄▅▅▆▆▇▇▇██
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


273957    0.0
70082     0.0
329618    0.0
311440    0.0
         ... 
91981     0.0
275528    0.0
21811     0.0
323870    1.0
257947    1.0
Name: 19_intent_x, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


19_intent_x
precision            0.648103
recall               0.642597
f1                   0.645338
roc_auc              0.894752
average_precision    0.679430
matthews             0.567096
spearman             0.527095
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,160.0
_runtime,521.0
_timestamp,1629261132.0
_step,32.0
val precision,0.63846
val recall,0.63345
val f1,0.63595
val roc_auc,0.88914
val average_precision,0.66247
val matthews,0.55386


0,1
n_estimators,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇███
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


18492     1.0
24380     0.0
142901    0.0
159597    0.0
         ... 
117564    0.0
43316     0.0
140919    0.0
93674     0.0
124488    0.0
Name: 19_capability_x, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

wandb: Network error (ReadTimeout), entering retry loop.
wandb: ERROR Error while calling W&B API: Error 1040: Too many connections (<Response [500]>)
[34m[1mwandb[0m: Network error resolved after 0:05:43.527669, resuming normal operation.



19_capability_x
precision            0.611890
recall               0.594963
f1                   0.603308
roc_auc              0.906054
average_precision    0.626713
matthews             0.543304
spearman             0.478245
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,195.0
_runtime,626.0
_timestamp,1629261766.0
_step,39.0
val precision,0.58967
val recall,0.58826
val f1,0.58896
val roc_auc,0.90131
val average_precision,0.61256
val matthews,0.52554


0,1
n_estimators,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_timestamp,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


128468    0.0
132794    1.0
244955    0.0
205205    0.0
         ... 
321080    0.0
119145    1.0
96618     0.0
6482      0.0
302625    0.0
Name: 19_timeframe_x, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


19_timeframe_x
precision            0.525741
recall               0.504869
f1                   0.515094
roc_auc              0.917050
average_precision    0.512849
matthews             0.477565
spearman             0.377063
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,80.0
_runtime,266.0
_timestamp,1629262043.0
_step,16.0
val precision,0.50697
val recall,0.50617
val f1,0.50657
val roc_auc,0.91616
val average_precision,0.52166
val matthews,0.46784


0,1
n_estimators,▁▁▂▂▃▃▄▄▅▅▆▆▇▇██
_runtime,▁▁▂▂▃▃▄▄▅▅▆▆▆▇▇██
_timestamp,▁▁▂▂▃▃▄▄▅▅▆▆▆▇▇██
_step,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇██
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


179734    0.0
176479    0.0
22960     0.0
322029    1.0
         ... 
208484    0.0
185284    0.0
27318     0.0
157631    0.0
333076    0.0
Name: 18_substance, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


18_substance
precision            0.216216
recall               0.193548
f1                   0.204255
roc_auc              0.888767
average_precision    0.157388
matthews             0.199398
spearman             0.110913
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,150.0
_runtime,489.0
_timestamp,1629262539.0
_step,30.0
val precision,0.17961
val recall,0.14859
val f1,0.16264
val roc_auc,0.85692
val average_precision,0.12322
val matthews,0.15784


0,1
n_estimators,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


134549    0.0
284780    0.0
10552     0.0
83828     0.0
         ... 
214448    0.0
323104    0.0
13852     1.0
107123    0.0
24904     0.0
Name: 18_depressed, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


18_depressed
precision            0.464809
recall               0.465150
f1                   0.464980
roc_auc              0.739175
average_precision    0.474153
matthews             0.309397
spearman             0.346088
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,255.0
_runtime,825.0
_timestamp,1629263373.0
_step,51.0
val precision,0.46539
val recall,0.47266
val f1,0.469
val roc_auc,0.73645
val average_precision,0.47912
val matthews,0.31021


0,1
n_estimators,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


13880     0.0
196347    1.0
124988    0.0
259317    0.0
         ... 
6600      0.0
220022    0.0
341856    0.0
57080     0.0
248964    0.0
Name: 18_self_harm, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


18_self_harm
precision            0.550955
recall               0.547122
f1                   0.549032
roc_auc              0.898363
average_precision    0.536889
matthews             0.506187
spearman             0.389073
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,125.0
_runtime,407.0
_timestamp,1629263791.0
_step,25.0
val precision,0.55798
val recall,0.54167
val f1,0.5497
val roc_auc,0.89026
val average_precision,0.5238
val matthews,0.50795


0,1
n_estimators,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
_runtime,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇███
_timestamp,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇███
_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(52568, 1000) (52568,)
(2710, 1000) (2710,)


157416    1.0
171115    0.0
223106    1.0
1148      1.0
         ... 
239396    1.0
85563     1.0
84287     1.0
262507    1.0
319588    1.0
Name: 64_Yes, Length: 47311, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


64_Yes
precision            0.910504
recall               0.919389
f1                   0.914925
roc_auc              0.783043
average_precision    0.955571
matthews             0.325233
spearman             0.330020
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,310.0
_runtime,148.0
_timestamp,1629263946.0
_step,62.0
val precision,0.9062
val recall,0.9159
val f1,0.91103
val roc_auc,0.74488
val average_precision,0.94613
val matthews,0.28576


0,1
n_estimators,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_runtime,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_timestamp,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇████
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁





## Current

In [7]:
config={"min_estimators":5,
"max_estimators":5000,
"max_depth":5,
"class_weight":"balanced",
"step_size":5,
"patience":3}

## list of labels to build model for
for label in tqdm([
    "18_anxiety", 
    "19_desire_x",
    "19_intent_x",
    "19_capability_x",
    "19_timeframe_x",
#     "69_13 or younger",
#     "69_14-24",
#     "69_24 or younger",
    "18_substance", 
    "18_depressed", 
    "18_self_harm",
#     "18_suicide",
    "64_Yes",
# #     "65>1 (slightly helpful)",
# #     "65>2",
# #     "65>3",
# #     "65>4",
# #     "241_I see a therapist or doctor in person",
#     "74_Heterosexual or Straight",
#     "75_White",
#     "73_Male",
# #     "33_good"
]):

    wandb.init(project="tfidf_baseline",reinit=True,
               config={
                   **config,
                "label":label
               },tags=['tfidf'])
    
    min_estimators = wandb.config.min_estimators
    max_estimators = wandb.config.max_estimators
    max_depth = wandb.config.max_depth
    class_weight = wandb.config.class_weight
    step_size = wandb.config.step_size
    patience = wandb.config.patience

    X, y = dropna_both(ds_train, l_train[label].astype('float'))
    X_test, y_test = dropna_both(ds_test, l_test[label].astype('float'))

    print(X.shape, y.shape)
    print(X_test.shape, y_test.shape)

    X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.1)

    u = np.unique(y_train)
    class_weight = {i:j for i,j in zip(u,compute_class_weight('balanced',u, y_train))}
    
    classifier = RandomForestClassifier(
        n_estimators = min_estimators,
        max_depth = max_depth,
        class_weight = class_weight,
        warm_start=True,
        n_jobs=2)
    
    fit_with_cv(X_train,y_train, X_val, y_val,
                model=classifier, 
                min_estimators = min_estimators, 
                max_estimators = max_estimators, 
                step_size = step_size, 
                patience = patience, 
                log_wandb=True)

    wandb.log({
        **prepend_keys(evaluate(classifier, X_train, X_val, y_train, y_val), "val "),
        **prepend_keys(evaluate(classifier, X_train, X_test, y_train, y_test), "test ")
    })
    print(label)
    print(pd.Series(evaluate(classifier, X_train, X_test, y_train, y_test)))
    wandb.join()

#     with open(f"models/rf/{label}_model.pickle", 'wb') as f:
#         pickle.dump(classifier, f)       
        

HBox(children=(FloatProgress(value=0.0, max=9.0), HTML(value='')))

[34m[1mwandb[0m: Currently logged in as: [33mkiatann[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(341955, 1000) (341955,)
(17997, 1000) (17997,)


74964     0.0
262405    0.0
297944    1.0
234886    0.0
         ... 
99883     0.0
272188    1.0
283125    1.0
125815    0.0
336372    0.0
Name: 18_anxiety, Length: 307759, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


18_anxiety
precision            0.544933
recall               0.547419
f1                   0.546173
roc_auc              0.786512
average_precision    0.600602
matthews             0.409117
spearman             0.418585
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,100.0
_runtime,325.0
_timestamp,1629214743.0
_step,20.0
val precision,0.53825
val recall,0.53605
val f1,0.53715
val roc_auc,0.78306
val average_precision,0.5866
val matthews,0.40025


0,1
n_estimators,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
_runtime,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇▇██
_timestamp,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▆▇▇▇██
_step,▁▁▂▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇▇██
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


2650      1.0
260216    0.0
29605     1.0
224920    1.0
         ... 
6607      0.0
313900    1.0
166523    0.0
296506    0.0
124625    0.0
Name: 19_desire_x, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


19_desire_x
precision            0.672663
recall               0.671124
f1                   0.671893
roc_auc              0.849922
average_precision    0.727291
matthews             0.522661
spearman             0.562094
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,125.0
_runtime,408.0
_timestamp,1629215159.0
_step,25.0
val precision,0.681
val recall,0.68131
val f1,0.68116
val roc_auc,0.85118
val average_precision,0.73463
val matthews,0.53202


0,1
n_estimators,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
_runtime,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇███
_timestamp,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇███
_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


82828     0.0
21162     0.0
151469    0.0
312845    0.0
         ... 
58412     0.0
301232    0.0
288505    0.0
26814     1.0
94688     0.0
Name: 19_intent_x, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


19_intent_x
precision            0.604211
recall               0.600728
f1                   0.602465
roc_auc              0.874084
average_precision    0.635796
matthews             0.514610
spearman             0.499497
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,90.0
_runtime,303.0
_timestamp,1629215469.0
_step,18.0
val precision,0.6025
val recall,0.60003
val f1,0.60126
val roc_auc,0.87499
val average_precision,0.63652
val matthews,0.51166


0,1
n_estimators,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
_runtime,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██
_timestamp,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▇▇▇██
_step,▁▁▂▂▃▃▃▄▄▅▅▅▆▆▆▇▇██
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


266855    0.0
166816    0.0
99421     1.0
124191    0.0
         ... 
48719     0.0
244026    0.0
114188    0.0
32967     0.0
226930    0.0
Name: 19_capability_x, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


19_capability_x
precision            0.599074
recall               0.587944
f1                   0.593457
roc_auc              0.892218
average_precision    0.605963
matthews             0.531587
spearman             0.461950
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,75.0
_runtime,252.0
_timestamp,1629215728.0
_step,15.0
val precision,0.58028
val recall,0.5864
val f1,0.58332
val roc_auc,0.88922
val average_precision,0.59564
val matthews,0.51808


0,1
n_estimators,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
_runtime,▁▂▂▃▃▄▄▄▅▅▆▆▇▇██
_timestamp,▁▂▂▃▃▄▄▄▅▅▆▆▇▇██
_step,▁▁▂▂▃▃▄▄▅▅▆▆▇▇██
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


334182    0.0
309926    0.0
229060    0.0
343847    0.0
         ... 
77899     0.0
68499     0.0
317680    0.0
102397    0.0
303365    0.0
Name: 19_timeframe_x, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


19_timeframe_x
precision            0.515873
recall               0.486891
f1                   0.500963
roc_auc              0.905833
average_precision    0.501670
matthews             0.462817
spearman             0.366922
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,85.0
_runtime,295.0
_timestamp,1629216029.0
_step,17.0
val precision,0.53038
val recall,0.51797
val f1,0.5241
val roc_auc,0.90333
val average_precision,0.51606
val matthews,0.4878


0,1
n_estimators,▁▁▂▂▃▃▄▄▅▅▅▆▆▇▇██
_runtime,▁▁▂▂▃▃▃▄▄▅▅▆▆▇▇▇██
_timestamp,▁▁▂▂▃▃▃▄▄▅▅▆▆▇▇▇██
_step,▁▁▂▂▃▃▃▄▄▅▅▆▆▆▇▇██
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


113958    0.0
87810     0.0
204651    0.0
138513    0.0
         ... 
122301    0.0
217174    0.0
65691     0.0
191168    0.0
329220    0.0
Name: 18_substance, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


18_substance
precision            0.266129
recall               0.266129
f1                   0.266129
roc_auc              0.910995
average_precision    0.183463
matthews             0.261083
spearman             0.117254
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,155.0
_runtime,516.0
_timestamp,1629216551.0
_step,31.0
val precision,0.28641
val recall,0.22868
val f1,0.25431
val roc_auc,0.89094
val average_precision,0.18346
val matthews,0.25093


0,1
n_estimators,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
_runtime,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▇▇▇▇███
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


194679    1.0
93569     1.0
213531    0.0
253878    0.0
         ... 
195009    1.0
323652    0.0
297830    0.0
187034    0.0
217497    0.0
Name: 18_depressed, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


18_depressed
precision            0.456213
recall               0.456102
f1                   0.456158
roc_auc              0.726641
average_precision    0.468368
matthews             0.298109
spearman             0.327951
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,135.0
_runtime,441.0
_timestamp,1629217000.0
_step,27.0
val precision,0.46608
val recall,0.45405
val f1,0.45998
val roc_auc,0.73235
val average_precision,0.47186
val matthews,0.3032


0,1
n_estimators,▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██
_runtime,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
_timestamp,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▆▆▆▆▇▇▇▇███
_step,▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇██
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(344988, 1000) (344988,)
(18157, 1000) (18157,)


344876    1.0
303484    0.0
266764    0.0
131900    0.0
         ... 
211389    0.0
307502    1.0
42924     0.0
59593     0.0
188156    0.0
Name: 18_self_harm, Length: 310489, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


18_self_harm
precision            0.563522
recall               0.566730
f1                   0.565121
roc_auc              0.916953
average_precision    0.567600
matthews             0.523517
spearman             0.407230
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,145.0
_runtime,484.0
_timestamp,1629217491.0
_step,29.0
val precision,0.55841
val recall,0.5633
val f1,0.56084
val roc_auc,0.90729
val average_precision,0.54627
val matthews,0.51929


0,1
n_estimators,▁▁▁▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▇▇▇▇▇██
_runtime,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_timestamp,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
_step,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇███
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁


[34m[1mwandb[0m: wandb version 0.12.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


(52568, 1000) (52568,)
(2710, 1000) (2710,)


168278    0.0
281446    1.0
7797      1.0
176774    1.0
         ... 
274848    0.0
201327    1.0
234718    0.0
58675     1.0
313178    0.0
Name: 64_Yes, Length: 47311, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))


64_Yes
precision            0.901879
recall               0.916419
f1                   0.909091
roc_auc              0.745061
average_precision    0.943469
matthews             0.263274
spearman             0.285734
dtype: float64


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
n_estimators,125.0
_runtime,63.0
_timestamp,1629217563.0
_step,25.0
val precision,0.89839
val recall,0.91493
val f1,0.90658
val roc_auc,0.73496
val average_precision,0.94078
val matthews,0.2502


0,1
n_estimators,▁▁▂▂▂▂▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
_runtime,▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇███
_timestamp,▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▆▆▆▆▇▇▇███
_step,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
val precision,▁
val recall,▁
val f1,▁
val roc_auc,▁
val average_precision,▁
val matthews,▁



