In [5]:
%load_ext autoreload
%autoreload 2

In [6]:
import random
import unidecode
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import time
import torch.nn.functional as F
from keras.preprocessing import text # depreciated?
from torch.utils.data import Dataset, DataLoader,TensorDataset
from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.sequence import pad_sequences
import gc
import re
import pickle
import csv
from tqdm import tqdm
tqdm.pandas()
from gensim.models import KeyedVectors
from flashtext import KeywordProcessor

CRAWL_EMBEDDING_PATH = 'data/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
PARAD_EMBEDDING_PATH = 'data/paragram-dandrocec/paragram_300_sl999.txt'
GLOVE_EMBEDDING_PATH = 'data/glove.840B.300d.txt'

BATCH_SIZE = 256
EPOCHS = 5
MAX_LEN = 220
NUM_MODEL = 3
SEED = 6089

In [7]:
from utils import seed_everything

# Prepare data

In [8]:
ls ../data

cleaned_reddit.csv
cleaned_reddit_lemmatized_60percent.csv
cleaned_reddit_lemmatized.csv
crawl_matrix_reddit_mental_health_cut.npy
crawl_matrix_reddit_mental_health_cut_v2.npy
crawl_matrix_reddit_mental_health_cut_v3.npy
crawl_matrix_reddit_mental_health_cut_v4.npy
crawl_oov_reddit_mental_health_cut.pickle
crawl_oov_reddit_mental_health_cut_v2.pickle
crawl_oov_reddit_mental_health_cut_v3.pickle
mental_health_sentence_embedding.pickle
test_roberta.npy
train_roberta.npy


In [10]:
df = pd.read_csv('../data/cleaned_reddit_lemmatized_60percent.csv')

In [11]:
train_df = df[df["date_year"].isin([2018, 2019])]
test_df = df[df["date_year"]==2020]

train_df = train_df.reset_index(drop = True)
test_df = test_df.reset_index(drop = True)

In [12]:
print(train_df.shape)
print(test_df.shape)

(452452, 30)
(184143, 30)


In [13]:
train_roberta = np.load('../data/train_roberta.npy')
test_roberta = np.load('../data/test_roberta.npy')

In [14]:
y_train = train_df['label'].values

In [16]:
x_train = train_roberta

In [20]:
x_test = test_roberta

# Model

# Prepare training

In [129]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import f1_score

from model import NeuralNetBase

In [69]:
x_test_tensor = torch.tensor(x_test)
test_data = torch.utils.data.TensorDataset(x_test_tensor)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

In [62]:
device = 'cuda:0'

In [124]:
EPOCHS = 15

In [125]:
final_test = list()
val_f1_score = list()

NFOLDS = 5

skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

for fold_, (trn_idx, val_idx) in enumerate(skf.split(x_train, y_train)):
        
    print("Fold: {}/{}".format(fold_ + 1, NFOLDS))
     
    y_train_fold = torch.tensor(y_train[trn_idx], dtype=torch.float).to(device)
    y_val = torch.tensor(y_train[val_idx], dtype=torch.float).to(device)
    x_train_roberta = torch.tensor(train_roberta[trn_idx], dtype=torch.float).to(device)
    x_val_roberta = torch.tensor(x_train[val_idx], dtype=torch.float).to(device)
    
    train_data = torch.utils.data.TensorDataset(x_train_roberta, y_train_fold)
    val_data = torch.utils.data.TensorDataset(x_val_roberta, y_val)

    train_loader = torch.utils.data.DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_data, batch_size=BATCH_SIZE, shuffle=False)

    net = NeuralNetBase(256, 1).to(device)
    loss_fn = torch.nn.BCELoss(reduction='mean')
    optimizer = torch.optim.AdamW(params =  net.parameters(), lr=0.002, weight_decay=1e-7)

    test_checkpoint = list()
    loss_checkpoint = list()
    val_f1_epoch = list()
    
    for epoch in range(EPOCHS): 
        
        start_time = time.time()

        avg_loss = 0.0
        
        net.train()
        for i, data in enumerate(train_loader):
            
            # get the inputs
            inputs_roberta, labels = data
            inputs_roberta = inputs_roberta.to(device)
            
            pred1 = net(inputs_roberta)
            loss1 = loss_fn(pred1, labels.unsqueeze(1))
            
            optimizer.zero_grad()
            loss1.backward()
            optimizer.step()

            avg_loss += loss1.item()

        net.eval()
        
        valid_preds = np.zeros((len(val_idx),))
        true_label = np.zeros((len(val_idx),))

        avg_val_loss = 0.0

        for j, data in enumerate(val_loader):
            
            # get the inputs
            inputs_roberta, labels = data
            inputs_roberta = inputs_roberta.to(device)
            
            pred1 = net(inputs_roberta)
            loss1_val = loss_fn(pred1, labels.unsqueeze(1))
            avg_val_loss += loss1_val.item()
            
            valid_preds[j * BATCH_SIZE:(j+1) * BATCH_SIZE] = (pred1.squeeze().cpu().detach().numpy()>=0.5).astype(float)
            true_label[j * BATCH_SIZE:(j+1) * BATCH_SIZE]  = labels.cpu().detach().numpy()
            
        elapsed_time = time.time() - start_time 

        print('Epoch {}/{} \t loss={:.4f}\t val_loss={:.4f} \t val_f1_score={:.4f} \t time={:.2f}s'.format(
                        epoch+1, EPOCHS, avg_loss/len(train_loader),avg_val_loss/len(val_loader), f1_score(true_label, valid_preds, average='micro'), elapsed_time))
        val_f1_epoch.append(f1_score(true_label, valid_preds, average='micro'))
        
        ## inference
        result = list()
        with torch.no_grad():
            for inputs_roberta in test_loader:
                inputs_roberta = inputs_roberta[0].to(device)
                y_pred = net(inputs_roberta)
                
                y_pred = y_pred.cpu().detach().numpy()
                result.extend(y_pred)

        test_checkpoint.append(result)
        loss_checkpoint.append(avg_val_loss)
        print(f1_score(test_df.label.values, np.array(test_checkpoint[-1]) >= 0.5, average='micro'))
        
        
    final_test.append(test_checkpoint[np.argmin(loss_checkpoint)])
    val_f1_score.append(val_f1_epoch[np.argmin(loss_checkpoint)])
    with open("final_test_{}".format(fold_), "wb") as fp: 
        pickle.dump(final_test, fp)



Fold: 1/5
Epoch 1/15 	 loss=0.1068	 val_loss=0.0934 	 val_f1_score=0.9657 	 time=5.23s
0.9676772942767305
Epoch 2/15 	 loss=0.0850	 val_loss=0.0888 	 val_f1_score=0.9681 	 time=5.46s
0.9690132125576318
Epoch 3/15 	 loss=0.0776	 val_loss=0.0856 	 val_f1_score=0.9691 	 time=5.34s
0.9695779910178502
Epoch 4/15 	 loss=0.0713	 val_loss=0.0870 	 val_f1_score=0.9693 	 time=5.09s
0.9697571995677272
Epoch 5/15 	 loss=0.0662	 val_loss=0.0850 	 val_f1_score=0.9699 	 time=5.24s
0.9688991707531647
Epoch 6/15 	 loss=0.0611	 val_loss=0.0901 	 val_f1_score=0.9688 	 time=5.10s
0.9693553379710333
Epoch 7/15 	 loss=0.0566	 val_loss=0.0892 	 val_f1_score=0.9695 	 time=4.93s
0.9685461842155282
Epoch 8/15 	 loss=0.0531	 val_loss=0.0918 	 val_f1_score=0.9696 	 time=5.13s
0.9690783793030416
Epoch 9/15 	 loss=0.0501	 val_loss=0.0926 	 val_f1_score=0.9694 	 time=5.31s
0.9686656565821128
Epoch 10/15 	 loss=0.0473	 val_loss=0.0988 	 val_f1_score=0.9693 	 time=5.04s
0.9687525455759926
Epoch 11/15 	 loss=0.0443	 va

In [114]:
np.array(test_checkpoint[0]) >= 0.5

array([[ True],
       [ True],
       [False],
       ...,
       [False],
       [False],
       [False]])

In [117]:
print('mean val f1 score:', np.mean(val_f1_score))

mean val f1 score: 0.971537311897616


In [74]:
from sklearn.metrics import f1_score, recall_score, precision_score

def threshold_search_fold(y_true, y_proba):

    binary_best_threshold = 0
    binary_best_score = 0
    
    for threshold in tqdm([i * 0.01 for i in range(100)], disable=True):
 
        binary_score = f1_score(y_true, np.where(y_proba>=threshold , 1 ,0), average='micro')
        if binary_score > binary_best_score:
            binary_best_threshold = threshold
            binary_best_score = binary_score
            
    recall = recall_score(y_true, np.where(y_proba>=binary_best_threshold , 1 ,0), average='micro')
    precission = precision_score(y_true, np.where(y_proba>=binary_best_threshold , 1 ,0), average='micro')
    print('best_threshold_recall:', recall)
    print('best_threshold_precision:', precission)
    
    search_result = {'f1_binary_threshold': binary_best_threshold, 'f1_binary': binary_best_score,}
    return search_result

In [126]:
predicted_prob = np.mean(final_test, axis=0)

In [128]:
search_resutls = threshold_search_fold(test_df.label.values, predicted_prob)
search_resutls

best_threshold_recall: 0.97236386938412
best_threshold_precision: 0.97236386938412


{'f1_binary_threshold': 0.49, 'f1_binary': 0.97236386938412}

In [None]:
predicted_prob = (predicted_prob>=0.5).astype(float)