In [1]:
# import kaggle
# !kaggle datasets download -d lancelotzty/redis-depression

In [1]:
%load_ext autoreload
%autoreload 2
# %pip install jupyter-autotime
%load_ext autotime

In [2]:
import random
import unidecode
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import time
import torch.nn.functional as F
from keras.preprocessing import text
from torch.utils.data import Dataset, DataLoader,TensorDataset
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
import emoji
import string
from keras.preprocessing.sequence import pad_sequences
import gc
import re
import pickle
import csv
from tqdm import tqdm
tqdm.pandas()
from gensim.models import KeyedVectors
from flashtext import KeywordProcessor

In [3]:
from utils import seed_everything

# Config

In [4]:
# different embedding paths
CRAWL_EMBEDDING_PATH = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
PARAD_EMBEDDING_PATH = '../input/paragram-dandrocec/paragram_300_sl999.txt'
GLOVE_EMBEDDING_PATH = 'glove.840B.300d.txt'

SEED = 6089

seed_everything(SEED)

# Load Data

In [5]:
raw_train_df = pd.read_csv('train_df_cut.csv')
raw_test_df = pd.read_csv('test_df_cut.csv')

In [6]:
print(raw_train_df.shape)
print(raw_test_df.shape)

(848791, 33)
(212198, 33)


In [7]:
# using a subset first to test
train_df = raw_train_df.sample(10000)
# test_df = test_df.sample(frac=0.01)

train_df = train_df.reset_index(drop = True)
test_df = raw_test_df.reset_index(drop = True)

# Process

## Process labels

In [8]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

#creating instance of one-hot-encoder
encoder = OneHotEncoder(handle_unknown='ignore')
#perform one-hot encoding on 'team' column 
encoder_df = pd.DataFrame(encoder.fit_transform(train_df[['subreddit']]).toarray())
# #merge one-hot encoded columns back with original DataFrame
train_df = train_df.join(encoder_df)
aux_y_columns = encoder.get_feature_names().tolist()
train_df = train_df.rename(columns = {i: c for i, c in enumerate(aux_y_columns)})

In [10]:
le = LabelEncoder()
le.fit(train_df.subreddit)
train_df['categorical_label'] = le.transform(train_df.subreddit)

In [13]:
weights = np.ones(len(train_df))
y_aux_train = np.array(train_df[aux_y_columns])
y_train = np.vstack([train_df['label'], weights]).T
y_train = np.concatenate((y_train, y_aux_train), axis=1)  # [n, 28]

## Process data

In [14]:
from preprocess import content_preprocessing

In [17]:
train_df['n_words'] = train_df['post'].apply(lambda x: len(x.split(' ')))
test_df['n_words'] = test_df['post'].apply(lambda x: len(x.split(' ')))

np.percentile(train_df.n_words, 84), np.percentile(test_df.n_words, 84)

(292.0, 301.0)

In [16]:
### preprocessing
x_train = train_df["post"].progress_apply(lambda x: content_preprocessing(x))
x_test = test_df["post"].progress_apply(lambda x: content_preprocessing(x))

  text = BeautifulSoup(text, 'lxml').get_text()
100%|██████████| 10000/10000 [00:11<00:00, 850.62it/s]
100%|██████████| 212198/212198 [04:08<00:00, 855.26it/s]


In [18]:
tokenizer = text.Tokenizer(filters='', lower=False)
tokenizer.fit_on_texts(list(x_train)+list(x_test))

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

x_train = pad_sequences(x_train, maxlen=MAX_LEN, padding='post')
x_test = pad_sequences(x_test, maxlen=MAX_LEN, padding='post')

## Embed data

In [20]:
from train import load_embeddings, build_matrix

In [23]:
glove_embedding = load_embeddings(GLOVE_EMBEDDING_PATH)

glove_matrix , oov2 = build_matrix(tokenizer.word_index, glove_embedding)

100%|██████████| 214816/214816 [00:00<00:00, 262498.91it/s]


In [58]:
device = 'cuda:1'

x_test_tensor = torch.tensor(x_test, dtype=torch.long).to(device)
test_data = torch.utils.data.TensorDataset(x_test_tensor)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False)

# Train

In [25]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import f1_score

from model import custom_loss, NeuralNet

In [22]:
def prepare_data_loader(x_data, y_data, BATCH_SIZE, device):
    x_data = torch.tensor(x_data, dtype=torch.long).to(device)
    y_data = torch.tensor(y_data, dtype=torch.float).to(device)
    
    data = torch.utils.data.TensorDataset(x_data, y_data)
    data_loader = torch.utils.data.DataLoader(data, batch_size=BATCH_SIZE, shuffle=True)
    
    return data_loader

In [52]:
gc.collect()
torch.cuda.empty_cache()

In [54]:
BATCH_SIZE = 96
EPOCHS = 20
MAX_LEN = 300
NUM_MODEL = 1

In [60]:
final_test = list()
val_f1_score = list()
NFOLDS = 5

skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=SEED)

for fold_, (trn_idx, val_idx) in enumerate(skf.split(x_train, train_df['categorical_label'].values)):
        
    print("Fold: {}/{}".format(fold_ + 1, NFOLDS))
     
    train_loader = prepare_data_loader(x_train[trn_idx], y_train[trn_idx], BATCH_SIZE, device)
    val_loader = prepare_data_loader(x_train[val_idx], y_train[val_idx], BATCH_SIZE, device)

    net = NeuralNet(glove_matrix, 300, MAX_LEN).to(device)
    loss_fn = torch.nn.BCELoss(reduction='mean')

    optimizer = torch.optim.AdamW(params=net.parameters(), lr=0.002, weight_decay=1e-7)

    test_checkpoint = list()
    loss_checkpoint = list()
    val_f1_epoch = list()
    
    for epoch in range(EPOCHS): 
        
        start_time = time.time()

        avg_loss = 0.0
        
        net.train()
        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):

            # get the inputs
            inputs, labels = data
            
            label1 = labels[:,:2]
            label2 = labels[:,2:]

            pred1, pred2 = net(inputs)
            
            loss1 = custom_loss(pred1, label1)
            loss2 = loss_fn(pred2, label2)
            loss_weight = 1
            loss = loss1*loss_weight + loss2
            # loss = loss2
           
            # zero the parameter gradients
            optimizer.zero_grad()

            loss.backward()
            optimizer.step()

            avg_loss += loss1.item()

        net.eval()
        
        valid_preds = np.zeros((len(val_idx),))
        true_label = np.zeros((len(val_idx),))

        avg_val_loss = 0.0

        for j, data in enumerate(val_loader):
            
            # get the inputs
            inputs, labels = data
            
            val_label1 = labels[:, :2]
            val_label2 = labels[:, 2:]

            ## forward + backward + optimize
            pred1, pred2 = net(inputs)
            
            loss1_val = custom_loss(pred1, val_label1)

            avg_val_loss += loss1_val.item()
            
            valid_preds[j * BATCH_SIZE:(j+1) * BATCH_SIZE] = (pred1.squeeze().cpu().detach().numpy()>=0.5).astype(float)
            true_label[j * BATCH_SIZE:(j+1) * BATCH_SIZE]  = labels[:, 0].cpu().detach().numpy()

        elapsed_time = time.time() - start_time 

        print('Epoch {}/{} \t loss={:.4f}\t val_loss={:.4f} \t val_f1_score={:.4f} \t time={:.2f}s'.format(
                        epoch+1, EPOCHS, avg_loss/len(train_loader),avg_val_loss/len(val_loader), f1_score(true_label, valid_preds, average='micro'), elapsed_time))
        val_f1_epoch.append(f1_score(true_label, valid_preds, average='micro'))
        
        ## inference
        result = list()
        with torch.no_grad():
            for (x_batch,) in test_loader:
                y_pred, _ = net(x_batch)
                y_pred = y_pred.cpu().numpy()[:,0]
                result.extend(y_pred)

        test_checkpoint.append(result)
        loss_checkpoint.append(avg_val_loss)
        
        
    final_test.append(test_checkpoint[np.argmin(loss_checkpoint)])
    val_f1_score.append(val_f1_epoch[np.argmin(loss_checkpoint)])
    with open("final_test_{}".format(fold_), "wb") as fp:   #Pickling
        pickle.dump(final_test, fp)

Fold: 1/5


100%|██████████| 84/84 [00:20<00:00,  4.04it/s]


Epoch 1/20 	 loss=0.6948	 val_loss=0.6843 	 val_f1_score=0.5970 	 time=22.41s


100%|██████████| 84/84 [00:21<00:00,  3.99it/s]


Epoch 2/20 	 loss=0.6773	 val_loss=0.7113 	 val_f1_score=0.4030 	 time=22.69s


100%|██████████| 84/84 [00:21<00:00,  3.99it/s]


Epoch 3/20 	 loss=0.6764	 val_loss=0.7170 	 val_f1_score=0.4030 	 time=22.67s


100%|██████████| 84/84 [00:20<00:00,  4.00it/s]


Epoch 4/20 	 loss=0.6760	 val_loss=0.7098 	 val_f1_score=0.4030 	 time=22.61s


100%|██████████| 84/84 [00:20<00:00,  4.00it/s]


Epoch 5/20 	 loss=0.6759	 val_loss=0.7172 	 val_f1_score=0.4030 	 time=22.61s


100%|██████████| 84/84 [00:20<00:00,  4.00it/s]


Epoch 6/20 	 loss=0.6748	 val_loss=0.7164 	 val_f1_score=0.4030 	 time=22.61s


100%|██████████| 84/84 [00:20<00:00,  4.00it/s]


Epoch 7/20 	 loss=0.6749	 val_loss=0.7174 	 val_f1_score=0.4030 	 time=22.60s


100%|██████████| 84/84 [00:20<00:00,  4.00it/s]


Epoch 8/20 	 loss=0.6746	 val_loss=0.7046 	 val_f1_score=0.4030 	 time=22.61s


100%|██████████| 84/84 [00:20<00:00,  4.01it/s]


Epoch 9/20 	 loss=0.6749	 val_loss=0.7148 	 val_f1_score=0.4030 	 time=22.59s


100%|██████████| 84/84 [00:20<00:00,  4.00it/s]


Epoch 10/20 	 loss=0.6746	 val_loss=0.7150 	 val_f1_score=0.4030 	 time=22.61s


100%|██████████| 84/84 [00:20<00:00,  4.01it/s]


Epoch 11/20 	 loss=0.6746	 val_loss=0.7157 	 val_f1_score=0.4030 	 time=22.59s


100%|██████████| 84/84 [00:20<00:00,  4.01it/s]


Epoch 12/20 	 loss=0.6751	 val_loss=0.7044 	 val_f1_score=0.4030 	 time=22.59s


100%|██████████| 84/84 [00:20<00:00,  4.00it/s]


Epoch 13/20 	 loss=0.6747	 val_loss=0.7089 	 val_f1_score=0.4030 	 time=22.62s


100%|██████████| 84/84 [00:20<00:00,  4.01it/s]


Epoch 14/20 	 loss=0.6750	 val_loss=0.6988 	 val_f1_score=0.4030 	 time=22.58s


100%|██████████| 84/84 [00:20<00:00,  4.00it/s]


Epoch 15/20 	 loss=0.6744	 val_loss=0.7022 	 val_f1_score=0.4030 	 time=22.61s


100%|██████████| 84/84 [00:20<00:00,  4.00it/s]


Epoch 16/20 	 loss=0.6748	 val_loss=0.7110 	 val_f1_score=0.4030 	 time=22.61s


100%|██████████| 84/84 [00:20<00:00,  4.00it/s]


Epoch 17/20 	 loss=0.6743	 val_loss=0.7024 	 val_f1_score=0.4030 	 time=22.61s


100%|██████████| 84/84 [00:21<00:00,  4.00it/s]


Epoch 18/20 	 loss=0.6753	 val_loss=0.7057 	 val_f1_score=0.4030 	 time=22.64s


100%|██████████| 84/84 [00:21<00:00,  3.96it/s]


Epoch 19/20 	 loss=0.6749	 val_loss=0.7237 	 val_f1_score=0.4030 	 time=22.84s


100%|██████████| 84/84 [00:21<00:00,  4.00it/s]


Epoch 20/20 	 loss=0.6751	 val_loss=0.7114 	 val_f1_score=0.4030 	 time=22.64s




Fold: 2/5


100%|██████████| 84/84 [00:20<00:00,  4.01it/s]


RuntimeError: CUDA out of memory. Tried to allocate 114.00 MiB (GPU 1; 15.78 GiB total capacity; 12.69 GiB already allocated; 62.00 MiB free; 14.50 GiB reserved in total by PyTorch)

In [31]:
# Helper function

from sklearn.metrics import f1_score, recall_score, precision_score

def threshold_search_fold(y_true, y_proba):

    binary_best_threshold = 0
    binary_best_score = 0
    
    for threshold in tqdm([i * 0.01 for i in range(100)], disable=True):
 
        binary_score = f1_score(y_true, np.where(y_proba>=threshold , 1 ,0), average='micro')
        if binary_score > binary_best_score:
            binary_best_threshold = threshold
            binary_best_score = binary_score
            
    recall = recall_score(y_true, np.where(y_proba>=binary_best_threshold , 1 ,0), average='micro')
    precission = precision_score(y_true, np.where(y_proba>=binary_best_threshold , 1 ,0), average='micro')
    print('best_threshold_recall:', recall)
    print('best_threshold_precision:', precission)
    
    search_result = {'f1_binary_threshold': binary_best_threshold, 'f1_binary': binary_best_score,}
    return search_result

In [None]:
predicted_prob = np.mean(final_test, axis=0)
search_results = threshold_search_fold(test_df.label.values, predicted_prob)
search_results

In [None]:
f1_score(test_df.label.values, (predicted_prob>=search_results['f1_binary_threshold']).astype(float), average='micro')