In [None]:
%%capture
# !pip install text_hammer

In [None]:
import os
import random
import torch
import wandb
import logging
import transformers
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from datasets import load_dataset
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from sklearn import metrics, model_selection, preprocessing
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# wandb.login()

In [None]:
%%capture
import re
# import text_hammer as th
# from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer,TFBertModel

import warnings
warnings.filterwarnings("ignore",  category = FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)


In [None]:
def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy, and PyTorch (CPU plus all CUDA devices), and switches cuDNN into
    deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Python-level sources of randomness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch: CPU generator, the current CUDA device, then every CUDA device.
    for seed_torch in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_torch(seed)

    # cuDNN may pick non-deterministic kernels even with fixed seeds
    # unless it is explicitly told to be deterministic.
    torch.backends.cudnn.deterministic = True


seed_everything(1234)

In [None]:
# Read the reduced 8-class trend dataset from disk into a DataFrame.
dataset_csv = "/home/siu856533724/code/source-code/Social-Networks/Trend-Prediction/DataSet/trend-data/trend_dataset_class8_reduced2.csv"
df = pd.read_csv(dataset_csv)

# Show (n_rows, n_columns) as a quick sanity check.
display(df.shape)

In [None]:
df.head()

In [None]:
from utils.preprocessing import expand_contractions
# df['text'] = df['text'].apply(expand_contractions)

In [None]:
from utils.preprocessing import get_clean_dataset
# Replace `df` with the output of the project's cleaning helper.
# NOTE(review): get_clean_dataset lives in utils/preprocessing (not shown here) — confirm
# what it drops or normalises. Because `df` is overwritten, re-running this cell feeds the
# already-cleaned frame back into the helper.
df = get_clean_dataset(df)

In [None]:
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the remaining 80% is split
# again into train/validation in the next cell.
train_temp, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# Split the non-test rows 80/20 into training and validation frames
# (roughly 64% / 16% of the full dataset).
train, valid = train_test_split(
    train_temp,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [None]:
# train[[str(i) for i in range(0, 28)]].values.tolist()

🧹 Clean Data (Noise Entity Removal)

In [None]:
# def text_preprocessing(df,col_name):
#     tqdm.pandas()
#     df[col_name] = df[col_name].progress_apply(lambda x:str(x).lower())
#     df[col_name] = df[col_name].progress_apply(lambda x: th.remove_emails(x))
#     df[col_name] = df[col_name].progress_apply(lambda x: th.remove_html_tags(x))
#     df[col_name] = df[col_name].progress_apply(lambda x: th.remove_urls(x))
#     df[col_name] = df[col_name].progress_apply(lambda x: th.remove_special_chars(x))
#     df[col_name] = df[col_name].progress_apply(lambda x: th.remove_accented_chars(x))
#     df[col_name] = df[col_name].progress_apply(lambda text: th.cont_exp(text))
#     df[col_name] = df[col_name].progress_apply(lambda x: re.sub("[" 
#         u"\U0001F600-\U0001F64F"  # emoticons
#         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#         u"\U0001F680-\U0001F6FF"  # transport & map symbols
#         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#         u"\U00002702-\U000027B0"
#         u"\U000024C2-\U0001F251"
#         "]+", "", x))
#     df[col_name] = df[col_name].progress_apply(lambda x: re.sub(r'[^a-zA-Z]', ' ', x))
#     df[col_name] = df[col_name].progress_apply(lambda x: ' '.join(x.split()))
    
#     return(df)

In [None]:
# # %%capture
# train = text_preprocessing(train,'text')
# valid = text_preprocessing(valid,'text')
# test = text_preprocessing(test,'text')

🗑️ Remove Stop Words

In [None]:
# stop = stopwords.words('english')
# train['text'] = train['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
# valid['text'] = valid['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
# test['text'] = test['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

🌱 Lemmatization

In [None]:
# nltk.download('wordnet')
# !unzip "/usr/share/nltk_data/corpora/wordnet.zip" -d "/usr/share/nltk_data/corpora/"

In [None]:
# def word_lemmatizer(text):
#     lemmatizer = WordNetLemmatizer()
#     return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

# train['text'] = train['text'].apply(lambda text: word_lemmatizer(text))
# valid['text'] = valid['text'].apply(lambda text: word_lemmatizer(text))

In [None]:
from transformers import AutoModelForSequenceClassification
from utils.dataset import Dataset

def build_dataset(tokenizer_max_len, tokenizer):
    """Wrap the train/valid/test DataFrames in the project's ``Dataset`` class.

    Reads the module-level ``train``, ``valid`` and ``test`` frames. Texts come
    from the ``text`` column and labels from the columns named by
    ``get_classes()``.

    Args:
        tokenizer_max_len: Maximum token length forwarded to ``Dataset``.
        tokenizer: Tokenizer instance forwarded to ``Dataset``.

    Returns:
        Tuple ``(train_dataset, valid_dataset, test_dataset)``.
    """
    def _to_dataset(frame):
        # One Dataset per split: raw texts + per-class label rows.
        texts = frame.text.tolist()
        labels = frame[get_classes()].values.tolist()
        return Dataset(texts, labels, tokenizer, tokenizer_max_len)

    return _to_dataset(train), _to_dataset(valid), _to_dataset(test)

def build_dataloader(train_dataset, valid_dataset, test_dataset, batch_size):
    """Create the train, validation and test ``DataLoader`` objects.

    Args:
        train_dataset: Dataset used for training (shuffled every epoch).
        valid_dataset: Dataset used for per-epoch validation.
        test_dataset: Dataset used for the final test pass.
        batch_size: Training batch size. Validation uses half of it,
            but never less than 1.

    Returns:
        Tuple ``(train_data_loader, valid_data_loader, test_data_loader)``.
    """
    # int(batch_size / 2) is 0 for batch_size == 1, which DataLoader rejects.
    valid_batch_size = max(1, int(batch_size / 2))
    # The whole test set is served as a single batch; guard against an empty set.
    # NOTE(review): a single full-size batch can exhaust GPU memory on large test sets.
    test_batch_size = max(1, len(test_dataset))

    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    # Evaluation loaders are not shuffled: order does not affect the metrics
    # and a fixed order keeps evaluation runs reproducible.
    valid_data_loader = DataLoader(valid_dataset, batch_size=valid_batch_size, shuffle=False, num_workers=1)
    test_data_loader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False, num_workers=1)

    return train_data_loader, valid_data_loader, test_data_loader

def ret_model(do_prob, model_ckpt, n_labels, is_freeze):
    """Load a pretrained sequence-classification model, optionally freezing its encoder.

    Args:
        do_prob: Dropout probability. Currently unused: the model is built with
            the checkpoint's own dropout configuration.
        model_ckpt: Hugging Face checkpoint name or path.
        n_labels: Number of output labels for the classification head.
        is_freeze: When truthy (and the checkpoint is not ``'roberta-large'``),
            freeze every parameter of ``model.base_model``.

    Returns:
        The ``AutoModelForSequenceClassification`` instance.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=n_labels)

    if is_freeze and model_ckpt != 'roberta-large':
        # Freeze the encoder so only the classification head is trained.
        for param in model.base_model.parameters():
            param.requires_grad = False

        # Re-initialise the head for the target task, but only when it is a
        # plain linear layer. Some architectures (e.g. RoBERTa) use a composite
        # head module with no ``in_features`` attribute, which previously
        # raised AttributeError here; such heads are left as loaded.
        head = getattr(model, "classifier", None)
        if isinstance(head, torch.nn.Linear):
            model.classifier = torch.nn.Linear(head.in_features, n_labels)
    return model

In [None]:
def ret_optimizer(model):
    """Build the AdamW optimizer used for fine-tuning.

    Every parameter of ``model`` is optimised with the same settings; no
    per-parameter weight-decay groups are applied. The learning rate is read
    from the active W&B run (``wandb.config.learning_rate``), so this must be
    called inside ``wandb.init``.

    Args:
        model: Model whose parameters are optimised.

    Returns:
        The configured ``AdamW`` optimizer.
    """
    # The former bias/LayerNorm weight-decay grouping was built but never
    # handed to the optimizer, so it has been removed as dead code.
    return AdamW(model.parameters(), lr=wandb.config.learning_rate)

In [None]:
def ret_scheduler(optimizer, num_train_steps):
    """Return a linear learning-rate schedule with no warmup.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        num_train_steps: Total number of scheduler steps over the whole run.

    Returns:
        The scheduler created by ``get_linear_schedule_with_warmup``.
    """
    return get_linear_schedule_with_warmup(
        optimizer,
        num_training_steps=num_train_steps,
        num_warmup_steps=0,
    )

In [None]:
import torch
import torch.nn.functional as F

def loss_fn(y_pred, y_true, loss_func, alpha=0.25, gamma=2.0):
    """Compute the training/evaluation loss selected by ``loss_func``.

    Args:
        y_pred: Raw model logits.
        y_true: Targets with the same shape as ``y_pred`` (cast to float), or
            ``None`` when no labels are available.
        loss_func: One of ``'cross_entropy'``, ``'focal_loss'``,
            ``'hinge_loss'``, ``'mse_loss'``, ``'multi_label_soft_margin'``.
            Any other value (e.g. ``'bce_loss'``) selects BCE-with-logits.
        alpha: Focal-loss class-balance weight for positive targets
            (``1 - alpha`` for negatives). A negative value or ``None``
            disables the weighting. Only used by ``'focal_loss'``.
        gamma: Focal-loss focusing exponent. Only used by ``'focal_loss'``.

    Returns:
        A scalar loss tensor, or ``None`` if ``y_true`` is ``None``.
    """
    if y_true is None:
        return None

    target = y_true.float()

    if loss_func == 'cross_entropy':
        return F.cross_entropy(y_pred, target)

    if loss_func == 'focal_loss':
        # Sigmoid focal loss, computed per element. The modulating factor must
        # be applied to each element's own probability; applying it to the
        # batch-mean BCE would down-weight hard and easy examples equally.
        bce = F.binary_cross_entropy_with_logits(y_pred, target, reduction='none')
        prob = torch.sigmoid(y_pred)
        p_t = prob * target + (1 - prob) * (1 - target)
        loss = bce * (1 - p_t) ** gamma
        if alpha is not None and alpha >= 0:
            alpha_t = alpha * target + (1 - alpha) * (1 - target)
            loss = alpha_t * loss
        return loss.mean()

    if loss_func == 'hinge_loss':
        # NOTE(review): hinge_embedding_loss is documented for targets in {1, -1};
        # confirm the label encoding before relying on this option.
        return F.hinge_embedding_loss(y_pred, target)

    if loss_func == 'mse_loss':
        return F.mse_loss(y_pred, target)

    if loss_func == 'multi_label_soft_margin':
        return F.multilabel_soft_margin_loss(y_pred, target)

    # Default for every other name: binary cross-entropy on logits.
    return F.binary_cross_entropy_with_logits(y_pred, target)

In [None]:
def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one training epoch and return the summed batch loss.

    Each batch is expected to be a dict with ``"ids"``, ``"mask"`` and
    ``"labels"`` entries. The loss is computed by ``loss_fn`` on the model's
    logits using the loss selected by ``get_loss_func()``.

    Args:
        data_loader: Iterable of training batches.
        model: Model to train; called with ``attention_mask`` and ``labels``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: LR scheduler stepped once per batch.

    Returns:
        Sum of the per-batch loss values (not averaged).
    """
    model.train()
    running_loss = 0.0

    for _, batch in tqdm(enumerate(data_loader), total=len(data_loader)):
        input_ids = batch["ids"].to(device, dtype=torch.long)
        attention = batch["mask"].to(device, dtype=torch.long)
        labels = batch["labels"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        torch.cuda.empty_cache()
        outputs = model(input_ids, attention_mask=attention, labels=labels)

        # The loss is taken from loss_fn on the logits, not from the model output.
        batch_loss = loss_fn(
            y_pred=outputs.logits,
            y_true=labels,
            loss_func=get_loss_func(),
        )
        batch_loss.backward()
        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        running_loss += batch_loss.item()

    return running_loss
    

def eval_fn(data_loader, model, device):
    """Evaluate ``model`` on ``data_loader`` without gradient tracking.

    Each batch is expected to be a dict with ``"ids"``, ``"mask"`` and
    ``"labels"`` entries. The loss is computed by ``loss_fn`` on the model's
    logits using the loss selected by ``get_loss_func()``.

    Args:
        data_loader: Iterable of evaluation batches.
        model: Model to evaluate; called with ``attention_mask`` and ``labels``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(eval_loss, fin_outputs, fin_targets)``: the summed (not
        averaged) batch loss, a list of per-sample softmax score tensors, and
        a list of per-sample target tensors.
    """
    eval_loss = 0.0
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"].to(device, dtype=torch.long)
            mask = d["mask"].to(device, dtype=torch.long)
            targets = d["labels"].to(device, dtype=torch.float)

            outputs = model(ids, attention_mask=mask, labels=targets)
            loss = loss_fn(y_pred=outputs.logits, y_true=targets,
                           loss_func=get_loss_func())

            eval_loss += loss.item()

            # extend() on a 2-D tensor appends one row (one sample) at a time.
            fin_targets.extend(targets)
            # Softmax over the label axis. dim=1 is what the deprecated
            # implicit-dim call selected for 2-D logits, so scores are unchanged.
            # NOTE(review): with multi-label targets and a BCE loss, sigmoid may be
            # the intended activation — confirm against log_metrics.
            fin_outputs.extend(torch.nn.functional.softmax(outputs.logits, dim=1))

    return eval_loss, fin_outputs, fin_targets

def test_fn(data_loader, model, device):
    """Evaluate ``model`` on the test ``data_loader``.

    The test pass is the same procedure as validation, so this delegates to
    ``eval_fn`` instead of duplicating its loop.

    Args:
        data_loader: Iterable of test batches (dicts with ``"ids"``,
            ``"mask"`` and ``"labels"``).
        model: Model to evaluate.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(test_loss, fin_outputs, fin_targets)``: the summed (not
        averaged) batch loss, a list of per-sample score tensors, and a list
        of per-sample target tensors.
    """
    test_loss, fin_outputs, fin_targets = eval_fn(data_loader, model, device)
    return test_loss, fin_outputs, fin_targets

In [None]:
# n_labels = 8
def model_info(model_ckpt):
    """Load the base model and tokenizer for a checkpoint and pick the device.

    Args:
        model_ckpt: Hugging Face checkpoint name or path.

    Returns:
        Tuple ``(bert_model, tokenizer, device)`` where ``device`` is CUDA when
        available and CPU otherwise.
    """
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_ckpt,
        do_lower_case=True,
    )
    bert_model = transformers.AutoModel.from_pretrained(model_ckpt)

    if torch.cuda.is_available():
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')

    return bert_model, tokenizer, device

In [None]:
# Build a run timestamp (used in checkpoint file names) and make sure the
# checkpoint directory exists.
import time
from datetime import datetime

current_time = datetime.now()
time_stamp = current_time.timestamp()

# Formatted as DD_MM_YYYY-HH_MM_SS so it is safe to embed in a file name.
date_time = datetime.fromtimestamp(time_stamp)
str_date_time = date_time.strftime("%d_%m_%Y-%H_%M_%S")

# dir = './bert_model_save'
output_dir = './model_save/trend_pred/BERT/'

# Create the output directory if needed. exist_ok avoids the separate
# existence check and the race between checking and creating.
os.makedirs(output_dir, exist_ok=True)

In [None]:
from utils.common_functions import log_metrics

# Per-epoch validation records and per-run test records, filled by trainer().
training_stats = []
test_stats = []
def trainer(config=None):
    """Run one W&B sweep trial: train, validate every epoch, then test.

    Reads hyper-parameters (``tokenizer_max_len``, ``batch_size``,
    ``learning_rate``, ``dropout``, ``epochs``) from ``wandb.config`` and the
    model/loss selection from the module-level ``get_*`` helpers. Appends one
    record per epoch to ``training_stats`` and one record per run to
    ``test_stats``. Whenever the average validation loss improves, the model
    is saved under ``output_dir`` together with the tokenizer.

    Args:
        config: Optional config passed to ``wandb.init``; the sweep agent
            supplies it when this function is used as the sweep target.
    """
    with wandb.init(config=config):
        config = wandb.config
        model_ckpt = get_model_ckpt()
        loss_name = get_loss_func()

        _, tokenizer, device = model_info(model_ckpt)

        train_dataset, valid_dataset, test_dataset = build_dataset(config.tokenizer_max_len, tokenizer)
        train_data_loader, valid_data_loader, test_data_loader = build_dataloader(
            train_dataset, valid_dataset, test_dataset, config.batch_size)

        print("Length of Train Dataloader: ", len(train_data_loader))
        print("Length of Valid Dataloader: ", len(valid_data_loader))

        n_epochs = config.epochs
        # The scheduler is stepped once per batch, so its horizon is
        # batches-per-epoch times the configured number of epochs. A fixed
        # factor of 10 epochs would mis-shape the decay for any other setting.
        n_train_steps = len(train_data_loader) * n_epochs

        model = ret_model(config.dropout, model_ckpt, get_n_labels(), get_Freeze())
        optimizer = ret_optimizer(model)
        scheduler = ret_scheduler(optimizer, n_train_steps)
        model.to(device)
        model = nn.DataParallel(model)
        wandb.watch(model)

        # Fields shared by every stats record of this run.
        run_info = {
            'Batch': config.batch_size,
            'Max Token': config.tokenizer_max_len,
            'Model': model_ckpt,
            'LR': config.learning_rate,
            'Loss func': loss_name,
        }

        # Hub ids such as "org/model" contain "/", which would be read as a
        # sub-directory in the checkpoint file name.
        safe_ckpt_name = model_ckpt.replace('/', '_')
        best_model_path = os.path.join(
            output_dir, f"best_{get_type()}_{safe_ckpt_name}-{str(str_date_time)}.pt")

        best_val_loss = float('inf')
        for epoch in tqdm(range(n_epochs)):
            train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
            eval_loss, preds, labels = eval_fn(valid_data_loader, model, device)

            scores = log_metrics(preds, labels)

            # train_fn/eval_fn return summed batch losses; average per batch.
            avg_train_loss = train_loss / len(train_data_loader)
            avg_val_loss = eval_loss / len(valid_data_loader)

            wandb.log({
                "epoch": epoch + 1,
                "train_loss": avg_train_loss,
                "val_loss": avg_val_loss,
                "auc_score": scores['auc'],
            })

            print("AUC score: ", scores['auc'])
            print("Average Train loss: ", avg_train_loss)
            print("Average Valid loss: ", avg_val_loss)
            print("Valid F1: ", scores['f1'])
            print("Valid Recall: ", scores['recall'])
            print("Valid Precision: ", scores['precision'])
            print("Valid Acc: ", scores['acc'])

            training_stats.append({
                **run_info,
                'Epoch': epoch + 1,
                'Train Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. AUC': scores['auc'],
                'F1 score': scores['f1'],
                'Recall': scores['recall'],
                'Precision': scores['precision'],
                'Accuracy': scores['acc'],
            })

            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save(model, best_model_path)
                tokenizer.save_pretrained(output_dir)
                print("Model saved as current val_loss is: ", best_val_loss)

        # NOTE(review): the test pass uses the weights from the final epoch,
        # not the best checkpoint saved above.
        test_loss, preds, labels = test_fn(test_data_loader, model, device)
        scores = log_metrics(preds, labels)
        avg_test_loss = test_loss / len(test_data_loader)
        print("Test loss: ", avg_test_loss)
        print("AUC: ", scores['auc'])
        print("Test F1: ", scores['f1'])
        print("Test Recall: ", scores['recall'])
        print("Test Precision: ", scores['precision'])
        print("Test Acc: ", scores['acc'])
        test_stats.append({
            **run_info,
            'Test. Loss': avg_test_loss,
            'Test. AUC': scores['auc'],
            'F1 score': scores['f1'],
            'Recall': scores['recall'],
            'Precision': scores['precision'],
            'Accuracy': scores['acc'],
        })

In [None]:
def get_project():
    """Return the W&B project name used for the sweep."""
    project_name = 'trend-prediction'
    return project_name

In [None]:
# W&B sweep definition: a grid search over the hyper-parameters read by trainer().
sweep_config = {
    'method': 'grid', # W&B search strategies: grid, random, bayes
    # Objective reported to W&B; 'auc_score' is the key trainer() logs each epoch.
    'metric': {
      'name': 'auc_score',
      'goal': 'maximize'   
    },
    'parameters': {

        'learning_rate': {
            'values': [2e-05]
        },
        'batch_size': {
            'values': [16]
        },
        # Single fixed value (not swept).
        'epochs':{'value': 3},
        # Forwarded to ret_model as do_prob, which currently does not use it.
        'dropout':{
            'values': [0.3]
        },
        'tokenizer_max_len': {'values': [30]},
    }
}

# Register the sweep with W&B; the returned id is what wandb.agent runs.
sweep_id = wandb.sweep(sweep_config, project=get_project())

In [None]:
def get_Freeze():
    """Return whether the encoder should be frozen during fine-tuning."""
    freeze_encoder = False
    return freeze_encoder

def get_model_ckpt():
    """Return the Hugging Face checkpoint to fine-tune.

    Swap the active value for one of the alternatives listed below to try a
    different model.
    """
    # Alternatives:
    #   "bsingh/roberta_goEmotion"
    #   'distilbert-base-uncased'
    #   'bert-base-uncased'
    #   'cardiffnlp/twitter-roberta-base-emotion-multilabel-latest'
    #   'roberta-large'
    checkpoint = 'bert-large-uncased'
    return checkpoint

def get_loss_func():
    """Return the name of the loss that ``loss_fn`` should apply.

    Swap the active value for one of the alternatives listed below to try a
    different loss.
    """
    # Alternatives:
    #   'hinge_loss'
    #   'focal_loss'
    #   'cross_entropy'
    #   'mse_loss'
    #   'multi_label_soft_margin'
    selected_loss = 'bce_loss'
    return selected_loss

def get_classes():
    """Return the label column names for the active task.

    Two label sets are defined: the 8 trend classes (active) and the 28
    emotion classes (alternative, enabled by switching the return statement).
    """
    emotions = [
        'admiration', 'amusement', 'anger', 'annoyance',
        'approval', 'caring', 'confusion', 'curiosity',
        'desire', 'disappointment', 'disapproval', 'disgust',
        'embarrassment', 'excitement', 'fear', 'gratitude',
        'grief', 'joy', 'love', 'nervousness',
        'optimism', 'pride', 'realization', 'relief',
        'remorse', 'sadness', 'surprise', 'neutral',
    ]
    trends = [
        "approval",
        "toxic",
        "obscene",
        'insult',
        "threat",
        "hate",
        "offensive",
        "neither",
    ]
    return trends
    # return emotions
def get_n_labels():
    """Return the number of labels, i.e. the length of ``get_classes()``."""
    label_names = get_classes()
    return len(label_names)

def get_type():
    """Return the task tag embedded in checkpoint file names."""
    # Alternative: 'emotion'
    task_type = 'trend'
    return task_type

In [None]:
# Run the sweep: the agent calls trainer() for sweep_id, limited to a single
# run (count=1).
wandb.agent(sweep_id, function=trainer, count=1)

## Print the Training loss, Validation loss and AUC values

In [None]:
import pandas as pd

# Display floats with two decimal places.
# pd.set_option('precision', 2)

# Create a DataFrame from the per-epoch training/validation records
# collected by trainer() in `training_stats`.
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
# df_stats = df_stats.set_index('Epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

In [None]:
# Persist the per-epoch training/validation table as CSV.
# NOTE(review): the file name says "bert_base", but get_model_ckpt() currently
# returns 'bert-large-uncased' — confirm the name matches the run being logged.
df_stats.to_csv('/home/siu856533724/code/source-code/Social-Networks/Trend-Prediction/Main-Implementation/logs/trend_result/train_valid_log_trend_bert_base_non_freeze.csv')

In [None]:
import pandas as pd

# Display floats with two decimal places.
# pd.set_option('precision', 2)

# Create a DataFrame from the test records collected by trainer() in
# `test_stats`. This rebinds `df_stats`, which previously held the
# training/validation table.
df_stats = pd.DataFrame(data=test_stats)

# Use the 'epoch' as the row index.
# df_stats = df_stats.set_index('Epoch')

# A hack to force the column headers to wrap.
#df = df.style.set_table_styles([dict(selector="th",props=[('max-width', '70px')])])

# Display the table.
df_stats

In [None]:
# Persist the test-results table as CSV.
# NOTE(review): the file name says "bert_base", but get_model_ckpt() currently
# returns 'bert-large-uncased' — confirm the name matches the run being logged.
df_stats.to_csv('/home/siu856533724/code/source-code/Social-Networks/Trend-Prediction/Main-Implementation/logs/trend_result/test_log_trend_bert_base_non_freeze.csv')

## Test the BERT model on unseen data

In [None]:
# from utils.bert_classifier import get_model_tokenizer, Tokenize, Classification
# # Test sentiment prediction
# test_text = "I love you"
# model, tokenizer = get_model_tokenizer(pred_type='trend_pred', mode_pt_file='best_trend_bert-base-uncased-11_12_2023-14_25_42.pt', 
#                                        model_ckpt='bert-base-uncased', labels=8, is_freez=1, gpu='cuda')
# dict = Tokenize(text=test_text, max_length=30, tokenizer=tokenizer)
# scores = Classification(dict, model, gpu='cuda')
# print(f"Predicted Outputs: {scores}")