In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import files
from tqdm import tqdm, trange
from collections import Counter
from itertools import chain 
import regex as re
import seaborn as sns
import io

from transformers import XLNetModel, XLNetTokenizer, XLNetForSequenceClassification
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from transformers import AdamW

import torch
from torch import Tensor
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.autograd import Variable
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss
import torch.nn.functional as F
import torchvision.transforms as transforms

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score, recall_score, precision_score, multilabel_confusion_matrix

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize


# Fetch the NLTK resources used by the stopword list and the tokenizers.
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource)

# Shared stemmer instance.
ps = PorterStemmer()

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

# Show which device will be used. get_device_name() raises on CPU-only hosts,
# so it is only queried when CUDA was actually selected above.
torch.cuda.get_device_name(0) if device.type == "cuda" else "cpu"

In [None]:
# SPLIT DATASET USING SKLEARN TRAIN_TEST SPLIT

In [None]:
# Load the pre-split CSV files, keyed by split name.
# The 'val' split is stored in eval_revised.csv.
# Original author's note: encoding='cp1252' can be passed to pd.read_csv as an
# extra parameter if required.
raw_dataset = {
    split_name: pd.read_csv(csv_path).reset_index(drop=True)
    for split_name, csv_path in (
        ('train', '../data/revised-dataset/train_revised.csv'),
        ('test', '../data/revised-dataset/test_revised.csv'),
        ('val', '../data/revised-dataset/eval_revised.csv'),
    )
}

# Names of the label columns, in the order they appear in the label tensor.
LABELS = ['Age', 'Gender', 'Physical', 'Race', 'Religion', 'Others']

class MLTHSDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes every row once, at construction time.

    Args:
        data: pandas DataFrame with a "Text" column and one column per entry
            in ``labels``.
        tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face
            tokenizer).
        labels: ordered list of label column names.
        max_token_len: length every sequence is padded / truncated to.

    Each item is a dict with:
        'input_ids':      flattened token-id tensor from the tokenizer
        'attention_mask': flattened attention-mask tensor from the tokenizer
        'labels':         float32 tensor with one value per label column
    """

    def __init__(self, data, tokenizer, labels, max_token_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.labels = labels
        self.max_token_len = max_token_len
        # Encode eagerly so __getitem__ is a plain list lookup.
        self.encoded_dataset = self.encode_dataset()

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the pre-encoded example at position ``index``."""
        return self.encoded_dataset[index]

    def encode_dataset(self):
        """Encode every row of ``self.data`` and return the results as a list."""
        # iterrows() is a generator with no length, so pass the total
        # explicitly to give the progress bar a known size.
        rows = tqdm(self.data.iterrows(), total=len(self.data))
        return [self.encode_data(row) for _, row in rows]

    def encode_data(self, data):
        """Tokenize one row and pair it with its label vector.

        Args:
            data: mapping-like row providing "Text" and every label column.

        Returns:
            dict with 'input_ids', 'attention_mask' and 'labels' tensors.
        """
        text = data["Text"]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            # Honour the constructor argument instead of a hard-coded 128.
            max_length=self.max_token_len,
            return_token_type_ids=False,
            return_attention_mask=True
        )
        label_values = [data[label] for label in self.labels]

        return {
            # flatten() drops the leading batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label_values, dtype=torch.float32)
        }

class MLTHSDataLoader:
    """Builds the train / val / test datasets and hands out DataLoaders for them.

    Args:
        dataset: mapping with 'train', 'val' and 'test' entries, each passed
            to ``MLTHSDataset`` as its data.
        labels: label column names forwarded to ``MLTHSDataset``.
        tokenizer: tokenizer forwarded to ``MLTHSDataset``.
        batch_size: batch size used by every loader.
    """

    def __init__(self, dataset, labels, tokenizer, batch_size=8):
        # Same construction order as before: train, then val, then test.
        for split in ('train', 'val', 'test'):
            setattr(self, split + '_dataset', MLTHSDataset(dataset[split], tokenizer, labels))
        self.batch_size = batch_size

    def _make_loader(self, split_dataset, shuffle):
        """Wrap ``split_dataset`` in a single-process DataLoader."""
        return DataLoader(
            split_dataset,
            batch_size=self.batch_size,
            num_workers=0,
            shuffle=shuffle,
        )

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return self._make_loader(self.train_dataset, True)

    def val_dataloader(self):
        """In-order loader over the validation split."""
        return self._make_loader(self.val_dataset, False)

    def test_dataloader(self):
        """In-order loader over the test split."""
        return self._make_loader(self.test_dataset, False)

In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertModel, get_cosine_schedule_with_warmup

# XLNet was only released in cased variants ("xlnet-base-cased",
# "xlnet-large-cased"); "xlnet-base-uncased" is not a published checkpoint,
# so from_pretrained() cannot resolve it.
xlnet_model_name = "xlnet-base-cased"

# do_lower_case=True is kept from the original setup: input text is lowercased
# by the tokenizer even though the checkpoint itself is cased.
xlnet_tokenizer = XLNetTokenizer.from_pretrained(xlnet_model_name, do_lower_case=True)

# Names of the label columns, in the order they appear in the label tensor.
LABELS = ['Age', 'Gender', 'Physical', 'Race', 'Religion', 'Others']



In [None]:
def preprocess_text(text_column):
    """Normalize a pandas Series of raw text strings.

    Steps, applied to each string in this order:
      1. lowercase ASCII letters A-Z
      2. drop http:// and https:// links
      3. drop every non-ASCII character (this removes emoji, and also any
         accented or other non-ASCII letters)
      4. drop @username mentions
      5. drop the '#' sign, keeping the hashtag's text
      6. replace each remaining punctuation character with a space

    Step 5 runs before step 6 on purpose: once punctuation has been replaced,
    no '#' is left for the hashtag step to remove.

    Args:
        text_column: pandas Series whose values are all ``str``.

    Returns:
        A new Series with the same index holding the cleaned strings.
        Whitespace is not collapsed or trimmed.
    """

    def _clean(text):
        # ASCII-only lowercasing; non-ASCII characters are removed below anyway.
        text = re.sub(r'[A-Z]', lambda match: match.group(0).lower(), text)
        # Removal of unimportant links
        text = re.sub(r'http[s]?://\S+', '', text)
        # emoji (and any other non-ASCII character)
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        # username
        text = re.sub(r'@\w+', '', text)
        # hashtag: strip the marker before punctuation handling would turn it into a space
        text = re.sub(r'#', '', text)
        # punctuations
        text = re.sub(r'[^\w\s]', ' ', text)
        return text

    # One pass over the column instead of one pass per rule.
    return text_column.apply(_clean)