In [1]:
import copy
import json
import os
import random
import re
import time

import emoji
import numpy as np
import pandas as pd
import torch
from hatesonar import Sonar # This is the hate speech detection library; it is based on bert
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import Dataset, DataLoader
from transformers import pipeline

In [2]:
def extract_json_object(filename):
    """
    Load a JSON array of records from a file and keep only the fields used downstream.
    @param    filename (str): path of a UTF-8 encoded file containing a JSON array of objects.
    @return   json_data (list): one dict per record with the keys 'id', 'labels' and 'text';
              'labels' is an empty list for records that carry no 'labels' key.
    """
    # Read from the path that was passed in rather than from a module-level variable.
    with open(filename, "r", encoding='utf-8') as read_file:
        json_array = json.load(read_file)

    return [
        {
            "id": item['id'],
            # Unlabelled records simply omit the 'labels' key.
            "labels": item.get('labels', []),
            "text": item['text'],
        }
        for item in json_array
    ]


def text_preprocessing(text):
    """
    Light-weight clean-up of a raw text.
    - Turn the HTML entity '&amp;' back into '&'
    - Collapse every run of whitespace into a single space and trim both ends
    Entity mentions, URLs, emoji and digits are left untouched.
    @param    text (str): a string to be processed.
    @return   text (str): the processed string.
    """
    # Undo the HTML escaping of the ampersand
    unescaped = text.replace('&amp;', '&')

    # Squash whitespace runs (spaces, tabs, newlines) into one space
    collapsed = re.sub(r'\s+', ' ', unescaped)

    # Drop the leading/trailing space that may remain
    return collapsed.strip()


def hate_speech_classifier(df, Class, hate, offensive, neither):
    """
    Run every entry of df['text'] through the module-level `sonar` detector and
    append the results to the lists supplied by the caller (nothing is returned).
    @param    df: object whose 'text' entry is an iterable of strings.
    @param    Class (list): receives the second value of each response.
    @param    hate (list): receives the second value of the first per-class entry.
    @param    offensive (list): receives the second value of the second per-class entry.
    @param    neither (list): receives the second value of the third per-class entry.
    """
    for sentence in df['text']:
        # The response is read by position: value 1 is the predicted class,
        # value 2 is the list of per-class entries.
        fields = list(sonar.ping(text=sentence).values())
        per_class = fields[2]

        Class.append(fields[1])
        hate.append(list(per_class[0].values())[1])
        offensive.append(list(per_class[1].values())[1])
        neither.append(list(per_class[2].values())[1])


def sentiment_classifier(df):
    """
    Add a 'sentiment_class' column holding the label that the module-level
    `sentimentanalyzer` assigns to each entry of df['text'].
    @param    df (pd.DataFrame): frame with a 'text' column; it is modified in place.
    @return   df (pd.DataFrame): the same frame, now with the 'sentiment_class' column.
    """
    # Only the first result of each call is used, and of that only its 'label'.
    labels = [sentimentanalyzer(text)[0]['label'] for text in df['text']]

    # Assign the whole column at once: this aligns by position, so it also works
    # when the frame does not have a default 0..n-1 index.
    df['sentiment_class'] = labels
    return df

In [3]:
# Paths of the raw JSON files, in the same order as the split names below
lst_file_path = [
    "../data/training_data_task3.txt",
    "../data/validation_data_task3.txt",
]

# Split names; they are the keys of dic_datasets
dataset = ["training", "validation"]

# One independent, empty placeholder frame per split.
# (dict.fromkeys(dataset, pd.DataFrame()) would hand the *same* frame object to every key.)
dic_datasets = {name: pd.DataFrame() for name in dataset}


In [4]:
# Load every split into its own DataFrame, keyed by the split name at the same position.
for i, file in enumerate(lst_file_path):
    print('loading'+ file)
    # Hand the path of the current file to the loader (not the loader function itself).
    dic_datasets[dataset[i]] = pd.DataFrame(extract_json_object(file))

loading../data/training_data_task3.txt


FileNotFoundError: [Errno 2] No such file or directory: '../data/training_data_task3.txt'

In [None]:
def _with_default_label(frame):
    """Keep the id/labels/text columns and replace empty label lists by ['no_class']."""
    subset = pd.DataFrame(frame[['id','labels','text']])
    subset.labels = subset.labels.apply(lambda y: ['no_class'] if len(y)==0 else y)
    return subset


# Same preparation for both splits
df_training = _with_default_label(dic_datasets['training'])
df_val = _with_default_label(dic_datasets['validation'])


In [None]:
# Create auxiliary features (hate speech indicator and sentiment)

In [None]:
# Create the Sonar hate speech detector.
# hate_speech_classifier reads it through the module-level name `sonar`.
sonar = Sonar()

In [None]:
# Fresh accumulators; hate_speech_classifier fills them in place
hate_speech_class, hate, offensive, neither = [], [], [], []

hate_speech_classifier(df_training, hate_speech_class, hate, offensive, neither)

# Only the predicted class is stored on the frame; the three per-class
# score lists (hate / offensive / neither) are collected but not added.
df_training["hate_speech_class"] = hate_speech_class

# Class distribution of the training split
df_training["hate_speech_class"].value_counts()

In [None]:
# Fresh accumulators; hate_speech_classifier fills them in place
hate_speech_class, hate, offensive, neither = [], [], [], []

hate_speech_classifier(df_val, hate_speech_class, hate, offensive, neither)

# Only the predicted class is stored on the frame; the three per-class
# score lists (hate / offensive / neither) are collected but not added.
df_val["hate_speech_class"] = hate_speech_class

# Class distribution of the validation split
df_val["hate_speech_class"].value_counts()

In [None]:
# Sentiment analysis using DistilBERT

In [None]:
# Build a transformers "sentiment-analysis" pipeline; no model is named, so the library default is used.
# sentiment_classifier reads it through the module-level name `sentimentanalyzer`.
sentimentanalyzer = pipeline("sentiment-analysis")



In [None]:
# NOTE: this cell repeats the training-split hate speech classification of an earlier cell.
# The pass over every text is expensive, so it only runs when the column is not there yet.
if "hate_speech_class" not in df_training.columns:
    hate_speech_class = []
    hate = []
    offensive = []
    neither = []

    # Function calling
    hate_speech_classifier(df_training, hate_speech_class, hate, offensive, neither)

    # Only the predicted class is stored; the per-class score lists are not added
    df_training["hate_speech_class"] = hate_speech_class

df_training.hate_speech_class.value_counts()

In [None]:
# NOTE: this cell repeats the validation-split hate speech classification of an earlier cell.
# The pass over every text is expensive, so it only runs when the column is not there yet.
if "hate_speech_class" not in df_val.columns:
    hate_speech_class = []
    hate = []
    offensive = []
    neither = []

    hate_speech_classifier(df_val, hate_speech_class, hate, offensive, neither)

    # Only the predicted class is stored; the per-class score lists are not added
    df_val["hate_speech_class"] = hate_speech_class

df_val.hate_speech_class.value_counts()

In [None]:
# Sentiment analysis using DistilBERT

In [None]:
# NOTE(review): re-creates the sentiment pipeline; the identical assignment already appears in an earlier cell.
sentimentanalyzer = pipeline("sentiment-analysis")



In [None]:
# Generate sentiment: sentiment_classifier adds a 'sentiment_class' column to each split
df_training = sentiment_classifier(df_training)
df_val = sentiment_classifier(df_val)





In [None]:
# One-hot encode the auxiliary variables and the multi-label targets.
#
# Both splits are stacked and encoded together so that they always end up with
# the same dummy/label columns. The outer index level ("training"/"validation")
# keeps the rows of the two splits distinct: both frames can carry the same row
# labels, and joining the label matrix on a non-unique index would duplicate
# rows and pair training text with validation labels.
n_training = len(df_training)
df_all = pd.concat([df_training, df_val], keys=["training", "validation"])
df_all = pd.get_dummies(df_all, columns =["hate_speech_class","sentiment_class"])

# One-hot encode the multi-labels (dependent variable)
mlb = MultiLabelBinarizer(sparse_output=True)

df_concat_labels = df_all.pop('labels')
concat_labels = df_all.join(pd.DataFrame.sparse.from_spmatrix(
                    mlb.fit_transform(df_concat_labels),
                    index=df_concat_labels.index,
                    columns=mlb.classes_))

# Split back by position, drop the helper index level, and move the original
# row labels into an 'index' column.
df_training = concat_labels.iloc[:n_training].droplevel(0).reset_index()
df_val = concat_labels.iloc[n_training:].droplevel(0).reset_index()

In [None]:
# Display the training frame for a visual check
df_training

In [None]:
class CustomDataset(Dataset):
    """
    Torch dataset over a frame with a 'text' column, one-hot encoded hate speech
    and sentiment columns, and one binary column per target label.

    Each item is a dict of tensors with the keys 'ids', 'mask', 'token_type_ids',
    'targets', 'hate' and 'sentiment'.
    @param    dataframe (pd.DataFrame): frame holding the text, auxiliary and target columns.
    @param    tokenizer: object with an `encode_plus` method returning 'input_ids',
              'attention_mask' and 'token_type_ids'.
    @param    max_len (int): length every encoded text is padded/truncated to.
    """

    # One-hot columns of the hate speech classes
    HATE_COLUMNS = ['hate_speech_class_hate_speech','hate_speech_class_neither', 'hate_speech_class_offensive_language']

    # One-hot columns of the sentiment classes
    SENTIMENT_COLUMNS = ['sentiment_class_NEGATIVE','sentiment_class_POSITIVE']

    # Binary target columns, in the order in which they appear in the 'targets' tensor
    TARGET_COLUMNS = [
        'Appeal to authority',
        'Appeal to fear/prejudice',
        'Black-and-white Fallacy/Dictatorship',
        'Causal Oversimplification',
        'Doubt',
        'Exaggeration/Minimisation',
        'Flag-waving',
        'Glittering generalities (Virtue)',
        'Loaded Language',
        'Misrepresentation of Someone\'s Position (Straw Man)',
        'Name calling/Labeling',
        'Obfuscation, Intentional vagueness, Confusion',
        'Presenting Irrelevant Data (Red Herring)',
        'Reductio ad hitlerum',
        'Repetition',
        'Slogans',
        'Smears',
        'Thought-terminating cliché',
        'Whataboutism',
        'Bandwagon',
        'Transfer',
        'Appeal to (Strong) Emotions'
    ]

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.text

        self.hate = dataframe[self.HATE_COLUMNS].to_numpy()
        self.sentiment = dataframe[self.SENTIMENT_COLUMNS].to_numpy()
        self.targets = self.data[self.TARGET_COLUMNS].to_numpy()
        self.max_len = max_len

    def __len__(self):
        """Number of texts in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """
        Encode the text at position `index` and bundle it with its features and targets.
        @param    index (int): position of the row (not its index label).
        @return   (dict): tensors under 'ids', 'mask', 'token_type_ids', 'targets', 'hate', 'sentiment'.
        """
        # Positional access, consistent with the numpy arrays indexed below
        comment_text = str(self.comment_text.iloc[index])
        # Normalise whitespace before tokenising
        comment_text = " ".join(comment_text.split())

        # Use the tokenizer given to the constructor, not a module-level one.
        # padding="max_length" already pads to max_length, so the deprecated
        # pad_to_max_length flag is not passed.
        inputs = self.tokenizer.encode_plus(
                    comment_text,
                    text_pair = None,
                    add_special_tokens = True,
                    max_length=self.max_len,
                    padding = "max_length",
                    return_token_type_ids=True,
                    truncation=True
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float),
            'hate': torch.tensor(self.hate[index], dtype=torch.long),
            'sentiment': torch.tensor(self.sentiment[index], dtype=torch.long)
        }

In [None]:
#helper functions

def create_data_loader_kfold(df_kf, df_val, trn_ids, tst_ids):
    """
    Build the data loaders of one cross-validation fold plus the validation loader.
    Relies on the module-level names CustomDataset, tokenizer, MAX_LEN,
    TRAIN_BATCH_SIZE and VALID_BATCH_SIZE.
    @param    df_kf: data the fold is drawn from.
    @param    df_val: held-out validation data.
    @param    trn_ids: positions in df_kf that form the training part of the fold.
    @param    tst_ids: positions in df_kf that form the test part of the fold.
    @return   (tuple): (kfold_trainloader, kfold_testloader, valloader).
    """
    print('Original Train Dataset: ' + str(len(df_kf)))

    cust_Dataset_train = CustomDataset(df_kf, tokenizer, MAX_LEN)
    cust_Dataset_val = CustomDataset(df_val, tokenizer, MAX_LEN)

    # Sample elements randomly from a given list of ids, no replacement.
    kfold_train_subsampler = torch.utils.data.SubsetRandomSampler(trn_ids)
    kfold_test_subsampler = torch.utils.data.SubsetRandomSampler(tst_ids)

    # Both fold loaders read the same dataset; the samplers decide which rows each one sees.
    # The training loader uses the training batch size, as create_data_loader does.
    kfold_trainloader = DataLoader(
                      cust_Dataset_train,
                      batch_size=TRAIN_BATCH_SIZE, sampler=kfold_train_subsampler)
    kfold_testloader = DataLoader(
                      cust_Dataset_train,
                      batch_size=VALID_BATCH_SIZE, sampler=kfold_test_subsampler)

    valloader = DataLoader(cust_Dataset_val,
                           batch_size=VALID_BATCH_SIZE,
                           shuffle=True,
                           num_workers=0)

    # Report the real number of samples; (number of batches * batch size)
    # over-counts whenever the last batch is not full.
    print("KFOLD CROSSVALIDATION TRAIN Dataset: {}".format(len(trn_ids)))
    print("KFOLD CROSSVALIDATION TEST Dataset: {}".format(len(tst_ids)))
    print("VALIDATION Dataset: {}".format(len(cust_Dataset_val)))

    return kfold_trainloader, kfold_testloader, valloader
     

def create_data_loader(train_dataset,test_dataset):
    """
    Wrap a train and a test frame into shuffled data loaders.
    Relies on the module-level names CustomDataset, tokenizer, MAX_LEN,
    TRAIN_BATCH_SIZE and VALID_BATCH_SIZE.
    @param    train_dataset: training data (must expose `.shape` and `len()`).
    @param    test_dataset: test data (must expose `.shape`).
    @return   (tuple): (training_loader, testing_loader, number of training rows).
    """
    for caption, frame in (("TRAIN Dataset: {}", train_dataset),
                           ("TEST Dataset: {}", test_dataset)):
        print(caption.format(frame.shape))

    wrapped_train = CustomDataset(train_dataset, tokenizer, MAX_LEN)
    wrapped_test = CustomDataset(test_dataset, tokenizer, MAX_LEN)
    n_train_rows = len(train_dataset)

    # Both loaders shuffle and load in the main process (num_workers=0)
    training_loader = DataLoader(wrapped_train,
                                 batch_size=TRAIN_BATCH_SIZE,
                                 shuffle=True,
                                 num_workers=0)
    testing_loader = DataLoader(wrapped_test,
                                batch_size=VALID_BATCH_SIZE,
                                shuffle=True,
                                num_workers=0)

    return training_loader, testing_loader, n_train_rows


In [None]:
def validation(model, testing_loader):
    """
    Run the model over every batch of a loader without tracking gradients and
    collect targets and model outputs per sample.
    Relies on the module-level name `device`.
    @param    model: callable as model(ids, mask, None, hate, sent) returning a pair
              (outputs, model_output_object); must provide eval().
    @param    testing_loader: iterable of batches, each a dict with the keys
              'ids', 'mask', 'hate', 'sentiment' and 'targets'.
    @return   (tuple): (true_labels, true_bools, pred_labels, model_output_object) where
              true_labels / pred_labels hold one numpy row per sample, true_bools is
              true_labels compared against 1, and model_output_object is the second
              model output of the last batch (None when the loader is empty).
    """
    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Track variables
    true_labels, pred_labels = [], []
    # Stays None when the loader yields no batch at all
    model_output_object = None

    # Predict
    for data in testing_loader:
        # Prepare data to feed into model
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        hate = data['hate'].to(device, dtype = torch.long)
        sent = data['sentiment'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        with torch.no_grad():
            # Forward pass; None is passed in the third position, so the
            # batch's token_type_ids are not used.
            outputs, model_output_object = model(ids, mask, None, hate, sent)

        true_labels.append(targets.to('cpu').numpy())
        pred_labels.append(outputs.detach().to('cpu').numpy())

    # Flatten the per-batch arrays into per-sample rows
    pred_labels = [item for sublist in pred_labels for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]

    # Convert the flattened binary values of ALL samples to booleans
    # (not only those of the last batch)
    true_bools = [tl==1 for tl in true_labels]

    return true_labels, true_bools, pred_labels, model_output_object