In [1]:
import json
import pandas as pd

import os
from dotenv import load_dotenv

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np

from scipy.special import softmax
from sklearn.preprocessing import LabelEncoder

import emoji
import torch
from torch.nn import ELU

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load variables from a local .env file (python-dotenv), then read the dataset
# root that the cells below use to build their input/output paths.
# NOTE(review): data_path is None when DATA_PATH is not set - confirm the .env
# file is present before running the processing cells.
load_dotenv()
data_path = os.environ.get('DATA_PATH')

In [3]:
# Checkpoint identifier; `task` selects which fine-tuned variant is loaded.
task = 'sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Class names; the code below reads index 0 as the 'negative' score.
labels = [
    'negative',
    'neutral',
    'positive',
]

In [4]:
# Load the tokenizer, configuration and sequence-classification model for MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# save_pretrained(MODEL) writes into a local directory whose path equals the
# Hub id, and from_pretrained(MODEL) resolves to that directory on later runs.
# Saving only the model leaves the directory without tokenizer files, which
# breaks the tokenizer load above on the next Restart & Run All - so persist
# the tokenizer next to the model as well.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)

In [5]:
def preprocess(arr, max_tokens=150):
    """Normalise raw comment strings before they are tokenised.

    Each text is split on single spaces, cut down to its first ``max_tokens``
    pieces and joined back together, with two substitutions applied per piece:

    * a piece starting with ``@`` (and longer than one character) becomes
      ``@user``;
    * a piece starting with ``http`` becomes ``http``.

    Args:
        arr: Iterable of strings.
        max_tokens: Maximum number of space-separated pieces kept per text
            (non-negative). ``None`` keeps every piece. Defaults to 150, the
            limit that was previously hard-coded.

    Returns:
        list[str]: One normalised string per input, in input order.
    """
    processed = []

    for text in arr:
        new_text = []

        # Slicing keeps at most `max_tokens` pieces (all of them for None).
        for t in text.split(" ")[:max_tokens]:
            t = '@user' if t.startswith('@') and len(t) > 1 else t
            t = 'http' if t.startswith('http') else t
            new_text.append(t)

        processed.append(" ".join(new_text))

    return processed

In [6]:
def extractNegativeSentimentValues(arr):
    """Score each text with the sentiment model and return the class-0 probability.

    The texts are first normalised with ``preprocess`` and then run one at a
    time through the module-level ``tokenizer`` and ``model``. The logits are
    turned into probabilities with ``softmax`` and the entry at index 0
    (printed under ``labels[0]``) is collected.

    Args:
        arr: Sequence of raw comment strings.

    Returns:
        list: One probability per input text, in input order.
    """
    # Upper bound on sub-word tokens per input. A single whitespace-separated
    # piece can expand into many sub-word tokens, so the cap applied in
    # preprocess does not bound the encoded length; without truncation an
    # over-long input fails inside the model.
    # NOTE(review): 512 is the usual RoBERTa-base limit - confirm for MODEL.
    max_length = 512

    processed = preprocess(arr)
    out = []

    # Inference only: no gradients are needed, so skip autograd bookkeeping.
    with torch.no_grad():
        for text in processed:
            inputs = tokenizer(
                text,
                return_tensors='pt',
                truncation=True,
                max_length=max_length,
            )
            outputs = model(**inputs)
            scores = outputs[0][0].detach().numpy()
            probs = softmax(scores)
            print(labels[0], probs[0])
            out.append(probs[0])

    return out

In [7]:
# Word list consumed by countBadWords, kept as the NumPy array produced by
# DataFrame.to_numpy() (2-D: one row per CSV row, one column per CSV column).
# NOTE(review): read_csv treats the first line of the file as a header -
# confirm bad-words.csv has one, otherwise the first word is dropped.
bad_words = pd.read_csv('./bad-words.csv').to_numpy()
bad_words

def countBadWords(data, words=None):
    """Add a ``bad_words_count`` column to ``data`` in place.

    For every row, the value is the total number of (substring, case-sensitive)
    occurrences of each listed word in ``str(row['text'])``.

    Args:
        data: DataFrame with a ``text`` column. It is modified in place.
        words: Optional array-like of words to count. Defaults to the
            module-level ``bad_words``. Any shape is accepted; it is flattened.

    Returns:
        None.
    """
    if words is None:
        words = bad_words

    # The word list comes from DataFrame.to_numpy(), i.e. a 2-D array. Iterating
    # it directly yields row arrays whose str() looks like "['word']", which
    # never occurs in a comment, so every count came out as 0. Flatten first.
    flat_words = np.asarray(words, dtype=object).ravel()

    # Skip missing cells (str(nan) == 'nan' would match ordinary text) and
    # empty strings (str.count('') returns len(text) + 1).
    vocabulary = [str(w) for w in flat_words if not pd.isna(w) and str(w) != '']

    texts = [str(text) for text in data['text']]
    data['bad_words_count'] = [
        sum(text.count(word) for word in vocabulary) for text in texts
    ]

In [8]:
def FindAuthorId(authorToId, comment, default):
    """Resolve the author a comment is addressed to via its leading @mention.

    Args:
        authorToId: Mapping of author display name (str) to author id.
        comment: Comment text.
        default: Value returned when the comment does not start with ``@`` or
            no known author name follows the ``@``.

    Returns:
        ``authorToId[name]`` for the longest known author name such that the
        comment starts with ``'@' + name``; otherwise ``default``.
    """
    # startswith is safe on an empty string, unlike comment[0].
    if not comment.startswith('@'):
        return default

    # First whitespace-delimited word after the '@'; used as a cheap pre-filter
    # on the first word of each author name.
    handle = comment.split(' ', 1)[0][1:]
    candidates = [author for author in authorToId if author.split(' ', 1)[0] == handle]

    # Try the longest names first so that "@John Smith ..." resolves to
    # "John Smith" rather than to a shorter author named "John". The sort is
    # stable, so names of equal length keep their mapping order.
    for author in sorted(candidates, key=len, reverse=True):
        if comment.startswith('@' + author):
            return authorToId[author]

    return default

In [9]:
# ELU activation (alpha=0.2) shared by adjustedScoreCalc.
elu = ELU(alpha=0.2)


def adjustedScoreCalc(curScore, parentScore):
    """Shift a comment's score towards the score of the comment it replies to.

    Computes ``curScore + 0.7 * ELU(parentScore - curScore)`` with the
    module-level ``elu`` and returns the result as a plain Python number.

    Args:
        curScore: Score of the current comment (scalar).
        parentScore: Score of the replied-to comment (scalar).

    Returns:
        float: The adjusted score.
    """
    own = torch.tensor(curScore)
    parent = torch.tensor(parentScore)
    gap = parent - own
    adjusted = own + 0.7 * elu(gap)
    return adjusted.item()

In [10]:
def processChannelData(channel_list):
    """Build and save the processed comment table for every channel.

    For each channel the function reads
    ``{data_path}/xlsx/{name}_threads.xlsx``, derives per-comment features and
    writes ``{data_path}/processed/{name}_processed_data.csv``.

    Columns added or rewritten:

    * ``negative_prob`` - output of ``extractNegativeSentimentValues``;
    * ``bad_words_count`` - added in place by ``countBadWords``;
    * ``authorName`` - replaced by integer ids from a per-thread LabelEncoder fit;
    * ``repliedTo`` - id of the author the comment addresses (thread starter
      by default, or the author named in a leading @mention);
    * ``adjusted_sentiment`` - ``negative_prob`` combined, via
      ``adjustedScoreCalc``, with the ``negative_prob`` of the first comment
      in the thread written by the replied-to author. The first row of each
      thread keeps its own ``negative_prob``.

    A thread starts at every row whose ``s.no.`` equals 1 and runs up to the
    next such row; rows before the first thread start are not written out.

    Args:
        channel_list: Iterable of dicts that each provide a ``name`` key.

    Raises:
        ValueError: If a channel sheet has no row with ``s.no. == 1``.
    """
    os.makedirs(f'{data_path}/processed', exist_ok=True)

    for channel in channel_list:

        print("Processing channel: ", channel['name'])

        # Read channel data
        data = pd.read_excel(f'{data_path}/xlsx/{channel["name"]}_threads.xlsx')

        # Extract negative sentiments of individual comments
        probs = extractNegativeSentimentValues(list(data['text'].values.astype(str)))
        data['negative_prob'] = probs

        # Rename and drop some labels
        data = data.rename(columns={"label(CyberBullying,Normal)": "label"})
        data = data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'id'], errors='ignore')

        # Counting bad words (adds the 'bad_words_count' column in place)
        countBadWords(data)

        # Row positions of the thread-opening comments, plus an end sentinel so
        # consecutive pairs delimit one thread each.
        bounds = list(np.flatnonzero((data['s.no.'] == 1).to_numpy()))
        bounds.append(len(data))

        # Creating separate threads (copies, so they can be edited freely)
        threads = [
            data.iloc[start:stop].copy()
            for start, stop in zip(bounds[:-1], bounds[1:])
        ]
        if not threads:
            raise ValueError(
                f"No thread start (s.no. == 1) found for channel {channel['name']!r}"
            )

        le = LabelEncoder()

        for cur_thread in threads:
            # Labelling authors and creating tree structure
            authors = [str(author) for author in cur_thread['authorName'].values]
            cur_thread['authorName'] = le.fit_transform(authors)
            authorToId = dict(zip(authors, cur_thread['authorName']))

            # Replies without a resolvable @mention point at the thread starter.
            default = cur_thread['authorName'].values[0]
            replied_to = []
            for raw_text in cur_thread['text']:
                text = str(raw_text)
                if text.startswith('@'):
                    replied_to.append(FindAuthorId(authorToId, text, default))
                else:
                    replied_to.append(default)
            cur_thread['repliedTo'] = replied_to

            # Calculating adjusted sentiment scores of comments
            neg = cur_thread['negative_prob'].to_numpy()
            author_ids = cur_thread['authorName'].to_numpy()
            reply_ids = cur_thread['repliedTo'].to_numpy()

            # Score of the first comment written by each author in this thread.
            first_score = {}
            for author_id, score in zip(author_ids, neg):
                first_score.setdefault(author_id, score)

            # Work on a copy with the same dtype and assign the whole column at
            # once: element-wise chained assignment (df[col][label] = value) is
            # unreliable and has no effect under pandas Copy-on-Write.
            adjusted = neg.copy()
            # Start at position 1 so the first row of *every* thread is left
            # untouched; comparing the index label with 0 only skipped the very
            # first row of the whole sheet.
            for pos in range(1, len(neg)):
                adjusted[pos] = adjustedScoreCalc(neg[pos], first_score[reply_ids[pos]])
            cur_thread['adjusted_sentiment'] = adjusted

        processed_df = pd.concat(threads)
        processed_df.to_csv(f'{data_path}/processed/{channel["name"]}_processed_data.csv', index=False)

        print("Processed channel: ", channel['name'])

        

        

In [11]:
# Channels to process; `name` is used to build the per-channel file names.
channel_list = [
    {"channelId": "UCsBjURrPoezykLs9EqgamOA", "name": "fireship"},
    {"channelId": "UC8CX0LD98EDXl4UYX1MDCXg", "name": "Valorant"},
    # {"channelId": "UCXIJgqnII2ZOINSWNOGFThA", "name": "FoxNews"},
    # {"channelId": "UCUsN5ZwHx2kILm84-jPDeXw", "name": "ComedyCentral"},
]

channel_list

[{'channelId': 'UCsBjURrPoezykLs9EqgamOA', 'name': 'fireship'},
 {'channelId': 'UC8CX0LD98EDXl4UYX1MDCXg', 'name': 'Valorant'}]

In [12]:
# Run the per-channel pipeline for every configured channel.
processChannelData(channel_list)

Processing channel:  fireship


FileNotFoundError: [Errno 2] No such file or directory: './Dataset/xlsx/fireship_threads.xlsx'

In [None]:
# Merge the per-channel CSVs into one file. DataFrame.append was removed in
# pandas 2.0, so the frames are collected first and concatenated once.
processed_frames = [
    pd.read_csv(f'{data_path}/processed/{channel["name"]}_processed_data.csv')
    for channel in channel_list
]
df = pd.concat(processed_frames)

df.to_csv(f'{data_path}/processed/Final_processed_data.csv', index=False)