In [1]:
import json

import numpy as np
import pandas as pd
import torch
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from tqdm import tqdm, trange
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

import LeetMining as lm

In [2]:
# Read the comment test set from disk and print a structural summary
# (row count, columns, dtypes, memory usage).
test = pd.read_csv(filepath_or_buffer="test_yt.csv")
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57645 entries, 0 to 57644
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    57645 non-null  object
dtypes: object(1)
memory usage: 450.5+ KB


# LeetSpeak processing

In [None]:
# Load the existing leetspeak dictionary that is later handed to lm.getBestMatch.
# `json` comes from the notebook's import cell. The encoding is explicit because
# JSON files are UTF-8 and the platform default encoding may differ.
# leet_fp is deliberately not pre-initialised to {}: if the file is missing the
# cell fails and leaves no empty dictionary behind for later cells to use silently.
with open('youtube_leetDict.json', encoding='utf-8') as json_file:
    leet_fp = json.load(json_file)

# Show only the number of entries; echoing the whole object floods the output.
len(leet_fp)

# Sentiment Analysis Using RoBERTa

In [4]:
# Set up the pre-trained RoBERTa sentiment checkpoint:
# https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest
# Plain string literal: the original f-string had no placeholders.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
def polarity_scores_roberta(example, max_length=512):
    """Score one text with the module-level RoBERTa sentiment model.

    Parameters
    ----------
    example : str
        Text to classify.
    max_length : int, optional
        Maximum number of tokens passed to the model; longer inputs are
        truncated. Defaults to 512, the usual limit for RoBERTa-base
        checkpoints (assumption -- confirm against the model config).

    Returns
    -------
    dict
        ``{'roberta_neg': p0, 'roberta_neu': p1, 'roberta_pos': p2}`` where
        ``p0..p2`` are the softmax of the model's three output logits, taken
        in index order 0, 1, 2.
    """
    # Truncate explicitly: without it an over-long comment raises inside the
    # model's forward pass and aborts the whole multi-hour scoring loop.
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        output = model(**encoded_text)
    # output[0] holds the logits; [0] selects the single example in the batch.
    scores = softmax(output[0][0].numpy())
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

# Smoke test on an obviously positive sentence.
print(polarity_scores_roberta("I am happy"))

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'roberta_neg': 0.0077943704, 'roberta_neu': 0.030709133, 'roberta_pos': 0.96149653}


In [5]:
# For every comment: detect leetspeak tokens, substitute each one that has a
# best match in the leet dictionary, then run RoBERTa on the *substituted* text.
# The original text, detected tokens, chosen matches, rewritten text and the
# three sentiment scores are collected column-wise and turned into a DataFrame.
test2_result = {'text':[], 'leetWords':[], 'BestMatches':[], 'NewText':[],'Neg':[], 'Neu':[], 'Pos':[]}

# Iterate over the values directly instead of re-indexing the column per row.
for text in tqdm(test['text'].values):
    leetWords = lm.getLeetWordList(text)
    newText = text
    bestMatches = []

    for lword in leetWords:
        bestMatch = lm.getBestMatch(lword, leet_fp)
        # Recorded even when None so BestMatches stays aligned with leetWords.
        bestMatches.append(bestMatch)
        if bestMatch is not None:
            newText = lm.replaceLeet(newText, lword, bestMatch)

    # No-op when newText is already a str; presumably guards against
    # lm.replaceLeet returning a sequence of fragments -- TODO confirm.
    newText = "".join(newText)

    polarity_scores = polarity_scores_roberta(newText)

    test2_result['text'].append(text)
    test2_result['leetWords'].append(leetWords)
    # bestMatches is already [] when no leet words were found, so no branch is needed.
    test2_result['BestMatches'].append(bestMatches)
    test2_result['NewText'].append(newText)
    test2_result['Neg'].append(polarity_scores['roberta_neg'])
    test2_result['Neu'].append(polarity_scores['roberta_neu'])
    test2_result['Pos'].append(polarity_scores['roberta_pos'])

test2_result_df = pd.DataFrame(test2_result)
test2_result_df.head()



  0%|          | 0/57645 [00:00<?, ?it/s]

100%|██████████| 57645/57645 [4:13:23<00:00,  3.79it/s]   


Unnamed: 0,text,leetWords,BestMatches,NewText,Neg,Neu,Pos
0,Ok I hate to tell everyone but 90s quality is ...,"[90s, Blu-ray, 4K, Blu-ray]","[was, blurry, AK, blurry]",Ok I hate to tell everyone but was quality is ...,0.233162,0.331397,0.435441
1,Actually the recent windows update made my pc ...,[],[],Actually the recent windows update made my pc ...,0.01498,0.076991,0.908029
2,Tom I really hope you enjoyed your stay in War...,[],[],Tom I really hope you enjoyed your stay in War...,0.002816,0.012929,0.984254
3,"OR, more likely, they hired a professional who...",[],[],"OR, more likely, they hired a professional who...",0.330295,0.606575,0.06313
4,I am confused because of Linus's tshirt.,[],[],I am confused because of Linus's tshirt.,0.527813,0.462156,0.010031


In [7]:
# Show only the rows in which at least one leetspeak token was detected.
has_leet = test2_result_df['leetWords'].str.len() > 0
test2_result_df[has_leet]

Unnamed: 0,text,leetWords,BestMatches,NewText,Neg,Neu,Pos
0,Ok I hate to tell everyone but 90s quality is ...,"[90s, Blu-ray, 4K, Blu-ray]","[was, blurry, AK, blurry]",Ok I hate to tell everyone but was quality is ...,0.233162,0.331397,0.435441
13,That's a smooth way to end the video. Respect++,[Respect++],[RespectED],That's a smooth way to end the video. RespectED,0.019335,0.123817,0.856849
20,"I think i'm going Amish :P, but really, once t...","[:P, PS:]","[EP, PSI]","I think i'm going Amish EP but really, once th...",0.355647,0.572476,0.071877
23,With wanting the progress bar being a little b...,"[bis/b, in-game]","[bison, endgame]",With wanting the progress bar being a little b...,0.723536,0.261255,0.015209
28,Tom Scott : <b>*publishes video with Poland in...,"[b*publishes, title*/b]","[publishes, titleAOb]",Tom Scott : <b>*publishes video with Poland in...,0.317874,0.618814,0.063311
...,...,...,...,...,...,...,...
57623,budget-priced Riley today,[budget-priced],[budgetIpriced],budgetIpriced Riley today,0.130954,0.781844,0.087202
57627,"""<a href=""""https://www.youtube.com/watch?v=G02...","[com/watch, v=G025oxyWv0E, b*conused, screamin...","[comOwatch, vIGAAAoxyWvAE, bAconused, screamin...","""<a comOwatch the audio isn’t the real audio t...",0.730442,0.255147,0.014412
57638,"""Isn't that what we all want? <a href=""""https:...","[com/watch, v=fa28lIGuxq8]","[comOwatch, vIfaAAlIGuxqA]","""Isn't that what we all want? <a comOwatch",0.095881,0.649858,0.254261
57641,"""<a href=""""https://www.youtube.com/watch?v=G02...","[com/watch, v=G025oxyWv0E, Unlocked:]","[comOwatch, vIGAAAoxyWvAE, unlocked]","""<a comOwatch New Achievement unlocked Dieget...",0.008643,0.758368,0.232990


In [None]:
# Write the scored comments to disk; the row index is not written.
test2_result_df.to_csv(path_or_buf="test2_result_yt.csv", index=False)