In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
from tqdm import tqdm, trange

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
orig_df = pd.read_json('tweet_reply.json', lines=True)
orig_df = orig_df.sample(frac=1).reset_index(drop=True) # shuffle records
orig_df.info()
orig_df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170255 entries, 0 to 170254
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   keyword      170255 non-null  object
 1   main_tweet   170255 non-null  object
 2   main_likes   170255 non-null  int64 
 3   reply        170255 non-null  object
 4   reply_likes  170255 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 6.5+ MB


Unnamed: 0,keyword,main_tweet,main_likes,reply,reply_likes
0,Ukraine,Is Ukraine eating well? That's where all the m...,0,You add to the problems in this country with e...,0
1,Macdonald,Ridiculous Macdonald India. You got excess of ...,0,"bestie, your mcdelivery didn't have enough rid...",1
2,TikTok,I saw the hi c orange lava burst box on tiktok...,0,ok but can y’all bring the snack wraps back,6
3,COVID-19,The bottom 2% of Canadians say they are hurtin...,0,You are not listening to the right new sources...,0
4,Ukraine,Loving that map on the right... Can't wait to ...,0,you’re leaving out a lot of context broski,3


In [3]:
train, test = train_test_split(orig_df, test_size=0.3)
test.reset_index(drop=True)
train.reset_index(drop=True)
print("Test Dataset:")
test.info()
print("\nTraining Dataset:")
train.info()
#test["reply"].values.size

#test["keyword"].values[0] -> 'weather'

Test Dataset:
<class 'pandas.core.frame.DataFrame'>
Index: 51077 entries, 22010 to 47893
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   keyword      51077 non-null  object
 1   main_tweet   51077 non-null  object
 2   main_likes   51077 non-null  int64 
 3   reply        51077 non-null  object
 4   reply_likes  51077 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 2.3+ MB

Training Dataset:
<class 'pandas.core.frame.DataFrame'>
Index: 119178 entries, 23046 to 65658
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   keyword      119178 non-null  object
 1   main_tweet   119178 non-null  object
 2   main_likes   119178 non-null  int64 
 3   reply        119178 non-null  object
 4   reply_likes  119178 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 5.5+ MB


In [4]:
#setup pre-trained roberta model from https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest 
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Run for Roberta Model
def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

print(polarity_scores_roberta("I am happy"))

{'roberta_neg': 0.0077943704, 'roberta_neu': 0.030709133, 'roberta_pos': 0.96149653}


In [6]:
# Run Roberta on test data to get analysis prior leet substitution
test1_result = {'keyword':[], 'reply':[], 'Neg':[], 'Neu':[], 'Pos':[]}

for i in trange(len(test['reply'].values)):
    test1_result['keyword'].append(test['keyword'].values[i])
    test1_result['reply'].append(test['reply'].values[i])

    polarity_scores = polarity_scores_roberta(test['reply'].values[i])
    test1_result['Neg'].append(polarity_scores['roberta_neg'])
    test1_result['Neu'].append(polarity_scores['roberta_neu'])
    test1_result['Pos'].append(polarity_scores['roberta_pos'])

test1_result_df = pd.DataFrame(test1_result)
test1_result_df.head()

100%|██████████| 51077/51077 [1:57:32<00:00,  7.24it/s]      


Unnamed: 0,keyword,reply,Neg,Neu,Pos
0,Dogecoin,Hodling like there is no tomorrow!,0.061756,0.309112,0.629132
1,Vaccine,As they tell us to not worry. 👀,0.078209,0.717959,0.203832
2,World Cup,I love this,0.009743,0.039927,0.95033
3,COVID-19,"I hope so, but i have no need of immediate inf...",0.015798,0.124445,0.859756
4,COVID-19,Nuremberg doctors' trial was an important mile...,0.005117,0.10639,0.888493


In [7]:
# test.to_csv('test.csv', index=False)
# train.to_csv('train.csv', index=False)
# test1_result_df.to_csv('test1_result.csv', index=False)