In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the tweet dataset from a CSV path relative to the working directory.
df = pd.read_csv('data/tweets.csv')
# Show the first 15 rows as the cell output.
df.head(15)

In [None]:
# Rename the columns to the short names used by the rest of the notebook.
# `DataFrame.set_axis(..., inplace=True)` no longer exists in pandas 2.0+, so
# the labels are assigned directly; this works on every pandas version and
# gives the same result when the cell is re-run.
df.columns = ['tweet', 'directed', 'emotion']
df.head()

In [None]:
#TweetEval preprocess function

def preprocess(text):
    """Mask user mentions and links in ``text``.

    The text is split on single spaces. A token that starts with ``@`` and is
    longer than one character becomes ``@user``; any other token that starts
    with ``http`` becomes ``http``. The tokens are re-joined with single
    spaces, so runs of spaces in the input are preserved.
    """
    def _mask(token):
        # Mentions are checked first; a bare "@" is left untouched.
        if len(token) > 1 and token.startswith('@'):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    return " ".join(_mask(token) for token in text.split(" "))

In [None]:
!pip install transformers

In [None]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Task name; it is interpolated into both the model identifier below and the
# label-mapping URL used later in the notebook.
task='sentiment'
# Model identifier passed to `from_pretrained`.
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

# Tokenizer belonging to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [None]:
import torch

In [None]:
# Report the installed PyTorch version.
print(torch.__version__)

In [None]:
# download label mapping
# The file is read as tab-separated text and the second field of each row is
# kept as the label name (presumably rows look like "<id>\t<label>" — confirm
# against the downloaded file).
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    # Despite its name, `html` holds the decoded response split into lines.
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
# `html` is already fully in memory, so the reader can be consumed after the
# response is closed. Rows with fewer than two fields (such as a trailing
# empty line) are skipped.
labels = [row[1] for row in csvreader if len(row) > 1]

In [None]:
# PT
# Load the sequence-classification model for MODEL.
# NOTE(review): from_tf=True presumably makes transformers load TensorFlow
# weights, which would require TensorFlow to be installed — confirm it is needed.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, from_tf=True)

# Run one example sentence through preprocessing, tokenization and the model.
text = "Good night 😊"
text = preprocess(text)
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
# First output element, first row — presumably the logits for the single input text.
scores = output[0][0].detach().numpy()
# Normalize the raw scores with softmax.
scores = softmax(scores)

In [None]:
# Indices of `scores` ordered from highest to lowest score.
ranking = np.argsort(scores)
ranking = ranking[::-1]
# Print every label with its score (rounded to 4 decimals), best first.
for i in range(scores.shape[0]):
    l = labels[ranking[i]]
    s = scores[ranking[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

In [None]:
def clean_roberta(df):
    """Add a ``clean_tweet`` column with mentions and links masked.

    Each value of ``df['tweet']`` is converted with ``str`` and split on
    whitespace. Tokens longer than one character that start with ``@``, and
    tokens that start with ``.@``, become ``@user``; other tokens that start
    with ``http`` become ``http``. The tokens are joined with single spaces.

    The column is written onto ``df`` itself and that same object is returned.
    """
    def _mask(token):
        # A bare "@" is not a mention and is left as-is.
        if len(token) > 1 and token.startswith(('@', '.@')):
            return '@user'
        if token.startswith('http'):
            return 'http'
        return token

    df['clean_tweet'] = [
        " ".join(_mask(token) for token in str(tweet).split())
        for tweet in df['tweet']
    ]
    return df

In [None]:
# Add the `clean_tweet` column to `df`.
# NOTE(review): this rebinds the name `clean_roberta` from the function to the
# DataFrame it returns (the same object as `df`), so re-running this cell
# fails until the cell defining the function has been run again.
clean_roberta = clean_roberta(df)

In [None]:
# Preview the first rows of the frame.
clean_roberta.head()

In [None]:
#encoded_input_df = clean_roberta['clean_tweet'].apply(tokenizer.tokenize)

In [None]:
#encoded_input_df.head()

In [None]:
#output = model(**encoded_input_df)
#scores = output[0][0].detach().numpy()
#scores = softmax(scores)

In [None]:
# Pick one cleaned tweet (index label 5) to try the model on, and display it.
test = clean_roberta['clean_tweet'][5]
test

In [None]:
#testing model

# Tokenize the sample tweet, requesting PyTorch tensors ('pt'), and display the encoding.
encoded_input_test = tokenizer(test, return_tensors='pt')
encoded_input_test

In [None]:
#testing model
# Forward pass on the encoded sample; the raw model output is displayed.
output_test = model(**encoded_input_test)
output_test

In [None]:
#testing model
# First output element, first row (presumably the logits for the single input), as a NumPy array.
scores_test = output_test[0][0].detach().numpy()
# Normalize the raw scores with softmax and display them.
scores_test = softmax(scores_test)
scores_test

In [None]:
#testing model
# Indices of `scores_test` ordered from highest to lowest score.
ranking_test = np.argsort(scores_test)
ranking_test = ranking_test[::-1]
# Print every label with its score (rounded to 4 decimals), best first.
for i in range(scores_test.shape[0]):
    l = labels[ranking_test[i]]
    s = scores_test[ranking_test[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")

In [None]:
# End-to-end check on the cleaned tweet with index label 0:
# tokenize, run the model, normalize the scores, then print the labels best-first.
test2 = clean_roberta['clean_tweet'][0]
encoded_input_test2 = tokenizer(test2, return_tensors='pt')
output_test2 = model(**encoded_input_test2)
# First output element, first row (presumably the logits for the single input).
scores_test2 = output_test2[0][0].detach().numpy()
scores_test2 = softmax(scores_test2)
# Indices ordered from highest to lowest score.
ranking_test2 = np.argsort(scores_test2)
ranking_test2 = ranking_test2[::-1]
for i in range(scores_test2.shape[0]):
    l = labels[ranking_test2[i]]
    s = scores_test2[ranking_test2[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")
# Show the tweet the scores belong to.
print(test2)

In [None]:
# End-to-end check on the cleaned tweet with index label 5:
# tokenize, run the model, normalize the scores, then print the labels best-first.
test3 = clean_roberta['clean_tweet'][5]
encoded_input_test3 = tokenizer(test3, return_tensors='pt')
output_test3 = model(**encoded_input_test3)
# First output element, first row (presumably the logits for the single input).
scores_test3 = output_test3[0][0].detach().numpy()
scores_test3 = softmax(scores_test3)
# Indices ordered from highest to lowest score.
ranking_test3 = np.argsort(scores_test3)
ranking_test3 = ranking_test3[::-1]
for i in range(scores_test3.shape[0]):
    l = labels[ranking_test3[i]]
    s = scores_test3[ranking_test3[i]]
    print(f"{i+1}) {l} {np.round(float(s), 4)}")
# Show the tweet the scores belong to.
print(test3)

In [None]:
# Reduce the scores for `test3` to a single class index (0, 1 or 2).
# `scores_test3` was already passed through softmax in the cell that computed
# it, so it is not normalized again here: a second softmax would flatten the
# distribution and alter `scores_test3` a little more on every re-run.
neg = scores_test3[0]
neut = scores_test3[1]
pos = scores_test3[2]
score_list = [neg, neut, pos]

# Index of the largest of the three scores; on ties the lowest index wins.
score = int(np.argmax(score_list))
print(test3)
print(score)


In [None]:
# Preview the first rows of the frame.
clean_roberta.head()

In [None]:
def model_roberta(df):
    """Classify every tweet in ``df`` and store the class index in ``df['score']``.

    Each text is tokenized with the notebook-level ``tokenizer`` and passed
    through the notebook-level ``model``; the index of the highest-scoring
    class for that text is recorded.

    The ``clean_tweet`` column is used when it exists, so the model sees the
    masked text; otherwise the raw ``tweet`` column is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``clean_tweet`` and/or ``tweet`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with a new (or overwritten) ``score`` column
        holding one integer class index per row.
    """
    text_column = 'clean_tweet' if 'clean_tweet' in df.columns else 'tweet'

    sentiment = []
    for tweet in df[text_column]:
        encoded_input = tokenizer(str(tweet), return_tensors='pt')
        output = model(**encoded_input)
        # Scores of *this* tweet: first output element, first row.
        scores = output[0][0].detach().numpy()
        # softmax is monotonic, so the argmax of the raw scores is the same
        # class the normalized scores would select.
        sentiment.append(int(np.argmax(scores)))

    # Write onto the frame that was passed in rather than a notebook global.
    df['score'] = sentiment

    return df