In [None]:
import numpy as np
import re
import pandas as pd  

def clean_tweet(tweet):
    """Normalise a raw tweet into lowercase ASCII words for sentiment scoring.

    Steps, in order: lowercase; drop apostrophes; remove @mentions, #hashtags
    and URLs; remove bracketed ``[...]`` spans; remove the standalone retweet
    marker ``rt``; replace every remaining character outside ``a-z0-9`` with
    a space; collapse whitespace.

    Parameters
    ----------
    tweet : str or missing value
        Raw tweet text. Anything that is not a ``str`` (e.g. a NaN float
        coming out of pandas, or ``None``) is treated as missing.

    Returns
    -------
    str
        The cleaned tweet, or ``""`` if the input is missing or fewer than
        three words remain after cleaning.
    """
    # Missing values are not text; treat them all alike instead of only
    # recognising np.float64 and crashing on plain float NaN / None.
    if not isinstance(tweet, str):
        return ""

    temp = tweet.lower()
    temp = re.sub("'", "", temp)
    temp = re.sub("@[A-Za-z0-9_]+", "", temp)
    temp = re.sub("#[A-Za-z0-9_]+", "", temp)
    temp = re.sub(r'http\S+', '', temp)
    temp = re.sub('[()!?]', ' ', temp)
    temp = re.sub(r'\[.*?\]', ' ', temp)
    # Only strip "rt" when it is a whole word; a plain 'rt ' pattern also
    # eats the tail of words such as "start " or "support ".
    temp = re.sub(r'\brt\b', ' ', temp)
    temp = re.sub("[^a-z0-9]", " ", temp)

    words = temp.split()
    if len(words) < 3:
        return ""
    return " ".join(words)

In [None]:
import csv
import os
import urllib.request

import numpy as np
import torch
from scipy.special import softmax
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification

# Load the pre-trained sentiment model, its tokenizer and its label names.

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping: tab-separated "<index>\t<label>" lines.
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    mapping_lines = f.read().decode('utf-8').split("\n")
# Rows with fewer than two fields (e.g. the trailing empty line) are skipped.
labels = [row[1] for row in csv.reader(mapping_lines, delimiter='\t') if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# save_pretrained(MODEL) creates a local directory with the same name as the
# hub id, and from_pretrained(MODEL) resolves to that directory once it
# exists. Save the tokenizer alongside the weights so a later run can still
# load both from it.
model.save_pretrained(MODEL)
tokenizer.save_pretrained(MODEL)


In [None]:
# Tweet files to process. Add or remove entries here; the scoring loop reads
# each file line by line and writes a CSV with the same base name.
all_files = [
    "arsenal.txt",
    "aston_vila.txt",
    "chelsea.txt",
    "everton.txt",
    "leeds.txt",
    "leicester.txt",
    "liverpool.txt",
    "man_city.txt",
    "man_united.txt",
    "newcastle.txt",
    "palace.txt",
    "spurs.txt",
    "westham.txt",
    "wolves.txt",
]

In [None]:
# Score every tweet of every input file and write one CSV of results per file.

RESULT_COLUMNS = ['tweet', 'processed_tweet', 'positive', 'negative', 'neutral']
MAX_TOKENS = 512  # explicit cap so truncation does not depend on the tokenizer config


def score_sentiment(text):
    """Return ``{label: probability}`` for one cleaned tweet.

    The text is tokenized (truncated to ``MAX_TOKENS`` tokens), run through
    ``model`` and the logits of the single sequence are turned into
    probabilities with softmax, rounded to 4 decimals. Keys are the entries
    of ``labels`` in mapping order.
    """
    encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=MAX_TOKENS)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_input)
    scores = softmax(output[0][0].numpy())
    return {label: np.round(float(score), 4) for label, score in zip(labels, scores)}


for filename in all_files:
    # splitext keeps dots that belong to directories or the base name intact.
    stem = os.path.splitext(filename)[0]

    # NOTE(review): the file is opened with the platform default encoding —
    # confirm the tweet files match it, or pass encoding= explicitly.
    with open(stem + ".txt") as f:
        lines = f.readlines()

    # One record per usable tweet, so the columns can never get out of step.
    records = []
    for line in lines:
        text = clean_tweet(line)
        if not text:
            # Blank lines and tweets with fewer than 3 words clean to "".
            continue
        scores = score_sentiment(text)
        records.append({
            'tweet': line,
            'processed_tweet': text,
            'positive': scores['positive'],
            'negative': scores['negative'],
            'neutral': scores['neutral'],
        })

    # Passing columns= keeps the CSV header stable even when no tweet survived cleaning.
    df = pd.DataFrame(records, columns=RESULT_COLUMNS)

    # saving the dataframe
    df.to_csv(stem + '.csv')
