## Imports

In [3]:
# Mount Google Drive at /content/drive so files stored there can be read
# from this Colab runtime (the tweet CSV is loaded from that path later on).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install scipy
!pip install transformers
!pip install datetime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 4.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 44.7 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 

In [5]:
import math
import pandas as pd
# Model and tokenizer loaders; the PyTorch class (AutoModelForSequenceClassification) is the one used below
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
# softmax for multi-classification scores
from scipy.special import softmax
import csv
import urllib.request
import datetime
import matplotlib.pyplot as plt
%matplotlib inline

## Get Model Info

In [6]:
# Two pretrained checkpoints from the Hugging Face hub are used in this notebook:
#   * model 2 - emotions: joy, anger, optimism, sadness
#     https://huggingface.co/cardiffnlp/twitter-roberta-base-emotion
#   * model 6 - sentiment: positive, negative, neutral
#     https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest
MODEL2 = "cardiffnlp/twitter-roberta-base-emotion"
MODEL6 = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# Emotion checkpoint: the tokenizer converts tweet text into model inputs and
# the sequence-classification model (PyTorch class) scores them.
tokenizer2 = AutoTokenizer.from_pretrained(MODEL2)
model2 = AutoModelForSequenceClassification.from_pretrained(MODEL2)

# Sentiment checkpoint: same pairing of tokenizer and classification model.
tokenizer6 = AutoTokenizer.from_pretrained(MODEL6)
model6 = AutoModelForSequenceClassification.from_pretrained(MODEL6)

Downloading:   0%|          | 0.00/768 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/929 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [7]:
# Adapted from the Hugging Face model cards: download the label names that
# correspond to each model's output positions.
def _load_labels(mapping_url, timeout=30):
    """Download a tab-separated mapping file and return its label names.

    Parameters
    ----------
    mapping_url : str
        URL of the mapping file. Every row with at least two tab-separated
        fields contributes its second field; shorter rows (e.g. a trailing
        blank line) are skipped.
    timeout : float, default 30
        Seconds to wait on the network before ``urlopen`` gives up, so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    list of str
        Label names in the order they appear in the file.
    """
    with urllib.request.urlopen(mapping_url, timeout=timeout) as response:
        lines = response.read().decode('utf-8').splitlines()
    return [row[1] for row in csv.reader(lines, delimiter='\t') if len(row) > 1]


# Labels for model 2, i.e. joy, anger, optimism, sadness
mapping_link2 = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/emotion/mapping.txt"
labels2 = _load_labels(mapping_link2)

# Labels for model 6, i.e. positive, negative, neutral
mapping_link6 = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt"
labels6 = _load_labels(mapping_link6)

## Define Functions

In [24]:
# Adapted from the Hugging Face model card - masks user mentions as '@user' and
# links as 'http' in every tweet of df["Text"] and returns the result as
# preprocessed_tweets, a list of strings (one preprocessed tweet per row)
def preprocess(df):
    """Mask user mentions and links in every tweet of ``df["Text"]``.

    Each entry is converted with ``str`` and split on single spaces. Every
    space-separated token is then split on line breaks and each piece is
    masked on its own, so a link or mention that directly follows a line
    break is masked without throwing away the text in front of the break
    (previously the whole token collapsed to ``'http'``).

    Masking rules, applied per piece:
      * starts with ``@`` and is longer than one character -> ``'@user'``
      * starts with ``http``                                -> ``'http'``
      * anything else is kept unchanged

    Parameters
    ----------
    df : pandas.DataFrame or mapping
        Must provide an iterable of tweets under the key ``"Text"``.

    Returns
    -------
    list of str
        One preprocessed tweet per input entry, in the same order. Spaces and
        line breaks of the original text are preserved.
    """
    def mask(piece):
        # A lone '@' is not a mention, hence the length check.
        if piece.startswith('@') and len(piece) > 1:
            return '@user'
        if piece.startswith('http'):
            return 'http'
        return piece

    preprocessed_tweets = []
    for tweet in df["Text"]:
        masked_tokens = [
            "\n".join(mask(piece) for piece in token.split("\n"))
            for token in str(tweet).split(" ")
        ]
        preprocessed_tweets.append(" ".join(masked_tokens))

    return preprocessed_tweets

# Running model here
def method2_processing(df, prep_df, append=False):
    """Score every preprocessed tweet with the emotion model (model 2).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the tweets came from. Only used when ``append`` is true, and
        never modified: the scores are added to a copy.
    prep_df : iterable of str
        Preprocessed tweet texts. When ``append`` is true there must be one
        text per row of ``df``, in row order.
    append : bool, default False
        If true, return a copy of ``df`` with the columns ``joy``,
        ``optimism``, ``anger`` and ``sadness`` added; otherwise return the
        raw score dictionary.

    Returns
    -------
    dict or pandas.DataFrame
        ``{"anger": [...], "sadness": [...], "optimism": [...], "joy": [...]}``
        with one score per tweet (rounded to 4 decimal places), or the
        augmented copy of ``df``. Empty input yields empty lists/columns.
    """
    results = {"anger": [], "sadness": [], "optimism": [], "joy": []}

    # Run the model on each tweet and turn its logits into class probabilities.
    for text in prep_df:
        encoded_input = tokenizer2(text, return_tensors='pt')
        output = model2(**encoded_input)
        scores = softmax(output[0][0].detach().numpy())

        # labels2[i] names the class whose probability is scores[i]; the order
        # in which the labels are visited does not affect the stored values.
        for label, score in zip(labels2, scores):
            # Round scores to 4 decimal places
            results[label].append(np.round(float(score), 4))

    if append:
        # Copy first so the caller's frame is left untouched.
        dofi = df.copy()
        dofi["joy"] = results["joy"]
        dofi["optimism"] = results["optimism"]
        dofi["anger"] = results["anger"]
        dofi["sadness"] = results["sadness"]
        return dofi

    return results

# Same process as above but different labels
def method6_processing(df, prep_df, append=False):
    """Score every preprocessed tweet with the sentiment model (model 6).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the tweets came from. Only used when ``append`` is true, and
        never modified: the scores are added to a copy.
    prep_df : iterable of str
        Preprocessed tweet texts. When ``append`` is true there must be one
        text per row of ``df``, in row order.
    append : bool, default False
        If true, return a copy of ``df`` with the columns ``positive``,
        ``neutral`` and ``negative`` added; otherwise return the raw score
        dictionary.

    Returns
    -------
    dict or pandas.DataFrame
        ``{"positive": [...], "neutral": [...], "negative": [...]}`` with one
        score per tweet (rounded to 4 decimal places), or the augmented copy
        of ``df``. Empty input yields empty lists/columns.
    """
    results = {"positive": [], "neutral": [], "negative": []}

    # Run the model on each tweet and turn its logits into class probabilities.
    for text in prep_df:
        encoded_input = tokenizer6(text, return_tensors='pt')
        output = model6(**encoded_input)
        scores = softmax(output[0][0].detach().numpy())

        # labels6[i] names the class whose probability is scores[i]; the order
        # in which the labels are visited does not affect the stored values.
        for label, score in zip(labels6, scores):
            results[label].append(np.round(float(score), 4))

    if append:
        # Copy first so the caller's frame is left untouched.
        dofi = df.copy()
        dofi["positive"] = results["positive"]
        dofi["neutral"] = results["neutral"]
        dofi["negative"] = results["negative"]
        return dofi

    return results

# Runs the full pipeline: drops duplicate rows, preprocesses the text, then scores it with models 2 and 6
def tweet_RoBERTa_light_processing(raw_df):
    """De-duplicate, preprocess and score a frame of tweets with both models.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Must contain a ``"Text"`` column (tweet text) and a ``"Datetime"``
        column parseable by ``pd.to_datetime``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated rows with the emotion columns (from
        ``method2_processing``), the sentiment columns (from
        ``method6_processing``), an ``only_date`` column holding the calendar
        date of ``"Datetime"``, and ``"Text"`` replaced by the preprocessed
        text.

    Raises
    ------
    Whatever ``pd.to_datetime`` raises for an unparseable ``"Datetime"``
    value. This now happens before any model inference is run.
    """
    # Independent copy, so the column assignments below never write into raw_df.
    # NOTE(review): duplicates are judged on every column, so a frame carrying a
    # unique per-row column (such as a saved index) never loses rows here -
    # confirm that is intended.
    drop_df = raw_df.drop_duplicates().copy()

    # Parse the dates up front: a malformed "Datetime" value then fails
    # immediately instead of after inference over every tweet.
    dates = pd.to_datetime(drop_df["Datetime"]).dt.date

    prep_df = preprocess(drop_df)

    # m2 is the frame with the emotion scores appended
    m2 = method2_processing(drop_df, prep_df, True)
    # m6 adds the sentiment scores on top of the m2 output
    m6 = method6_processing(m2, prep_df, True)

    # Calendar date of each tweet
    m6["only_date"] = dates
    # Replace the original tweet with the preprocessed tweet
    m6["Text"] = prep_df

    return m6

## Get Data

In [20]:
# Original dataset of tweets, read from the mounted Google Drive.
TWEETS_CSV = "/content/drive/MyDrive/data/Tweets by minority group/Roberta_Muslim.csv"
# read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrap is needed.
tweets_df = pd.read_csv(TWEETS_CSV)
tweets_df

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,Like Count
0,0,2022-06-09 18:56:00+00:00,1534972597259194368,Fury at plans to turn Iconic 'Peaky Blinders' ...,Daily_Express,2.0
1,3,2022-06-09 13:30:04+00:00,1534890576197320709,The Lady Of Heaven producer blasts Muslim prot...,DailyMailUK,9.0
2,5,2022-06-09 03:00:02+00:00,1534732022752727041,Sajid Javid warns cancelled screenings of 'bla...,DailyMailUK,23.0
3,7,2022-06-08 20:40:12+00:00,1534636435285884928,Fury over plans to turn historic 'Peaky Blinde...,DailyMailUK,8.0
4,11,2022-06-08 17:19:07+00:00,1534585826906148868,#MsMarvel Rish Shah pays sweet tribute to lead...,Daily_Express,1.0
...,...,...,...,...,...,...
12545,16379,2008-07-28 09:49:53+00:00,870405708,One third of British Muslim students say it&#0...,MailOnline,0.0
12546,16380,2008-07-27 17:49:29+00:00,869872053,Killing in the name of Islam is acceptable... ...,MailOnline,0.0
12547,16381,2008-07-25 01:19:34+00:00,867639069,"Obama addresses 200,000 in Berlin as he calls ...",MailOnline,0.0
12548,16382,2008-07-23 12:59:30+00:00,866074288,"Disabled boy, 12, held under Terrorism Act aft...",MailOnline,0.0


## Process Your Data

In [25]:
# Score the loaded tweets with both models, then preview the first five rows
processed_df = tweet_RoBERTa_light_processing(tweets_df)
processed_df.head()

ParserError: ignored

In [18]:
# Number of rows in the processed frame
processed_df.shape[0]

608

In [19]:
# Write the processed frame to a CSV file in the current working directory.
# NOTE(review): the frame's index is written as an unnamed first column (pandas'
# default), which shows up as "Unnamed: 0" when the file is read back without
# index_col - confirm whether that column is wanted.
processed_df.to_csv(path_or_buf='muslim_roberta_processed.csv')