In [None]:
!pip3 install transformers
!pip3 install evaluate
!pip3 install torch
!pip3 install protobuf==3.20.3
!pip3 install snscrape

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m136.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 kB[0m [31m428.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
[0mSuccessfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1

[1m[[0m[34;49mnotice[0m[1;3

Installing collected packages: xxhash, responses, datasets, evaluate
[0mSuccessfully installed datasets-2.7.1 evaluate-0.3.0 responses-0.18.0 xxhash-3.1.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.1.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import pandas as pd
import numpy as np
import re

import torch
from sklearn.model_selection import train_test_split

from huggingface_hub import notebook_login
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, TextClassificationPipeline
from datasets import Dataset
import evaluate

from tqdm.notebook import tqdm

import snscrape.modules.twitter as sntwitter

In [None]:
class DataLoader():
    """Reads a labelled tweet CSV into a DataFrame and splits it into train/test parts."""

    def import_tweets(self, path, cols=[0, 5]):
        """Load the label and text columns of a header-less CSV file.

        Parameters
        ----------
        path : string
            Location of the CSV file (read with latin-1 encoding).
        cols : list of int, optional
            Positions of the label and text columns, in that order.
            The default [0, 5] matches the sentiment140 layout.

        Returns
        -------
        tweets_df : pandas dataframe
            Two string columns, "label" and "text". The frame is also kept on
            ``self.tweets_df`` and its row count on ``self.num_tweets``.
        """
        self.tweets_df = pd.read_csv(
            path,
            usecols=cols,
            names=["label", "text"],
            encoding='latin-1',  # sentiment140 specific
        )
        self.num_tweets = self.tweets_df.shape[0]

        # Both columns are forced to str so that labels can be used as dictionary keys
        # later on, regardless of how pandas inferred their dtype.
        self.tweets_df["text"] = self.tweets_df["text"].astype(str)
        self.tweets_df["label"] = self.tweets_df["label"].astype(str)

        return self.tweets_df

    def split_tweets(self, train_size, test_size, random_state=None):
        """Split the previously imported tweets into a train and a test frame.

        Parameters
        ----------
        train_size, test_size : int or float
            Forwarded unchanged to ``train_test_split``.
        random_state : int, optional
            Seed forwarded to ``train_test_split``; pass a fixed value to make the
            split reproducible. The default (None) keeps the split random.

        Returns
        -------
        (tweets_train, tweets_test) : tuple of pandas dataframes
            Also stored on ``self.tweets_train`` / ``self.tweets_test``.
        """
        self.tweets_train, self.tweets_test = train_test_split(
            self.tweets_df,
            train_size=train_size,
            test_size=test_size,
            random_state=random_state,
        )
        return self.tweets_train, self.tweets_test

In [None]:
class TwitterSentimentClassifier():
    """Fine-tunes, saves, reloads and evaluates a transformer tweet-sentiment classifier."""

    def __init__(self, model_name='Twitter/twhin-bert-base', num_labels=2, label2int={"0":0, "4":1}, train_size=9000, test_size=1000):
        """Load the tokenizer and the evaluation metrics.

        Parameters
        ----------
        model_name : string, optional
            Checkpoint name passed to ``from_pretrained`` for tokenizer and model.
        num_labels : int, optional
            Number of output classes of the classification head.
        label2int : dict, optional
            Maps the string labels found in the CSV files to integer class ids.
        train_size, test_size : int, optional
            Sizes handed to the train/test split in ``import_tweets``.
        """
        self.model_name = model_name
        self.num_labels = num_labels

        self.train_size = train_size
        self.test_size = test_size

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # Copy the mapping so the shared default dict is never aliased by an instance.
        self.label_to_int = dict(label2int)

        self.metric_accuracy = evaluate.load("accuracy")
        self.metric_precision = evaluate.load("precision")
        self.metric_recall = evaluate.load("recall")

    def tokenize_function(self, batch):
        """Tokenize a batch of texts and convert its string labels to class ids.

        Raises KeyError for any label that is missing from ``self.label_to_int``.
        """
        tokenized_batch = self.tokenizer(batch["text"], padding='max_length', truncation=True, max_length=140)
        tokenized_batch["label"] = [self.label_to_int[label] for label in batch["label"]]
        return tokenized_batch

    def import_tweets(self, path, cols=(0, 5)):
        """Read a CSV file, split it into train/test and tokenize both parts.

        Parameters
        ----------
        path : string
            CSV file to load.
        cols : sequence of int, optional
            Positions of the label and text columns (default: columns 0 and 5).
        """
        dl = DataLoader()
        dl.import_tweets(path, cols=list(cols))
        self.train, self.test = dl.split_tweets(self.train_size, self.test_size)

        self.train = Dataset.from_pandas(self.train)
        self.test = Dataset.from_pandas(self.test)

        self.train_dataset = self.train.map(self.tokenize_function, batched=True)
        self.test_dataset = self.test.map(self.tokenize_function, batched=True)

    def get_metrics(self, predictions, labels):
        """Return accuracy, precision, recall and F1 for predicted vs. reference class ids."""
        accuracy = self.metric_accuracy.compute(predictions=predictions, references=labels)["accuracy"]
        precision = self.metric_precision.compute(predictions=predictions, references=labels)["precision"]
        recall = self.metric_recall.compute(predictions=predictions, references=labels)["recall"]

        # F1 is undefined when precision and recall are both zero; report 0.0
        # instead of failing with a ZeroDivisionError.
        denominator = precision + recall
        f1_score = 2 * (precision * recall) / denominator if denominator else 0.0

        return {"accuracy":accuracy,
                "precision":precision,
                "recall":recall,
                "f1": f1_score}

    def compute_metrics(self, eval_pred):
        """Metric callback for the Trainer: takes a (logits, labels) pair."""
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)

        # get_metrics is a method; the bare name would raise NameError.
        return self.get_metrics(predictions, labels)

    def train_model(self):
        """Fine-tune a fresh model on the imported tweets, then move it to the CPU.

        ``import_tweets`` must have been called first so that the train and
        test datasets exist.
        """
        self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, num_labels=self.num_labels)
        self.training_args = TrainingArguments(
            report_to="wandb",
            #report_to="none",
            output_dir="training_arguments",
            save_strategy="no", # limited storage on SCC
            #save_steps=10000,
            evaluation_strategy="epoch",
            per_device_train_batch_size=32,
            per_device_eval_batch_size=32,
            learning_rate=3e-5,
            weight_decay=0.01,
            num_train_epochs=3,
            #warmup_ratio=0.1,
        )

        self.trainer = Trainer(
            model=self.model,
            args=self.training_args,
            train_dataset=self.train_dataset,
            eval_dataset=self.test_dataset,
            compute_metrics=self.compute_metrics,
        )

        self.trainer.train()

        # Operate on this instance, not on a notebook-level global variable.
        self.model = self.model.to("cpu")

    def save_model_local(self, path):
        """Save the trained model to ``path`` through the Trainer."""
        self.trainer.save_model(path)

    def save_model_cloud(self):
        """Placeholder for uploading the model; always raises NotImplementedError."""
        raise NotImplementedError("not currently implemented")
        #self.trainer.push_to_hub("ivan-nikitovic/twitter-sentiment-analyzer")

    def load_saved_model(self, path):
        """Load a saved model from ``path`` onto the CPU and build the inference pipeline."""
        self.model = AutoModelForSequenceClassification.from_pretrained(path, num_labels=self.num_labels)
        self.model = self.model.to("cpu")
        self.pipe = TextClassificationPipeline(model=self.model, tokenizer=self.tokenizer)

    def get_scores(self, text):
        """Run the inference pipeline on ``text`` and return its raw output."""
        return self.pipe(text)

    def predict(self, text):
        """Return a list of integer class ids (0 or 1), one per pipeline result."""
        label2id = {"LABEL_0":0, "LABEL_1":1}
        scores = self.get_scores(text)

        predicted = [label2id[sample["label"]] for sample in scores]

        return predicted

    def evaluate_model(self, path, cols=[0, 5]):
        """Score the loaded model on every row of a labelled CSV file.

        Parameters
        ----------
        path : string
            CSV file to evaluate on.
        cols : list of int, optional
            Positions of the label and text columns. The default is [0, 5].

        Returns
        -------
        dict with the keys "accuracy", "precision", "recall" and "f1".
        """
        dl = DataLoader()
        dl.import_tweets(path, cols=cols)

        self.eval = Dataset.from_pandas(dl.tweets_df)
        self.eval_dataset = self.eval.map(self.tokenize_function, batched=True)

        # Use this instance's pipeline and data rather than the global `ts`.
        predictions = np.array(self.predict(self.eval_dataset["text"]))
        labels = np.array(self.eval_dataset["label"])

        return self.get_metrics(predictions, labels)

In [None]:
# Build the classifier with its constructor defaults (checkpoint, label mapping, split sizes).
ts = TwitterSentimentClassifier()

In [None]:
dataFile = 'sentiment140.csv'

# import_tweets() takes only the file path; the loader's default column
# selection [0, 5] already picks the (label, text) columns of sentiment140.
ts.import_tweets(dataFile)



  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Fine-tune on the tweets imported above; evaluation is configured to run once per epoch.
ts.train_model()

Some weights of the model checkpoint at Twitter/twhin-bert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Twitter/twhin-bert-ba



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.3158,0.316543,0.86428,0.835128,0.907314,0.869725
2,0.2565,0.314273,0.8703,0.873338,0.865818,0.869562


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: __index_level_0__, text. If __index_level_0__, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 50000
  Batch size = 128


Training completed. Do not forget to share your model on huggingface.co/models =)




In [None]:
# Write the fine-tuned model to a local directory through the Trainer.
ts.save_model_local("twitter-sentiment-500k-2e")

Saving model checkpoint to twitter-sentiment-500k-2e
Configuration saved in twitter-sentiment-500k-2e/config.json
Model weights saved in twitter-sentiment-500k-2e/pytorch_model.bin


In [None]:
# Reload the saved model on the CPU and build the pipeline used by predict() / evaluate_model().
ts.load_saved_model("twitter-sentiment-500k-2e")

In [None]:
# on manual test set from sentiment140 (2009)
ts.evaluate_model("sentiment140no_neutral.csv")



  0%|          | 0/1 [00:00<?, ?ba/s]

{'accuracy': 0.8356545961002786,
 'precision': 0.8059701492537313,
 'recall': 0.8901098901098901,
 'f1': 0.845953002610966}

In [None]:
# sentiment-eval dataset (~2017)
ts.evaluate_model("sentiment_2016_no_neutral.csv", cols=[0, 1])



  0%|          | 0/22 [00:00<?, ?ba/s]

{'accuracy': 0.8132368337202421,
 'precision': 0.8892655367231639,
 'recall': 0.8459155285786545,
 'f1': 0.8670490262791225}

In [None]:
class TwitterSentimentScraper():
    """Collects tweets through snscrape's Twitter search scraper into pandas dataframes."""

    # Column layout shared by every dataframe this class returns.
    _COLUMNS = ['date_time', 'tweet_id', 'platform', 'username', 'num_likes', 'content']

    def __init__(self, result=None):
        # Last dataframe produced by get_tweets_from_user / get_tweets_from_key.
        self.result = result

    def get_tweets(self, query, min_likes=0, num_tweets=None):
        """
        Parameters
        ----------
        query : string
            Query handed to the snscrape Twitter search scraper.
        min_likes : int, optional
            Minimum number of likes of tweets (inclusive). The default is 0,
            which keeps every tweet.
        num_tweets : int, optional
            Number of tweets to return. The default is to return all.

        Returns
        -------
        tweets_df : pandas dataframe
            dataframe containing tweets with columns ['date_time', 'tweet_id', 'platform', 'username', 'num_likes', 'content'].

        """
        tweets = []

        for tweet in sntwitter.TwitterSearchScraper(query).get_items():
            # "Minimum" is inclusive: a tweet with exactly min_likes likes qualifies.
            if tweet.likeCount >= min_likes:
                tweets.append([tweet.date, tweet.id, tweet.sourceLabel, tweet.user.username, tweet.likeCount, tweet.content])

                # Stop as soon as the cap is reached so no further item is fetched.
                # A falsy num_tweets (None or 0) means "no cap".
                if num_tweets and len(tweets) == num_tweets:
                    break

        tweets_df = pd.DataFrame(tweets, columns=self._COLUMNS)

        return tweets_df

    def get_tweets_from_user(self, username, since=None, until=None, min_likes=0, num_tweets=None):
        """
        Parameters
        ----------
        username : string
            Twitter username (without @).
        since : string ("YYYY-MM-DD"), optional
            From when to return tweets. Omitted from the query when not given.
        until : string ("YYYY-MM-DD"), optional
            Until when to return tweets. Omitted from the query when not given.
        min_likes : int, optional
            Minimum number of likes of tweets. The default is 0.
        num_tweets : int, optional
            Number of tweets to return. The default is to return all.

        Returns
        -------
        tweets_df : pandas dataframe containing tweets

        """

        # Each bound is applied independently, so passing only one of them is honoured.
        # NOTE(review): confirm that the search backend honours "-is:retweet".
        query = f"from:{username}"
        if since:
            query += f" since:{since}"
        if until:
            query += f" until:{until}"
        query += " -is:retweet"

        tweets_df = self.get_tweets(query, min_likes=min_likes, num_tweets=num_tweets)

        self.result = tweets_df
        return tweets_df

    def get_tweets_from_key(self, key, since, until, lang="en", links=False, retweets=False, min_likes=0, num_tweets=None):
        """
        Parameters
        ----------
        key : string
            Matches tweets with key included.
        since : string ("YYYY-MM-DD")
            From when to return tweets.
        until : string ("YYYY-MM-DD")
            Until when to return tweets.
        lang : string, optional
            Language of tweets to return. The default is English.
        links : bool, optional
            When False (default) " -has:links" is appended to the query.
        retweets : bool, optional
            When False (default) " -is:retweet" is appended to the query.
        min_likes : int, optional
            Minimum number of likes of tweets. The default is 0.
        num_tweets : int, optional
            Number of tweets to return. The default is to return all.

        Returns
        -------
        tweets_df : pandas dataframe containing tweets

        """

        query = f"{key} since:{since} until:{until} lang:{lang}"
        if not retweets:
            query += " -is:retweet"
        if not links:
            query += " -has:links"

        tweets_df = self.get_tweets(query, min_likes=min_likes, num_tweets=num_tweets)

        self.result = tweets_df
        return tweets_df

    def serialize_result(self, file_name):
        """Write the most recent result to ``file_name`` as UTF-8 CSV.

        Raises
        ------
        ValueError
            If no result has been stored yet.
        """
        if self.result is None:
            raise ValueError("no result to serialize; scrape tweets first")
        self.result.to_csv(file_name, encoding='utf-8')

In [None]:
class TwitterSentiment():
    """Scrapes tweets for a list of keywords and reports the mean predicted sentiment per keyword."""

    def __init__(self, keywords, min_likes=5):
        """
        Parameters
        ----------
        keywords : list of string
            Search keys; one sentiment score is produced per key.
        min_likes : int, optional
            Like threshold forwarded to the scraper. The default is 5.
        """
        self.keywords = keywords
        self.min_likes = min_likes

        self.scraper = TwitterSentimentScraper()
        self.model = TwitterSentimentClassifier()

    def scrape_tweets(self, since, until):
        """Fetch the tweets of every keyword between ``since`` and ``until`` into ``self.tweets``."""
        self.tweets = dict()

        for key in tqdm(self.keywords):
            self.tweets[key] = self.scraper.get_tweets_from_key(key, since, until, min_likes=self.min_likes)

    def load_saved_model(self, path):
        """Load the classifier weights stored at ``path``."""
        self.model.load_saved_model(path)

    def predict_sentiment(self):
        """Classify the scraped tweets and average the predictions per keyword.

        Returns
        -------
        sentiment : pandas dataframe
            Columns ["Keyword", "Sentiment"]; the score is the mean predicted class id,
            or NaN for a keyword without any tweets. Also stored on ``self.sentiment``;
            per-keyword predictions and texts are kept on ``self.sentiment_list`` and
            ``self.tweets_content``.
        """
        scored_keys = list()
        sentiment_scores = list()
        self.sentiment_list = list()
        self.tweets_content = list()

        for key in tqdm(self.tweets):
            tweets_topic_df = self.tweets[key]

            # TODO: better duplicate resolver needed
            # dict.fromkeys drops exact duplicates while keeping the scraped order,
            # so the stored texts are deterministic between runs.
            unique_tweets = list(dict.fromkeys(tweets_topic_df["content"]))
            contents = [tweet.replace("\n", " ") for tweet in unique_tweets]

            # Skip the model for keywords without tweets; their mean is undefined.
            predictions = self.model.predict(contents) if contents else []

            self.tweets_content.append(contents)
            self.sentiment_list.append(predictions)

            if predictions:
                sentiment_scores.append(sum(predictions)/len(predictions))
            else:
                sentiment_scores.append(float("nan"))
            # Label each score with the key it was computed for.
            scored_keys.append(key)

        self.sentiment = pd.DataFrame(list(zip(scored_keys, sentiment_scores)), columns=["Keyword", "Sentiment"])

        return self.sentiment

    def manual_review(self):
        """Print the predictions and the tweet texts of every keyword for inspection."""
        for i in range(len(self.sentiment_list)):
            print("-----------------------------")
            print("Sentiment: ", self.sentiment_list[i])
            print("--------------")
            print(self.tweets_content[i])
            print("-----------------------------")

    def __main__(self, since, until, model="twitter-sentiment-500k-2e"):
        """Load the model, scrape, predict and print the per-keyword sentiment table."""
        self.load_saved_model(model)
        self.scrape_tweets(since, until)
        self.predict_sentiment()

        with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 4,
                       ):
            print(self.sentiment)
        

In [None]:
tst = TwitterSentiment(["Joe Biden", "Donald Trump", "Vladimir Putin"], min_likes=10)

In [None]:
# Twitter sentiment of Joe Biden, Donald Trump, Vladimir Putin on December 1st, 2022
tst.__main__("2022-12-01", "2022-12-02")

Unnamed: 0,Keyword,Sentiment
0,Joe Biden,0.390864
1,Donald Trump,0.512868
2,Vladimir Putin,0.233645


In [None]:
tst = TwitterSentiment(["Novak Djokovic", "Dusan Vlahovic"], min_likes=3)

In [None]:
# Twitter sentiment of Novak Djokovic, Dusan Vlahovic on December 1st, 2022
tst.__main__("2022-12-01", "2022-12-02")

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

          Keyword  Sentiment
0  Novak Djokovic     0.8814
1  Dusan Vlahovic     0.3333


In [None]:
tst = TwitterSentiment(["Russia", "Ukraine"], min_likes=10)

In [None]:
# Twitter sentiment of Russia, Ukraine on December 1st, 2022
tst.__main__("2022-12-01", "2022-12-02")

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

   Keyword  Sentiment
0   Russia     0.2886
1  Ukraine     0.3340
