## **Pipeline for emotion, sentiment, and hate speech analysis**

Notebook used for EDGE: Inclusione LGBTQIA+ e sviluppo sostenibile [link](https://www.bloomberg.com/news/articles/2024-04-19/italy-s-lgbtqia-inclusion-levels-have-stopped-improving)

This notebook builds on three pretrained models for sentiment, emotion, and hate speech analysis developed by [MilaNLProc](https://github.com/MilaNLProc).

In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import spacy 



In [2]:
!pip install datasets transformers==4.28.0



In [None]:
## Does not work well, do not use! Use the next cell instead...


from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm

class TextDataset(torch.utils.data.Dataset):
    """Map-style dataset exposing one encoded sentence per index.

    ``encodings`` is a mapping of field name -> per-sentence sequences
    (e.g. the batch output of a tokenizer); it must contain "input_ids".
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # One dataset item per encoded sentence.
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        # Build a fresh tensor for every field of the idx-th sentence.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        return sample

class SentimentClassifier:
    """Sentiment classifier (negative / positive) built on a Hugging Face checkpoint."""

    # Single source of truth for the checkpoint, shared by tokenizer and model.
    MODEL_NAME = "MilaNLProc/feel-it-italian-sentiment"

    def __init__(self):
        """
        Simple class initialization for the sentiment classifier,
        the sentiment classification model is going to be downloaded
        directly from huggingface
        """
        self.sentiment_map = {0: "negative", 1: "positive"}
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.device = self._get_device()
        self.model = self._load_model()
        print(f"Model running on: {self.device}")

    def _get_device(self):
        """
        Picks the torch device: 'cuda' with several GPUs, 'cuda:0' with one, 'cpu' otherwise
        @return: the selected torch.device
        """
        if not torch.cuda.is_available():
            return torch.device('cpu')

        num_gpus = torch.cuda.device_count()
        print(f"Number of GPUs available is {num_gpus}")
        if num_gpus > 1:
            return torch.device('cuda')
        return torch.device('cuda:0' if num_gpus == 1 else 'cpu')

    def _load_model(self):
        """
        Loads the checkpoint in eval mode and moves it to self.device
        @return: the model, wrapped in DataParallel when more than one GPU is visible
        """
        model = AutoModelForSequenceClassification.from_pretrained(self.MODEL_NAME)
        model.eval()
        if torch.cuda.is_available() and torch.cuda.device_count() > 1:
            # Referenced through `torch` because this cell never imports `nn`.
            model = torch.nn.DataParallel(model)
        model.to(self.device)
        return model

    def predict(self, sentences, batch_size=32):
        """
        Predicts the sentiment for the sentences in input
        @param sentences: sentences to be classified with the sentiment classifier
        @param batch_size: batch size for the network
        @return: list with one label ("negative" / "positive") per input sentence
        """
        # Nothing to classify: return early instead of failing on an empty batch.
        if len(sentences) == 0:
            return []

        on_gpu = self.device.type == "cuda"
        encodings = self.tokenizer(sentences,
                                   truncation=True,
                                   padding=True,
                                   max_length=500)
        dataset = TextDataset(encodings)

        # Pinned memory only helps host-to-GPU copies, so it is skipped on CPU.
        loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=2, pin_memory=on_gpu)

        predicted_ids = []
        with torch.no_grad():
            for batch in tqdm(loader, total=len(loader), desc="Predicting Sentiment"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)

                outputs = self.model(input_ids, attention_mask=attention_mask)
                predicted_ids.extend(torch.argmax(outputs["logits"], dim=1).cpu().tolist())

        if on_gpu:
            torch.cuda.empty_cache()

        return [self.sentiment_map[k] for k in predicted_ids]

class EmotionClassifier:
    """Emotion classifier (anger / fear / joy / sadness) built on a Hugging Face checkpoint."""

    # Single source of truth for the checkpoint: tokenizer and model must both
    # come from the emotion checkpoint so the four-entry label map is valid.
    MODEL_NAME = "MilaNLProc/feel-it-italian-emotion"

    def __init__(self):
        """
        Simple class initialization for the emotion classifier,
        the emotion classification model is going to be downloaded
        directly from huggingface
        """
        self.emotion_map = {0: "anger", 1: "fear", 2: "joy", 3: "sadness"}
        self.tokenizer = AutoTokenizer.from_pretrained(self.MODEL_NAME)
        self.device = self._get_device()
        self.model = self._load_model()
        print(f"Model running on: {self.device}")

    def _get_device(self):
        """
        Picks the torch device: 'cuda' with several GPUs, 'cuda:0' with one, 'cpu' otherwise
        @return: the selected torch.device
        """
        if not torch.cuda.is_available():
            return torch.device('cpu')

        num_gpus = torch.cuda.device_count()
        if num_gpus > 1:
            return torch.device('cuda')
        return torch.device('cuda:0' if num_gpus == 1 else 'cpu')

    def _load_model(self):
        """
        Loads the emotion checkpoint in eval mode and moves it to self.device
        @return: the model, wrapped in DataParallel when more than one GPU is visible
        """
        model = AutoModelForSequenceClassification.from_pretrained(self.MODEL_NAME)
        model.eval()
        if torch.cuda.is_available() and torch.cuda.device_count() > 1:
            # Referenced through `torch` because this cell never imports `nn`.
            model = torch.nn.DataParallel(model)
        model.to(self.device)
        return model

    def predict(self, sentences, batch_size=32):
        """
        Predicts the emotion for the sentences in input
        @param sentences: sentences to be classified with the emotion classifier
        @param batch_size: batch size for the network
        @return: list with one label (anger / fear / joy / sadness) per input sentence
        """
        # Nothing to classify: return early instead of failing on an empty batch.
        if len(sentences) == 0:
            return []

        on_gpu = self.device.type == "cuda"
        encodings = self.tokenizer(sentences,
                                   truncation=True,
                                   padding=True,
                                   max_length=500)
        dataset = TextDataset(encodings)

        # Pinned memory only helps host-to-GPU copies, so it is skipped on CPU.
        loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, num_workers=2, pin_memory=on_gpu)

        predicted_ids = []
        with torch.no_grad():
            for batch in tqdm(loader, total=len(loader), desc="Predicting Emotion"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)

                outputs = self.model(input_ids, attention_mask=attention_mask)
                predicted_ids.extend(torch.argmax(outputs["logits"], dim=1).cpu().tolist())

        if on_gpu:
            torch.cuda.empty_cache()

        return [self.emotion_map[k] for k in predicted_ids]

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch import nn
from tqdm import tqdm

# No change here, as this is a straightforward Dataset definition.
class TextDataset(torch.utils.data.Dataset):
    """Dataset view over batch encodings: item ``i`` holds the tensors of sentence ``i``."""

    def __init__(self, encodings):
        # Mapping of field name -> per-sentence sequences; must contain "input_ids".
        self.encodings = encodings

    def __len__(self):
        input_ids = self.encodings["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Each field of the selected sentence is turned into its own tensor.
        return dict(
            (name, torch.tensor(column[idx]))
            for name, column in self.encodings.items()
        )

class BaseClassifier:
    """Shared inference wrapper around a Hugging Face sequence-classification checkpoint.

    Subclasses provide the checkpoint name and the mapping from predicted
    class index to label string.
    """

    def __init__(self, model_name, label_map):
        """Load the tokenizer and model for ``model_name`` and pick the device.

        Args:
            model_name: checkpoint identifier passed to ``from_pretrained``.
            label_map: dict mapping predicted class index -> label string.
        """
        self.label_map = label_map
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.device = self._get_device()
        self.model = self._load_model(model_name)
        print(f"Model running on: {self.device}")

    def _get_device(self):
        """Getting the device. If multiple GPUs are available, DataParallel will be used later."""
        if not torch.cuda.is_available():
            return torch.device('cpu')
        # Plain 'cuda' with several GPUs, explicit 'cuda:0' with a single one.
        return torch.device('cuda' if torch.cuda.device_count() > 1 else 'cuda:0')

    def _load_model(self, model_name):
        """Load the model. Use Data Parallelism if multiple GPUs are available."""
        model = AutoModelForSequenceClassification.from_pretrained(model_name)
        model.eval()
        if torch.cuda.is_available() and torch.cuda.device_count() > 1:
            model = nn.DataParallel(model)
        model.to(self.device)
        return model

    def predict(self, sentences, batch_size=32):
        """Predict one label per input sentence.

        Sentences are tokenized (truncated to 500 tokens, padded to the longest
        sequence) and passed through the model in batches; the arg-max class
        index of each sentence is translated through ``label_map``.

        Args:
            sentences: list of strings to classify.
            batch_size: number of sentences per forward pass.

        Returns:
            List of label strings, in the same order as ``sentences``.
            An empty input yields an empty list.
        """
        # Nothing to classify: return early instead of failing on an empty batch.
        if len(sentences) == 0:
            return []

        on_gpu = self.device.type == "cuda"
        encodings = self.tokenizer(sentences, truncation=True, padding=True, max_length=500)
        dataset = TextDataset(encodings)

        # Worker processes overlap batch preparation with the forward pass;
        # pinned memory only helps host-to-GPU copies, so it is skipped on CPU.
        loader = torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, num_workers=2, pin_memory=on_gpu
        )

        predicted_ids = []
        with torch.no_grad():
            for batch in tqdm(loader, total=len(loader), desc="Predicting"):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)

                outputs = self.model(input_ids, attention_mask=attention_mask)
                predicted_ids.extend(torch.argmax(outputs["logits"], dim=1).cpu().tolist())

        # Release cached GPU blocks once inference is done.
        if on_gpu:
            torch.cuda.empty_cache()

        return [self.label_map[k] for k in predicted_ids]


class SentimentClassifier(BaseClassifier):
    """Two-label (negative / positive) classifier configured on top of BaseClassifier."""

    def __init__(self):
        checkpoint = "MilaNLProc/feel-it-italian-sentiment"
        id_to_label = {0: "negative", 1: "positive"}
        super().__init__(checkpoint, id_to_label)


class EmotionClassifier(BaseClassifier):
    """Four-label (anger / fear / joy / sadness) classifier configured on top of BaseClassifier."""

    def __init__(self):
        checkpoint = "MilaNLProc/feel-it-italian-emotion"
        id_to_label = {0: "anger", 1: "fear", 2: "joy", 3: "sadness"}
        super().__init__(checkpoint, id_to_label)


# Please note:
# 1. Model loading: nn.DataParallel is used to spread inference batches across all available GPUs.
# 2. Data loading: Using `num_workers` to speed up data loading. 
# 3. Memory management: Used `torch.cuda.empty_cache()` to free up unused memory.
# 4. Refactored the code for DRY principle: Extracted common functionalities to a BaseClassifier to avoid redundancy.

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import numpy as np
from typing import List

class TextInferenceDataset(Dataset):
    """Dataset that tokenizes all texts once, up front, and serves them by index.

    Every text is padded/truncated to exactly ``max_length`` tokens and the
    tokenizer is asked for PyTorch tensors (``return_tensors='pt'``).
    """

    def __init__(self, texts, tokenizer, max_length=100):
        """
        Args:
            texts: list of strings to encode.
            tokenizer: callable returning a mapping of field name -> batch tensor.
            max_length: fixed sequence length used for padding and truncation
                (defaults to 100, the value previously hard-coded here).
        """
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.encodings = tokenizer(
            texts,
            padding='max_length',
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        )

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Row `idx` of every encoded field.
        return {key: val[idx] for key, val in self.encodings.items()}

class HateSpeechClassifier:
    """Binary hate-speech classifier ("not-hate" / "hate") over Hugging Face checkpoints."""

    # variant name -> (tokenizer checkpoint, model checkpoint)
    _CHECKPOINTS = {
        "twitter-base": ("MilaNLProc/hate-ita", "MilaNLProc/hate-ita"),
        "base": ("MilaNLProc/xlm-emo-t", "MilaNLProc/hate-ita-xlm-r-base"),
        "large": ("MilaNLProc/xlm-emo-t", "MilaNLProc/hate-ita-xlm-r-large"),
    }

    # predicted class index -> label
    _LABELS = {0: "not-hate", 1: "hate"}

    def __init__(self, model="twitter-base"):
        """Load tokenizer and model for the requested variant.

        Args:
            model: one of "twitter-base", "base" or "large".

        Raises:
            ValueError: if ``model`` is not a known variant. Raised before
                anything is downloaded.
        """
        try:
            tokenizer_name, model_name = self._CHECKPOINTS[model]
        except KeyError:
            raise ValueError(
                f"Not Yet Implemented: unknown model {model!r}; "
                f"expected one of {sorted(self._CHECKPOINTS)}"
            ) from None

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)

        # Spread batches over all GPUs when more than one is visible.
        if torch.cuda.is_available() and torch.cuda.device_count() > 1:
            self.model = torch.nn.DataParallel(self.model)

        self.model.to(self.device)

    def predict(self, text: List[str], batch_size=32):
        """Classify every string in ``text``.

        Args:
            text: list of strings to classify.
            batch_size: per-GPU batch size; multiplied by the GPU count when
                more than one GPU is visible.

        Returns:
            List with "not-hate" or "hate" for each input, in input order.
            An empty input yields an empty list.
        """
        # Nothing to classify: return early instead of tokenizing an empty batch.
        if len(text) == 0:
            return []

        # Imported here so the class does not depend on another notebook cell
        # having imported tqdm first.
        from tqdm import tqdm

        if torch.cuda.is_available() and torch.cuda.device_count() > 1:
            batch_size *= torch.cuda.device_count()

        dataset = TextInferenceDataset(text, self.tokenizer)
        data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        predicted_ids = []

        self.model.eval()
        with torch.no_grad():
            for batch in tqdm(data_loader, desc="Predicting", total=len(data_loader)):
                batch = {key: value.to(self.device) for key, value in batch.items()}
                logits = self.model(**batch)[0]
                predicted_ids.extend(torch.argmax(logits, dim=1).cpu().tolist())

        return [self._LABELS[k] for k in predicted_ids]

def tt(tokenizer):
    """Return a function that tokenizes the "texts" field of a batch of examples.

    The returned callable pads/truncates every text to 100 tokens.
    """
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=100)

    def tokenize_function(examples):
        batch_texts = examples["texts"]
        return tokenizer(batch_texts, **tokenizer_options)

    return tokenize_function

def prepare_dataset(dataset, tokenizer):
    """Tokenize the "texts" column of ``dataset`` and switch it to torch format.

    The raw "texts" column is removed by the mapping step; the mapped
    dataset is returned.
    """
    tokenize_batch = tt(tokenizer)
    tokenized = dataset.map(tokenize_batch, batched=True, remove_columns=["texts"])
    tokenized.set_format("torch")
    return tokenized

In [8]:
df = pd.read_csv("/kaggle/input/nogeodata/zan.csv", sep=',')

In [9]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,language,user_id,date,com,reg,prov,testo,link,sent,emozione
0,1843,2088,egaylity l omosessualità anche se dubbia è ele...,it,PairsonnalitesU,2017-01-02 13:44:08,Monte Caminetto,Lazio,RM,egaylity l omosessualità anche se dubbia è ele...,http,negative,anger
1,1844,2089,egaylity a firenze approda chiesa casa per tut...,it,PairsonnalitesU,2017-01-02 13:44:09,Monte Caminetto,Lazio,RM,egaylity a firenze approda chiesa casa per tut...,http,positive,sadness
2,1961,2234,egaylity a firenze approda chiesa casa per tut...,it,PairsonnalitesU,2017-01-02 15:57:04,Monte Caminetto,Lazio,RM,egaylity a firenze approda chiesa casa per tut...,http,positive,fear
3,1972,2246,stigmabase it l omosessualità anche se dubbia ...,it,pairsonnalitesB,2017-01-02 16:09:11,Rome,Lazio,RM,stigmabase it l omosessualità anche se dubbia ...,http,negative,anger
4,2029,2311,? buona sera a tutti amici ? ? ? ? ? tbt gay g...,it,lordguerreiro_,2017-01-02 17:07:53,Pisa,Toscana,PI,? buona sera a tutti amici ? ? ? ? ? tbt gay g...,http,positive,joy
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7874,1244248,100506,william lo sgarbo a kate middleton passo falso...,it,ArmyArmy79,2022-12-19 14:11:35,Rome,Lazio,RM,william lo sgarbo a kate middleton passo falso...,http,negative,anger
7875,1244284,100549,william lascia a casa kate middleton e rivede ...,it,ArmyArmy79,2022-12-19 16:22:34,Rome,Lazio,RM,william lascia a casa kate middleton e rivede ...,http,negative,joy
7876,1245008,101570,parlanodinoi bilancio consigliocomunale tollo ...,it,AngeloRadica,2022-12-23 20:15:26,Tollo,Abruzzo,CH,parlanodinoi bilancio consigliocomunale tollo ...,http,positive,joy
7877,1245391,102042,e mancava una foto ancora quella con l albero ...,it,stegru1,2022-12-25 18:19:56,Florence,Toscana,FI,e mancava una foto ancora quella con l albero ...,http,positive,joy


In [5]:
import torch.nn as nn

In [None]:
emotion_classifier = EmotionClassifier()

In [10]:
df = df.dropna(subset='text')
tweets = df['text'].tolist()
len(tweets)

7879

In [None]:
def divide_vector(vector_length, parts=5):
    """Split ``range(vector_length)`` into ``parts`` contiguous index ranges.

    The ranges are as equal as possible: the first ``vector_length % parts``
    ranges get one extra element. The default of five parts matches the
    ``tweets_pt1`` .. ``tweets_pt5`` chunks built from the result; the
    previously hard-coded single part made ``result[1]`` raise IndexError.

    Args:
        vector_length: number of elements to split (non-negative int).
        parts: number of ranges to produce (positive int).

    Returns:
        List of ``(start, end)`` tuples with ``end`` exclusive, covering
        ``0 .. vector_length`` without gaps or overlaps.

    Raises:
        ValueError: if ``parts`` is smaller than 1.
    """
    if parts < 1:
        raise ValueError(f"parts must be a positive integer, got {parts!r}")

    equal_parts, remainder = divmod(vector_length, parts)

    divided_parts = []
    start_index = 0
    for i in range(parts):
        # The first `remainder` chunks absorb the leftover elements.
        end_index = start_index + equal_parts + (1 if i < remainder else 0)
        divided_parts.append((start_index, end_index))
        start_index = end_index

    return divided_parts

vector_length = len(tweets)
result = divide_vector(vector_length)
print(result)

In [None]:
# Cut `tweets` into five chunks using the (start, end) pairs stored in `result`;
# `result` must therefore hold at least five pairs.
tweets_pt1 = tweets[slice(*result[0])]
tweets_pt2 = tweets[slice(*result[1])]
tweets_pt3 = tweets[slice(*result[2])]
tweets_pt4 = tweets[slice(*result[3])]
tweets_pt5 = tweets[slice(*result[4])]

# Show the size of each chunk.
print(*map(len, (tweets_pt1, tweets_pt2, tweets_pt3, tweets_pt4, tweets_pt5)))

In [None]:
df

In [None]:
emo_pt1 = emotion_classifier.predict(tweets_pt1)
emo_pt2 = emotion_classifier.predict(tweets_pt2)
emo_pt3 = emotion_classifier.predict(tweets_pt3)
emo_pt4 = emotion_classifier.predict(tweets_pt4)
emo_pt5 = emotion_classifier.predict(tweets_pt5)

In [None]:
emo = emo_pt1 + emo_pt2 + emo_pt3 + emo_pt4 + emo_pt5
len(emo)

In [None]:
df["emotion"] = emo

In [None]:
df.to_csv("/kaggle/working/2020_fIT_noGeo_emo.csv", sep=';')

In [None]:
import torch.nn as nn

In [None]:
sentiment_classifier = SentimentClassifier()

In [None]:
sent_pt1 = sentiment_classifier.predict(tweets_pt1)
sent_pt2 = sentiment_classifier.predict(tweets_pt2)
sent_pt3 = sentiment_classifier.predict(tweets_pt3)
sent_pt4 = sentiment_classifier.predict(tweets_pt4)
sent_pt5 = sentiment_classifier.predict(tweets_pt5)

In [None]:
sent = sent_pt1 + sent_pt2 + sent_pt3 + sent_pt4 + sent_pt5 
len(sent)

In [None]:
df["feel-it sentiment"] = sent

In [None]:
df.to_csv("/kaggle/working/2020_fIT_noGeo_emoSent.csv", sep=';')

In [None]:
df

In [11]:
hc = HateSpeechClassifier("large")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [12]:
from tqdm import tqdm

In [13]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,language,user_id,date,com,reg,prov,testo,link,sent,emozione
0,1843,2088,egaylity l omosessualità anche se dubbia è ele...,it,PairsonnalitesU,2017-01-02 13:44:08,Monte Caminetto,Lazio,RM,egaylity l omosessualità anche se dubbia è ele...,http,negative,anger
1,1844,2089,egaylity a firenze approda chiesa casa per tut...,it,PairsonnalitesU,2017-01-02 13:44:09,Monte Caminetto,Lazio,RM,egaylity a firenze approda chiesa casa per tut...,http,positive,sadness
2,1961,2234,egaylity a firenze approda chiesa casa per tut...,it,PairsonnalitesU,2017-01-02 15:57:04,Monte Caminetto,Lazio,RM,egaylity a firenze approda chiesa casa per tut...,http,positive,fear
3,1972,2246,stigmabase it l omosessualità anche se dubbia ...,it,pairsonnalitesB,2017-01-02 16:09:11,Rome,Lazio,RM,stigmabase it l omosessualità anche se dubbia ...,http,negative,anger
4,2029,2311,? buona sera a tutti amici ? ? ? ? ? tbt gay g...,it,lordguerreiro_,2017-01-02 17:07:53,Pisa,Toscana,PI,? buona sera a tutti amici ? ? ? ? ? tbt gay g...,http,positive,joy
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7874,1244248,100506,william lo sgarbo a kate middleton passo falso...,it,ArmyArmy79,2022-12-19 14:11:35,Rome,Lazio,RM,william lo sgarbo a kate middleton passo falso...,http,negative,anger
7875,1244284,100549,william lascia a casa kate middleton e rivede ...,it,ArmyArmy79,2022-12-19 16:22:34,Rome,Lazio,RM,william lascia a casa kate middleton e rivede ...,http,negative,joy
7876,1245008,101570,parlanodinoi bilancio consigliocomunale tollo ...,it,AngeloRadica,2022-12-23 20:15:26,Tollo,Abruzzo,CH,parlanodinoi bilancio consigliocomunale tollo ...,http,positive,joy
7877,1245391,102042,e mancava una foto ancora quella con l albero ...,it,stegru1,2022-12-25 18:19:56,Florence,Toscana,FI,e mancava una foto ancora quella con l albero ...,http,positive,joy


In [14]:
hate = hc.predict(tweets)

Predicting: 100%|██████████| 124/124 [01:31<00:00,  1.35it/s]


In [None]:
hate_pt1 = hc.predict(tweets_pt1)
hate_pt2 = hc.predict(tweets_pt2)
hate_pt3 = hc.predict(tweets_pt3)
hate_pt4 = hc.predict(tweets_pt4)
hate_pt5 = hc.predict(tweets_pt5)

In [None]:
hate = hate_pt1 + hate_pt2 + hate_pt3 + hate_pt4 + hate_pt5 

In [15]:
df["hate"] = hate

In [16]:
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,language,user_id,date,com,reg,prov,testo,link,sent,emozione,hate
0,1843,2088,egaylity l omosessualità anche se dubbia è ele...,it,PairsonnalitesU,2017-01-02 13:44:08,Monte Caminetto,Lazio,RM,egaylity l omosessualità anche se dubbia è ele...,http,negative,anger,not-hate
1,1844,2089,egaylity a firenze approda chiesa casa per tut...,it,PairsonnalitesU,2017-01-02 13:44:09,Monte Caminetto,Lazio,RM,egaylity a firenze approda chiesa casa per tut...,http,positive,sadness,not-hate
2,1961,2234,egaylity a firenze approda chiesa casa per tut...,it,PairsonnalitesU,2017-01-02 15:57:04,Monte Caminetto,Lazio,RM,egaylity a firenze approda chiesa casa per tut...,http,positive,fear,not-hate
3,1972,2246,stigmabase it l omosessualità anche se dubbia ...,it,pairsonnalitesB,2017-01-02 16:09:11,Rome,Lazio,RM,stigmabase it l omosessualità anche se dubbia ...,http,negative,anger,not-hate
4,2029,2311,? buona sera a tutti amici ? ? ? ? ? tbt gay g...,it,lordguerreiro_,2017-01-02 17:07:53,Pisa,Toscana,PI,? buona sera a tutti amici ? ? ? ? ? tbt gay g...,http,positive,joy,not-hate
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7874,1244248,100506,william lo sgarbo a kate middleton passo falso...,it,ArmyArmy79,2022-12-19 14:11:35,Rome,Lazio,RM,william lo sgarbo a kate middleton passo falso...,http,negative,anger,not-hate
7875,1244284,100549,william lascia a casa kate middleton e rivede ...,it,ArmyArmy79,2022-12-19 16:22:34,Rome,Lazio,RM,william lascia a casa kate middleton e rivede ...,http,negative,joy,not-hate
7876,1245008,101570,parlanodinoi bilancio consigliocomunale tollo ...,it,AngeloRadica,2022-12-23 20:15:26,Tollo,Abruzzo,CH,parlanodinoi bilancio consigliocomunale tollo ...,http,positive,joy,not-hate
7877,1245391,102042,e mancava una foto ancora quella con l albero ...,it,stegru1,2022-12-25 18:19:56,Florence,Toscana,FI,e mancava una foto ancora quella con l albero ...,http,positive,joy,not-hate


In [17]:
# Persist the annotated frame. The row index is not written: saving it is what
# produces the extra "Unnamed: 0*" columns each time the file is read back.
df.to_csv("/kaggle/working/zan_hate.csv", sep=';', index=False)