# Sarcasm Detection by Emojis
## By: Rivka Sheiner
## Purpose: to measure the contribution of emojis to sarcasm detection

Download the datasets of sarcastic and non-sarcastic tweets with emojis and split them into train and test sets

In [None]:
# Fetch the two tweet CSV files with the gdown CLI (each argument is a Google Drive file ID).
# The trailing shell comment on each line names the file that the download produces.
! gdown 1X1jZgrRaVH-h0xX3k8cSzT0kumIVlmwx #sarc_emojis_1000.csv
! gdown 1--LLxkODAryG3TgUHrcggfrYoXnr9UFe #not_sarc_emojis_1000.csv

Downloading...
From: https://drive.google.com/uc?id=1X1jZgrRaVH-h0xX3k8cSzT0kumIVlmwx
To: /content/sarc_emojis_1000.csv
100% 1.60M/1.60M [00:00<00:00, 42.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1--LLxkODAryG3TgUHrcggfrYoXnr9UFe
To: /content/not_sarc_emojis_1000.csv
100% 1.68M/1.68M [00:00<00:00, 45.3MB/s]


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Load the sarcastic and non-sarcastic tweets with emojis
sarcastic_df = pd.read_csv('sarc_emojis_1000.csv')
non_sarcastic_df = pd.read_csv('not_sarc_emojis_1000.csv')

# Append each tweet's emoji (held in the 'label' column) to the end of its text
sarcastic_df['text'] = sarcastic_df['text'] + ' ' + sarcastic_df['label']
non_sarcastic_df['text'] = non_sarcastic_df['text'] + ' ' + non_sarcastic_df['label']

# Add the target class to the data
sarcastic_df['class'] = 1  # 1 for sarcastic
non_sarcastic_df['class'] = 0  # 0 for non-sarcastic

# Combine sarcastic and non-sarcastic tweets
df = pd.concat([sarcastic_df, non_sarcastic_df], ignore_index=True)

# Shuffle the rows AND subsample them: `sample(frac=...)` returns a random
# subset of that relative size, so everything downstream only sees
# SAMPLE_FRACTION of the combined data. Set it to 1.0 for a pure shuffle
# that keeps every row.
SAMPLE_FRACTION = 0.1
df = df.sample(frac=SAMPLE_FRACTION, random_state=42).reset_index(drop=True)

# Split the data into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['class'], test_size=0.2, random_state=42)

In [None]:
# Collect the distinct emojis that occur in the 'label' column of the sampled data
emoji_column = df['label']
all_emoticons_list = emoji_column.unique()

In [None]:
# Display the distinct emojis collected from the 'label' column
all_emoticons_list

array(['😐', '👏', '😏', '😊', '😁', '🤔', '😡', '👍', '😳', '😑', '😄', '👌', '😃',
       '☺', '😔', '😜', '😂', '😒', '🙄', '😉'], dtype=object)

## Emoji Results with BERT

In [None]:
import pandas as pd
import torch
# `transformers.AdamW` is deprecated and no longer shipped by recent
# transformers releases; PyTorch's own implementation is the replacement.
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fix the seed so that the newly initialised classification head, the
# train/test partition and the batch shuffling are repeatable across runs.
SEED = 42
torch.manual_seed(SEED)

texts = df['text'].tolist()
labels = df['class'].tolist()

# Load pre-trained DistilBERT model and tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)  # binary: sarcastic vs. not

# Tokenize and encode the input texts (padded to a common length, truncated if too long)
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Bundle token ids, attention masks and labels; row i of the dataset corresponds to texts[i]
dataset = TensorDataset(inputs['input_ids'], inputs['attention_mask'], torch.tensor(labels))

# Split the dataset into training and testing sets (80% / 20%).
# A dedicated seeded generator keeps the partition stable between runs.
# NOTE: this partition is independent of the scikit-learn X_train/X_test split;
# the tweets behind the test rows are texts[i] for i in test_dataset.indices.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SEED),
)

# Define data loaders (the test loader is not shuffled, so predictions keep the subset order)
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=128, shuffle=False)

# Define optimizer and training parameters.
# eps and weight_decay are set explicitly to the defaults of the former
# transformers implementation, which differ from torch.optim.AdamW's defaults.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
epochs = 4

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        optimizer.zero_grad()

        input_ids, attention_mask, label = batch
        # Passing `labels` makes the model return the classification loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=label)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs}, Average Training Loss: {avg_loss}')


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4, Average Training Loss: 0.6681870031356811
Epoch 2/4, Average Training Loss: 0.5674539041519165
Epoch 3/4, Average Training Loss: 0.4612859606742859
Epoch 4/4, Average Training Loss: 0.3606361949443817


In [None]:
# Score the fine-tuned model on the held-out batches
model.eval()
all_preds, all_labels = [], []

# Inference only: gradient tracking is switched off
with torch.no_grad():
    for input_ids, attention_mask, label in test_dataloader:
        logits = model(input_ids, attention_mask=attention_mask).logits
        # Predicted class = index of the largest logit in each row
        all_preds += logits.argmax(dim=1).tolist()
        all_labels += label.tolist()

# Summary metrics (class 1 is the positive class)
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds)
recall = recall_score(all_labels, all_preds)
f1 = f1_score(all_labels, all_preds)

print('BERT results: \n')
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value}")

BERT results: 

Accuracy: 0.7275
Precision: 0.8162544169611308
Recall: 0.5818639798488665
F1 Score: 0.6794117647058824


In [None]:
# Analysis for misclassifications with emoticon details and FP/FN examples.
#
# `all_labels` / `all_preds` were produced from `test_dataset`, the subset
# created by torch's `random_split`. scikit-learn's `X_test` is a different
# random partition of `df` that merely has the same length, so pairing it with
# these labels and predictions would attach them to unrelated tweets.
# Recover the evaluated tweets through the subset's indices instead:
# `test_dataloader` is not shuffled, so predictions are in `indices` order.
test_texts = [texts[i] for i in test_dataset.indices]
assert len(test_texts) == len(all_labels) == len(all_preds), \
    "texts, labels and predictions must describe the same test rows"
test_df = pd.DataFrame({'text': test_texts, 'true_label': all_labels, 'predicted_label': all_preds})

for emoticon in all_emoticons_list:
    # Literal (non-regex) substring match on the tweet text
    df_filtered = test_df[test_df['text'].str.contains(emoticon, regex=False)]

    if len(df_filtered) > 0:
        emoticon_accuracy = accuracy_score(df_filtered['true_label'], df_filtered['predicted_label'])

        # False Positives: Predicted as 1 but true label is 0
        fp_examples = df_filtered[(df_filtered['predicted_label'] == 1) & (df_filtered['true_label'] == 0)]['text'].head(3)

        # False Negatives: Predicted as 0 but true label is 1
        fn_examples = df_filtered[(df_filtered['predicted_label'] == 0) & (df_filtered['true_label'] == 1)]['text'].head(3)

        print(f"\nEmoticon: {emoticon}, Accuracy: {emoticon_accuracy:.4f}, Count: {len(df_filtered)}")

        # Display up to three FP and FN examples
        if not fp_examples.empty:
            print("\nFalse Positives:")
            for example in fp_examples:
                print(f"- {example}")

        if not fn_examples.empty:
            print("\nFalse Negatives:")
            for example in fn_examples:
                print(f"- {example}")
    else:
        print(f"\nEmoticon: {emoticon}, No samples in test set.")



Emoticon: 😐, Accuracy: 0.7317, Count: 41

False Positives:
- glad my sisters car works 100% and we didn't have to stay out in the snow for 20 minutes fixing it..  😐
- exactly: politics is not religion; and if in religion all are sinners- is it not then more likely that politics all are 'criminals'???  😐

False Negatives:
- i just had a dream that made me so nervous n anxious that it woke me up  i’m so exhausted, i can’t stand being awake one more second 😐
- disappointment is not finding your favourite actor in the sequel of a movie. 😐
- le me in stats/maths exam:sir steps ky marks milein gay?sir:beta exam hai dance plus nhi 😐

Emoticon: 👏, Accuracy: 0.7045, Count: 44

False Positives:
- ooowee.... very good photo 👏
- congrats ba คนใหม่ด้วยนะคะba burberry brightbright vachirawit 👏
- thank you for your behaviour  👏

False Negatives:
- great job  i love the way some stories from childhood stay with us and keep niggling as we move through life. 👏
- that'll be amazing  👏
- not only was my 