In [28]:
import os

import torch
from torch.utils.data import Dataset
from torch import nn
from torch import optim
import numpy as np

### Read data from Sarcasm Amazon Review dataset

In [3]:
class CustomDataset(Dataset):
    def __init__(self, data_folder_path, transform=None):
        self.data_folder_path = data_folder_path
        self.transform = transform

        self.data_list = list()
        self.data_list.extend([os.path.join("Ironic", file) for file in os.listdir(os.path.join(dataset_path, "Ironic")) if file.endswith(".txt")])
        self.data_list.extend([os.path.join("Regular", file) for file in os.listdir(os.path.join(dataset_path, "Regular")) if file.endswith(".txt")])

    def __len__(self):
        # Return the total number of samples in your dataset
        return len(self.data_list)
    
    def __getitem__(self, idx):
        # Load and process the text file at the given index and return it
        os.chdir(self.data_folder_path)
        file_path = self.data_list[idx]

        # Get the label from the file name
        label = 0 if 'Regular' in file_path else 1

        text_data = None
        # Read the review
        with open(file_path, 'r', encoding="unicode_escape") as file:
            # First line is the number of stars of the review
            first_line = file.readline()
            # ï»¿ is the BOM character
            first_line = first_line.replace('ï»¿', '').replace('<STARS>', '').replace('</STARS>\n', '')
            stars = float(first_line)

            # Then fin the <REVIEW> tag to retrive the review text
            for line in file:
                if '<REVIEW>' in line:
                        text_data = file.readlines()

        # Remove \n and combine in one string
        text_data = ' '.join([line.replace('\n', '') for line in text_data])
        # Remove </REVIEW> and &quot; tokens
        text_data = text_data.replace('</REVIEW>', '').replace('&quot;', '')
        
        # Apply transformations if any
        if self.transform:
            text_data = self.transform(text_data)
        
        # Return the processed data and its corresponding label (if applicable)
        return text_data, label, stars

In [5]:
os.chdir("../../datasets/SarcasmAmazonReviewsCorpus-master")
dataset_path = os.path.abspath(os.curdir)
dataset = CustomDataset(dataset_path)

# Suffle the dataset
torch.manual_seed(0)
indices = torch.randperm(len(dataset)).tolist()
dataset = torch.utils.data.Subset(dataset, indices)

# Split the dataset into train, validation and test sets
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, val_size, test_size])

# Set sizes
print(len(train_dataset), len(val_dataset), len(test_dataset))

# Labels rates (0: Regular, 1: Ironic)
labels = [label for _, label, _ in train_dataset]
print(labels.count(0), labels.count(1))

1003 125 126
639 364


In [36]:
# Retrive the data and labels from the dataset into numpy array
train_data = list()
train_labels = list()
for data, label, stars in train_dataset:
    train_data.append([data, stars])
    train_labels.append(label)

val_data = list()
val_labels = list()
for data, label, stars in val_dataset:
    val_data.append([data, stars])
    val_labels.append(label)

test_data = list()
test_labels = list()
for data, label, stars in test_dataset:
    test_data.append([data, stars])
    test_labels.append(label)

print(train_data[0])


["The Sun also Rises made a huge impression on me when I read it as a college student a number of years ago.  It is true that one must look beyond the surface to get a clear understanding of any book by Hemingway.   It is also true that the language that he used was not flowery, nor overly  eloquent but the meaning revealed within the lines.  It is also true that  the characters are often expatriates; living on the fringe of society and  hedonistic to the max.  All of those elements are visible here, yet  sometimes it might require a magnifying glass to see it.  However, these  are the qualities which make Ernest Hemingway, the seminal writer for a  generation and certainly one of the best.      I propose one hint when  reading the Sun also Rises.  Pay close attention to the  relationship between Barnes and Robert Cohn.  Barnes laothes Cohn for being  everything that he is not.  What drives him over the edge (in the inner  sanctum of his own mind and demons) is the success Cohn has ins

### Features building functions

In [16]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
import stanza

In [8]:
def get_sentiment_score_feature(text, analyzer):
    """!
    @brief Get the sentiment score feature of a text using VADER sentiment analysis tool.
    @param text (str): Text to be analyzed.
    @param analyzer (SentimentIntensityAnalyzer): VADER sentiment analysis tool.
    @return (dict): Sentiment score feature of the text.
    """
    return analyzer.polarity_scores(text)["compound"]

In [9]:
def get_punctuation_feature(text):
    """!
    @brief Get the punctuation feature of a text.
    @param text (str): Text to be analyzed.
    @return (list): List of punctuation counts normalized by the total count of punctuation in the text.
                    [count of '.', count of '!', count of '?', count of ',']
    """
    punctuations = ['.', '!', '?', ',']
    punctuations_counts = list()
    total_count = 0
    for punctuation in punctuations:
        count = text.count(punctuation)
        punctuations_counts.append(count)
        total_count += count
    # Normalize the counts (ratio of punctuation count to total count)
    if total_count != 0:
        punctuations_counts = [count / total_count for count in punctuations_counts]
    return punctuations_counts

In [10]:
def get_POS_feature(text, pipeline):
    """!
    @brief Get the POS feature of a text.
    @param text (str): Text to be analyzed.
    @param pipeline (stanza.Pipeline): The Stanza pipeline use for the constituency parsing.
    @return (list): List of POS tag counts normalized by the total count of POS tags in the text.
                    [Noun count, Verb count, Adjective count, Adverb count]
    """
    doc = pipeline(text)
    POS_tags = ['NOUN', 'VERB', 'ADJ', 'ADV']
    POS_counts = [0, 0, 0, 0]
    total_count = 0
    for sentence in doc.sentences:
        for word in sentence.words:
            total_count += 1
            if word.upos in POS_tags:
                POS_counts[POS_tags.index(word.upos)] += 1
    # Normalize the counts (ratio of POS tag count to total count)
    if total_count != 0:
        POS_counts = [count / total_count for count in POS_counts]

    return POS_counts

In [11]:
def get_word_unigram_bigram_feature(text, vocabulary_sarcastic, vocabulary_regular, top_range):
    """!
    @brief Get the word unigram and bigram feature of a text.
    @param text (str): Text to be analyzed.
    @param vocabulary_sarcastic (dict): Vocabulary of sarcastic words.
    @param vocabulary_regular (dict): Vocabulary of regular words.
    @param top_range (int): Number of top words to be considered for the feature.
    @return (list): List of word unigram and bigram counts normalized by the total count of words in the text.
                    [count of sarcastic words, count of regular words, count of sarcastic bigrams, count of regular bigrams]
    """
    # Only consider the top words of the vocabulary
    vocabulary_sarcastic = dict(sorted(vocabulary_sarcastic.items(), key=lambda item: item[1], reverse=True)[:top_range])
    vocabulary_regular = dict(sorted(vocabulary_regular.items(), key=lambda item: item[1], reverse=True)[:top_range])
    # Get the word unigram and bigram counts
    word_unigram_bigram_counts = [0, 0, 0, 0]
    word_unigram_bigram_counts[0] = sum([text.count(word) for word in vocabulary_sarcastic.keys()])
    word_unigram_bigram_counts[1] = sum([text.count(word) for word in vocabulary_regular.keys()])
    word_unigram_bigram_counts[2] = sum([text.count(word) for word in vocabulary_sarcastic.keys() if len(word.split()) == 2])
    word_unigram_bigram_counts[3] = sum([text.count(word) for word in vocabulary_regular.keys() if len(word.split()) == 2])
    # Normalize the counts (ratio of word unigram and bigram count to total count)
    total_count = sum(word_unigram_bigram_counts)
    if total_count != 0:
        word_unigram_bigram_counts = [count / total_count for count in word_unigram_bigram_counts]
    return word_unigram_bigram_counts

In [12]:
def get_contextual_feature(text, sentiment_score, review_stars):
    """!
    @brief Get the contextual feature of a text.
    A sarcastic text may have a sentiment score that contradicts the review stars.
    @param text (str): Text to be analyzed.
    @param sentiment_score (float): Sentiment score of the text.
    @param review_stars (float): Review stars of the text.
    @return (float): Absolute difference between the sentiment score (normalized) and review stars.
    """
    
    # Normalize sentiment_score on a 0 to 5 scale (scale of review_stars)
    # Sentiment score is in the range [-1, 1]
    sentiment_score = (sentiment_score + 1) * 2.5
    diff = abs(sentiment_score - review_stars)
    return diff

In [44]:
def get_feature_vector(text, review_starts, vocabulary_sarcastic, vocabulary_regular, analyzer, constituency_parser):
    """!
    @brief Get the feature vector of a text.
    @param text (str): Text to be analyzed.
    @param review_starts (float): Review stars of the text.
    @param vocabulary_sarcastic (dict): Vocabulary of sarcastic set.
    @param vocabulary_regular (dict): Vocabulary of regular set.
    @param analyzer (SentimentIntensityAnalyzer): VADER sentiment analysis tool.
    @param constituency_parser (stanza.Pipeline): The Stanza pipeline use for the constituency parsing.
    @return (list): Feature vector of the text.
    """
    # Get features
    sentiment_score = get_sentiment_score_feature(text, analyzer)
    punctuation_counts = get_punctuation_feature(text)
    POS_counts = get_POS_feature(text, constituency_parser)
    word_unigram_bigram_counts = get_word_unigram_bigram_feature(text, vocabulary_sarcastic, vocabulary_regular, 100)
    contextual_feature = get_contextual_feature(text, sentiment_score, review_starts)

    # Concatenate features in a single vector
    feature_vector = [sentiment_score]
    feature_vector.extend(punctuation_counts)
    feature_vector.extend(POS_counts)
    feature_vector.extend(word_unigram_bigram_counts)
    feature_vector.append(contextual_feature)

    return feature_vector

In [42]:
def get_vocabularies_from_dataset(data, labels, vectorizer):
    """!
    @brief Get the vocabulary of sarcastic and regular texts from the dataset.
    @param data (list): List of texts.
    @param labels (list): List of labels.
    @param vectorizer (CountVectorizer): CountVectorizer object.
    @return (tuple): Vocabulary of sarcastic texts, vocabulary of regular texts.
    """
    sarcastic_sentences = list()
    regular_sentences = list()
    for i in range(len(data)):
        if labels[i] == 0:
            regular_sentences.append(data[i][0])
        else:
            sarcastic_sentences.append(data[i][0])
    vectorizer.fit(sarcastic_sentences)
    vocabulary_sarcastic = vectorizer.vocabulary_
    vectorizer.fit(regular_sentences)
    vocabulary_regular = vectorizer.vocabulary_
    return vocabulary_sarcastic, vocabulary_regular


### Build feature dataset

In [47]:
from tqdm import tqdm

In [39]:
analyzer = SentimentIntensityAnalyzer()
constituency_parser = stanza.Pipeline(lang='en', processors='tokenize,pos')
vectorizer = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 2), max_features=1000)

2023-12-01 11:14:13 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 18.7MB/s]                    
2023-12-01 11:14:13 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| pos       | combined_charlm |

2023-12-01 11:14:13 INFO: Using device: cpu
2023-12-01 11:14:13 INFO: Loading: tokenize
2023-12-01 11:14:13 INFO: Loading: pos
2023-12-01 11:14:13 INFO: Done loading processors!


In [45]:
vocabulary_sarcastic, vocabulary_regular = get_vocabularies_from_dataset(train_data, train_labels, vectorizer)

In [48]:
feature_vectors_train = list()
feature_vectors_val = list()
feature_vectors_test = list()
for i in tqdm(range(len(train_data))):
    feature_vectors_train.append(get_feature_vector(train_data[i][0], train_data[i][1], vocabulary_sarcastic, vocabulary_regular, analyzer, constituency_parser))
for i in tqdm(range(len(val_data))):
    feature_vectors_val.append(get_feature_vector(val_data[i][0], val_data[i][1], vocabulary_sarcastic, vocabulary_regular, analyzer, constituency_parser))
for i in tqdm(range(len(test_data))):
    feature_vectors_test.append(get_feature_vector(test_data[i][0], test_data[i][1], vocabulary_sarcastic, vocabulary_regular, analyzer, constituency_parser))


100%|██████████| 1003/1003 [10:19<00:00,  1.62it/s]
100%|██████████| 125/125 [01:14<00:00,  1.67it/s]
100%|██████████| 126/126 [01:24<00:00,  1.49it/s]


In [49]:
print(feature_vectors_train[0])

[0.9825, 0.7142857142857143, 0.0, 0.0, 0.2857142857142857, 0.14640198511166252, 0.10421836228287841, 0.05707196029776675, 0.062034739454094295, 0.54, 0.46, 0.0, 0.0, 0.04375000000000018]


In [50]:
# Save feature vectors to numpy files
np.save('feature_vectors_train.npy', feature_vectors_train)
np.save('feature_vectors_val.npy', feature_vectors_val)
np.save('feature_vectors_test.npy', feature_vectors_test)

### Perform SVD

In [52]:
# Read data from numpy files
feature_vectors_train_read = np.load('feature_vectors_train.npy')
feature_vectors_val_read = np.load('feature_vectors_val.npy')
feature_vectors_test_read = np.load('feature_vectors_test.npy')

In [53]:
from sklearn import svm
from sklearn.metrics import classification_report

clf = svm.SVC(kernel='linear')
clf.fit(feature_vectors_train_read, train_labels)

y_pred = clf.predict(feature_vectors_val_read)
print(classification_report(val_labels, y_pred))

y_pred = clf.predict(feature_vectors_test_read)
print(classification_report(test_labels, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.78      0.82        87
           1       0.60      0.74      0.66        38

    accuracy                           0.77       125
   macro avg       0.73      0.76      0.74       125
weighted avg       0.79      0.77      0.77       125

              precision    recall  f1-score   support

           0       0.87      0.86      0.86        91
           1       0.64      0.66      0.65        35

    accuracy                           0.80       126
   macro avg       0.75      0.76      0.75       126
weighted avg       0.80      0.80      0.80       126

