In [174]:
import os
import numpy as np

### Read data from Sarcasm Amazon Review dataset

In [97]:
import pandas as pd

In [104]:
amazon_dataset_path = os.path.join('..', '..', 'datasets', 'amazon_combined.csv')
dataset = pd.read_csv(amazon_dataset_path)
dataset.head()

Unnamed: 0,stars,title,date,author,product,review,is_sarcastic
0,1.0,"Listening to this ""Hurt"" me!","November 8, 2007","MomKKC ""momkkc""",The Sun Also Rises (Audio CD),William Hurt cannot read. At all. The cadenc...,1
1,1.0,"40% price hike, hmm","April 15, 2010",M. Barnhart,"Heineken BT06 BeerTender Tubes, Pack of 6 (Kit...","As another reviewer noted, these used to be 10...",1
2,5.0,Don't Mess With the Lupine Trinity!!!,"June 2, 2010",Jake &#34;The Wolfman&#34; Sanchez,The Mountain Three Wolf Moon Short Sleeve Tee ...,I've read several reviews from people who have...,1
3,1.0,IT'S A BLENDER!,"June 17, 2010",S. Cashdollar,Margaritaville DM1000 Frozen Concoction Maker ...,If you pay $250 for this blender you need your...,1
4,1.0,Another movie to ignore....,"April 24, 2010","Kody ""ParisHiltonFan""",Valentine's Day (DVD),A perfect date movie: you'll miss absolutely n...,1


In [105]:
# Suffle the dataset (random_state=1 for reproducibility)
dataset = dataset.sample(frac=1, random_state=1).reset_index(drop=True)
dataset.head()

Unnamed: 0,stars,title,date,author,product,review,is_sarcastic
0,1.0,Meyers' books are GREAT....,"April 4, 2008",Smarmstress,"New Moon (The Twilight Saga, Book 2) (Hardcover)",...if you're an aspiring writer in need of a p...,1
1,5.0,The correct 5-quart bowl numbers,"January 2, 2007",Steven Quigley,KitchenAid K5ASBP Bowl for 5-Quart Professiona...,"If you do a lot of cooking or baking, an extra...",0
2,1.0,Please. Everyone. Stop! Please stop wearing...,"May 26, 2009","The Wild Gunman ""man on the run""","crocs Classic Sandal (3.5"" and 5.25"" disks)",Let's get one thing straight. These things ar...,1
3,1.0,Sansa m250 Promises but fails to deliver,"June 16, 2008",Hemanth Kumar,SanDisk Sansa m250 2 GB MP3 Player (Black) (El...,I bought it and it worked right for some time....,0
4,4.0,What can I say - it's Bacon!,"February 24, 2009","W. D. Hairston ""Huh? What?""",Bacon Bandaid Bandages (Toy),What can one possibly say that isn't self expl...,1


In [112]:
# Split the dataset into train, validation and test sets
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset = dataset[:train_size]
val_dataset = dataset[train_size:train_size+val_size].reset_index(drop=True)
test_dataset = dataset[train_size+val_size:].reset_index(drop=True)

# Sizes of the sub-datasets
print("Subsets sizes:")
print(len(train_dataset), len(val_dataset), len(test_dataset))
val_dataset.head()

#Labels rates (0: Regular, 1: Ironic)
print("Labels rates:")
print(train_dataset['is_sarcastic'].value_counts())
print(val_dataset['is_sarcastic'].value_counts())
print(test_dataset['is_sarcastic'].value_counts())

Subsets sizes:
1003 125 126
Labels rates:
is_sarcastic
0    643
1    360
Name: count, dtype: int64
is_sarcastic
0    84
1    41
Name: count, dtype: int64
is_sarcastic
0    90
1    36
Name: count, dtype: int64


### Features building functions

In [127]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
import stanza
import spacy
from tqdm import tqdm
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer
import string
from rake_nltk import Rake

In [114]:
def get_sentiment_score_feature(text, analyzer):
    """!
    @brief Get the sentiment score feature of a text using VADER sentiment analysis tool.
    @param text (str): Text to be analyzed.
    @param analyzer (SentimentIntensityAnalyzer): VADER sentiment analysis tool.
    @return (dict): Sentiment score feature of the text.
    """
    return analyzer.polarity_scores(text)["compound"]

In [115]:
def get_punctuation_feature(text):
    """!
    @brief Get the punctuation feature of a text.
    @param text (str): Text to be analyzed.
    @return (list): List of punctuation counts normalized by the total count of punctuation in the text.
                    [count of '.', count of '!', count of '?', count of ',']
    """
    punctuations = ['.', '!', '?', ',']
    punctuations_counts = list()
    total_count = 0
    for punctuation in punctuations:
        count = text.count(punctuation)
        punctuations_counts.append(count)
        total_count += count
    # Normalize the counts (ratio of punctuation count to total count)
    if total_count != 0:
        punctuations_counts = [count / total_count for count in punctuations_counts]
    return punctuations_counts

In [116]:
def get_POS_feature(text, pipeline):
    """!
    @brief Get the POS feature of a text.
    @param text (str): Text to be analyzed.
    @param pipeline (stanza.Pipeline): The Stanza pipeline use for the constituency parsing.
    @return (list): List of POS tag counts normalized by the total count of POS tags in the text.
                    [Noun count, Verb count, Adjective count, Adverb count]
    """
    doc = pipeline(text)
    POS_tags = ['NOUN', 'VERB', 'ADJ', 'ADV']
    POS_counts = [0, 0, 0, 0]
    total_count = 0
    for sentence in doc.sentences:
        for word in sentence.words:
            total_count += 1
            if word.upos in POS_tags:
                POS_counts[POS_tags.index(word.upos)] += 1
    # Normalize the counts (ratio of POS tag count to total count)
    if total_count != 0:
        POS_counts = [count / total_count for count in POS_counts]

    return POS_counts

In [117]:
def get_word_unigram_bigram_feature(text, vocabulary_sarcastic, vocabulary_regular, top_range):
    """!
    @brief Get the word unigram and bigram feature of a text.
    @param text (str): Text to be analyzed.
    @param vocabulary_sarcastic (dict): Vocabulary of sarcastic words.
    @param vocabulary_regular (dict): Vocabulary of regular words.
    @param top_range (int): Number of top words to be considered for the feature.
    @return (list): List of word unigram and bigram counts normalized by the total count of words in the text.
                    [count of sarcastic words, count of regular words, count of sarcastic bigrams, count of regular bigrams]
    """
    # Only consider the top words of the vocabulary
    vocabulary_sarcastic = dict(sorted(vocabulary_sarcastic.items(), key=lambda item: item[1], reverse=True)[:top_range])
    vocabulary_regular = dict(sorted(vocabulary_regular.items(), key=lambda item: item[1], reverse=True)[:top_range])
    # Get the word unigram and bigram counts
    word_unigram_bigram_counts = [0, 0, 0, 0]
    word_unigram_bigram_counts[0] = sum([text.count(word) for word in vocabulary_sarcastic.keys()])
    word_unigram_bigram_counts[1] = sum([text.count(word) for word in vocabulary_regular.keys()])
    word_unigram_bigram_counts[2] = sum([text.count(word) for word in vocabulary_sarcastic.keys() if len(word.split()) == 2])
    word_unigram_bigram_counts[3] = sum([text.count(word) for word in vocabulary_regular.keys() if len(word.split()) == 2])
    # Normalize the counts (ratio of word unigram and bigram count to total count)
    total_count = sum(word_unigram_bigram_counts)
    if total_count != 0:
        word_unigram_bigram_counts = [count / total_count for count in word_unigram_bigram_counts]
    return word_unigram_bigram_counts

In [118]:
def get_contextual_feature(text, sentiment_score, review_stars):
    """!
    @brief Get the contextual feature of a text.
    A sarcastic text may have a sentiment score that contradicts the review stars.
    @param text (str): Text to be analyzed.
    @param sentiment_score (float): Sentiment score of the text.
    @param review_stars (float): Review stars of the text.
    @return (float): Absolute difference between the sentiment score (normalized) and review stars.
    """
    
    # Normalize sentiment_score on a 0 to 5 scale (scale of review_stars)
    # Sentiment score is in the range [-1, 1]
    sentiment_score = (sentiment_score + 1) * 2.5
    diff = abs(sentiment_score - review_stars)
    return diff

In [135]:
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [132]:
def get_dict_from_keyword_extracted(keyword_extracted):
    vectorizer = CountVectorizer(lowercase=True, tokenizer=LemmaTokenizer())
    matrix = vectorizer.fit_transform(keyword_extracted)
    counts_dict = dict()
    i = 0
    for word in vectorizer.get_feature_names_out():
        counts_dict[word] = matrix.sum(axis=0).tolist()[0][i]
        i += 1
    counts_dict = dict(sorted(counts_dict.items(), key=lambda item: item[1], reverse=True))
    # Remove punctuation
    for punctuation in string.punctuation:
        if punctuation in counts_dict.keys():
            counts_dict.pop(punctuation)
    return counts_dict

In [160]:
def get_similarity_feature(review, title, similarity_pipeline, keyword_extractor):
    """!
    @brief Get the similarity feature of a review and a title.
    @param review (str): Review to be analyzed.
    @param title (str): Title to be analyzed.
    @param similarity_pipeline (spacy.lang.en.English): The Spacy pipeline use for the similarity analysis.
    @param keyword_extractor (Rake): The Rake keyword extractor.
    @return (float): Average similarity between the review and the title.
    """
    # Extract keywords from title and review
    keyword_extractor.extract_keywords_from_text(title)
    dict_title = get_dict_from_keyword_extracted(keyword_extractor.get_ranked_phrases())
    keyword_extractor.extract_keywords_from_text(review)
    dict_review = get_dict_from_keyword_extracted(keyword_extractor.get_ranked_phrases())
    
    # Sort keywords by frequency in the review and only keep the top ones (same number as the title)
    dict_review = list(sorted(dict_review.items(), key=lambda item: item[1], reverse=True))
    dict_review = dict_review[:len(dict_title)]

    # Similarities between top keywords in title and review
    similarities = list()
    for word_title in dict_title.keys():
        for word_review in dict_review:
                similarities.append(similarity_pipeline(word_title).similarity(similarity_pipeline(word_review[0])))
    
    average = sum(similarities) / len(similarities)
    return average

Just a little experiment to verify the relevance of this feature

In [161]:
# Get the average similary feature for sarcastic and regular dataset
similarity_pipeline = spacy.load("en_core_web_lg")
keyword_extractor = Rake()

average_similarity_sarcastic = 0
average_similarity_regular = 0
for i in tqdm(range(len(train_dataset))):
    if train_dataset['is_sarcastic'][i] == 1:
        average_similarity_sarcastic += get_similarity_feature(train_dataset['review'][i], train_dataset['product'][i], similarity_pipeline, keyword_extractor)
    else:
        average_similarity_regular += get_similarity_feature(train_dataset['review'][i], train_dataset['product'][i], similarity_pipeline, keyword_extractor)
average_similarity_sarcastic /= len(train_dataset[train_dataset['is_sarcastic'] == 1])
average_similarity_regular /= len(train_dataset[train_dataset['is_sarcastic'] == 0])
print("Average similarity feature for sarcastic dataset:", average_similarity_sarcastic)
print("Average similarity feature for regular dataset:", average_similarity_regular)

  similarities.append(similarity_pipeline(word_title).similarity(similarity_pipeline(word_review[0])))
100%|██████████| 1003/1003 [04:44<00:00,  3.53it/s]

Average similarity feature for sarcastic dataset: 0.14598271274568028
Average similarity feature for regular dataset: 0.15680566635173698





In [191]:
def get_feature_vector(review, vocabulary_sarcastic, vocabulary_regular, analyzer, constituency_parser, keyword_extractor, similarity_pipeline):
    """!
    @brief Get the feature vector of a text.
    @param text (str): Text to be analyzed.
    @param review_starts (float): Review stars of the text.
    @param vocabulary_sarcastic (dict): Vocabulary of sarcastic set.
    @param vocabulary_regular (dict): Vocabulary of regular set.
    @param analyzer (SentimentIntensityAnalyzer): VADER sentiment analysis tool.
    @param constituency_parser (stanza.Pipeline): The Stanza pipeline use for the constituency parsing.
    @param keyword_extractor (Rake): The Rake keyword extractor.
    @param similarity_pipeline (spacy.lang.en.English): The Spacy pipeline use for the similarity analysis.
    @return (list): Feature vector of the text.
    [sentiment score, punctuation counts, POS counts, word unigram and bigram counts, contextual feature, similarity feature, review stars]
    """
    # Get features
    sentiment_score = get_sentiment_score_feature(review['review'], analyzer)
    punctuation_counts = get_punctuation_feature(review['review'])
    POS_counts = get_POS_feature(review['review'], constituency_parser)
    word_unigram_bigram_counts = get_word_unigram_bigram_feature(review['review'], vocabulary_sarcastic, vocabulary_regular, 1000)
    contextual_feature = get_contextual_feature(review['review'], sentiment_score, review['stars'])
    similarity_feature = get_similarity_feature(review['review'], review['product'], similarity_pipeline, keyword_extractor)

    # Concatenate features in a single vector
    feature_vector = [sentiment_score]
    feature_vector.extend(punctuation_counts)
    feature_vector.extend(POS_counts)
    feature_vector.extend(word_unigram_bigram_counts)
    feature_vector.append(contextual_feature)
    feature_vector.append(similarity_feature)
    feature_vector.append(review['stars'])

    return feature_vector

In [175]:
def get_vocabularies_from_dataset(dataset, vectorizer):
    """!
    @brief Get the vocabulary of sarcastic and regular texts from the dataset.
    @param data (list): List of texts.
    @param labels (list): List of labels.
    @param vectorizer (CountVectorizer): CountVectorizer object.
    @return (tuple): Vocabulary of sarcastic texts, vocabulary of regular texts.
    """
    sarcastic_sentences = list()
    regular_sentences = list()
    for i in range(len(dataset)):
        if dataset['is_sarcastic'][i] == 1:
            sarcastic_sentences.append(dataset['review'][i])
        else:
            regular_sentences.append(dataset['review'][i])
    vectorizer.fit(sarcastic_sentences)
    vocabulary_sarcastic = vectorizer.vocabulary_
    vectorizer.fit(regular_sentences)
    vocabulary_regular = vectorizer.vocabulary_
    return vocabulary_sarcastic, vocabulary_regular


### Build feature dataset

In [188]:
analyzer = SentimentIntensityAnalyzer()
constituency_parser = stanza.Pipeline(lang='en', processors='tokenize,pos')
vectorizer = CountVectorizer(lowercase=True, stop_words='english', ngram_range=(1, 2))
similarity_pipeline = spacy.load("en_core_web_lg")
keyword_extractor = Rake()

2023-12-01 17:29:02 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json: 367kB [00:00, 46.5MB/s]                    
2023-12-01 17:29:03 INFO: Loading these models for language: en (English):
| Processor | Package         |
-------------------------------
| tokenize  | combined        |
| pos       | combined_charlm |

2023-12-01 17:29:03 INFO: Using device: cpu
2023-12-01 17:29:03 INFO: Loading: tokenize
2023-12-01 17:29:03 INFO: Loading: pos
2023-12-01 17:29:03 INFO: Done loading processors!


In [189]:
vocabulary_sarcastic, vocabulary_regular = get_vocabularies_from_dataset(train_dataset, vectorizer)

In [193]:
get_feature_vector(train_dataset.iloc[0], vocabulary_sarcastic, vocabulary_regular, analyzer, constituency_parser, keyword_extractor, similarity_pipeline)

[0.9995,
 0.4,
 0.044444444444444446,
 0.022222222222222223,
 0.5333333333333333,
 0.11348464619492657,
 0.102803738317757,
 0.08945260347129506,
 0.06275033377837116,
 0.66,
 0.26,
 0.06,
 0.02,
 3.9987500000000002,
 0.07309403448939084,
 1.0]

In [194]:
feature_vectors_train = list()
feature_vectors_val = list()
feature_vectors_test = list()
for i in tqdm(range(len(train_dataset))):
    feature_vectors_train.append(get_feature_vector(train_dataset.iloc[i], vocabulary_sarcastic, vocabulary_regular, analyzer, constituency_parser, keyword_extractor, similarity_pipeline))

for i in tqdm(range(len(val_dataset))):
    feature_vectors_val.append(get_feature_vector(val_dataset.iloc[i], vocabulary_sarcastic, vocabulary_regular, analyzer, constituency_parser, keyword_extractor, similarity_pipeline))

for i in tqdm(range(len(test_dataset))):
    feature_vectors_test.append(get_feature_vector(test_dataset.iloc[i], vocabulary_sarcastic, vocabulary_regular, analyzer, constituency_parser, keyword_extractor, similarity_pipeline))

  similarities.append(similarity_pipeline(word_title).similarity(similarity_pipeline(word_review[0])))
100%|██████████| 1003/1003 [15:23<00:00,  1.09it/s]
100%|██████████| 125/125 [01:54<00:00,  1.09it/s]
100%|██████████| 126/126 [01:58<00:00,  1.06it/s]


In [195]:
print(feature_vectors_train[0])

[0.9995, 0.4, 0.044444444444444446, 0.022222222222222223, 0.5333333333333333, 0.11348464619492657, 0.102803738317757, 0.08945260347129506, 0.06275033377837116, 0.66, 0.26, 0.06, 0.02, 3.9987500000000002, 0.07309403448939084, 1.0]


In [196]:
# Save feature vectors to numpy files
np.save('feature_vectors_train.npy', feature_vectors_train)
np.save('feature_vectors_val.npy', feature_vectors_val)
np.save('feature_vectors_test.npy', feature_vectors_test)

### Perform SVD

In [197]:
# Read data from numpy files
feature_vectors_train_read = np.load('feature_vectors_train.npy')
feature_vectors_val_read = np.load('feature_vectors_val.npy')
feature_vectors_test_read = np.load('feature_vectors_test.npy')

In [198]:
from sklearn import svm
from sklearn.metrics import classification_report

clf = svm.SVC(kernel='linear')
clf.fit(feature_vectors_train_read, train_dataset['is_sarcastic'])

y_pred = clf.predict(feature_vectors_val_read)
print(classification_report(val_dataset['is_sarcastic'], y_pred))

y_pred = clf.predict(feature_vectors_test_read)
print(classification_report(test_dataset['is_sarcastic'], y_pred))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89        84
           1       0.78      0.78      0.78        41

    accuracy                           0.86       125
   macro avg       0.84      0.84      0.84       125
weighted avg       0.86      0.86      0.86       125

              precision    recall  f1-score   support

           0       0.89      0.88      0.88        90
           1       0.70      0.72      0.71        36

    accuracy                           0.83       126
   macro avg       0.80      0.80      0.80       126
weighted avg       0.83      0.83      0.83       126



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
