Sarcasm Detection Method 2

In [None]:
import gensim
from nltk.corpus import stopwords
import numpy as np
import scipy as sp
import re
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the data and
# model paths used in the following cells all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location of the binary word2vec file on the mounted Drive.
WORD2VEC_PATH = '/content/drive/MyDrive/SarcasmProjectNLP/GoogleNews-vectors-negative300-SLIM.bin'

# Load the pretrained embeddings as KeyedVectors; `bigmodel[word]` yields the
# vector for a word and `word in bigmodel` tests vocabulary membership.
bigmodel = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

In [None]:
notsarcasm = []      # raw non-sarcastic comments, one per line of the training file
notsarcasmtoks = []  # lowercased whitespace tokens of each non-sarcastic comment


# Reads in the non sarcasm train file and tokenizes it (with or without parent
# comment -- swap in the commented-out `with` line to use the other file).
# The `with` block guarantees the handle is closed even if a line fails to process.
with open('/content/drive/MyDrive/SarcasmProjectNLP/train_not_sarcasm.txt') as f:
# with open('/content/drive/MyDrive/SarcasmProjectNLP/train_not_npsarcasm.txt') as f:
    for line in f:
        line = line.rstrip()
        notsarcasm.append(line)
        # Replace a run of digits delimited by spaces / line edges with one space,
        # so standalone numbers do not become tokens.
        line = re.sub(r"(^| )[0-9]+($| )", r" ", line)
        notsarcasmtoks.append([t.lower() for t in line.split()])

In [None]:
# One 300-dimensional vector per non sarcastic comment: the sum of the
# embeddings of every token that is present in the word2vec vocabulary.
notsarcasmvectors = []

for tokens in notsarcasmtoks:
    comment_vec = np.zeros(300)
    for tok in tokens:
        key = tok.lower()
        if key not in bigmodel:
            continue  # out-of-vocabulary tokens contribute nothing to the sum
        comment_vec = comment_vec + bigmodel[key]
    notsarcasmvectors.append(comment_vec)

In [None]:
# k-means over the non sarcastic comment vectors.
# Cluster counts ranging from 30 to 60 (30, 40, 50, 60) were tried; 50 is the active setting.
NOTSARCASM_N_CLUSTERS = 50

kmnews = KMeans(n_clusters=NOTSARCASM_N_CLUSTERS, random_state=0)

# Fit the model and keep the cluster index assigned to each comment vector.
notsarcasmclusters = kmnews.fit_predict(notsarcasmvectors)

In [None]:
sarcasm = []      # raw sarcastic comments, one per line of the training file
sarcasmtoks = []  # lowercased whitespace tokens of each sarcastic comment

# Reads in the sarcasm train file and tokenizes it (with or without parent
# comment -- swap in the commented-out `with` line to use the other file).
# The `with` block guarantees the handle is closed even if a line fails to process.
with open('/content/drive/MyDrive/SarcasmProjectNLP/train_sarcasm.txt') as f:
# with open('/content/drive/MyDrive/SarcasmProjectNLP/train_npsarcasm.txt') as f:
    for line in f:
        line = line.rstrip()
        sarcasm.append(line)
        # Replace a run of digits delimited by spaces / line edges with one space,
        # so standalone numbers do not become tokens.
        line = re.sub(r"(^| )[0-9]+($| )", r" ", line)
        sarcasmtoks.append([t.lower() for t in line.split()])

# One 300-dimensional vector per sarcastic comment: the sum of the embeddings
# of every token that is present in the word2vec vocabulary.
sarcasmvectors = []

for h in sarcasmtoks:
    totvec = np.zeros(300)
    for w in h:
        # Tokens were already lowercased when sarcasmtoks was built above,
        # so they can be looked up directly.
        if w in bigmodel:
            totvec = totvec + bigmodel[w]
    sarcasmvectors.append(totvec)

In [None]:
# k-means over the sarcastic comment vectors, using 50 clusters and a fixed seed.
SARCASM_N_CLUSTERS = 50

kmclick = KMeans(n_clusters=SARCASM_N_CLUSTERS, random_state=0)

# Fit the model and keep the cluster index assigned to each comment vector.
sarcasmclusters = kmclick.fit_predict(sarcasmvectors)

In [None]:
testtargets = []  # gold label of each test comment (int parsed from the line's first character)
testvectors = []  # summed word2vec vector of each test comment

# Reads in the test file and fills the lists above (with or without parent
# comment -- swap in the commented-out `with` line to use the other file).
# Each line is expected to be: one label character, one separator character,
# then the comment text. The `with` block closes the file when done.
with open('/content/drive/MyDrive/SarcasmProjectNLP/sarcasm_test.txt') as f:
# with open('/content/drive/MyDrive/SarcasmProjectNLP/npsarcasm_test.txt') as f:
    for line in f:
        line = line.rstrip()
        if not line:
            continue  # a blank line has no label; line[0] would raise IndexError

        # Parse the label BEFORE stripping numbers. The number-stripping regex
        # also matches a leading label that is followed by a space or by the
        # end of the line (e.g. a line holding only "0"), which previously
        # erased the label and forced a guessed label of 1.
        # A non-digit first character raises ValueError rather than guessing.
        testtargets.append(int(line[0]))

        # Drop the label and its separator, then remove standalone numbers
        # from the comment text only.
        text = re.sub(r"(^| )[0-9]+($| )", r" ", line[2:])

        totvec = np.zeros(300)
        for h in text.split():
            h = h.lower()
            if h in bigmodel:
                totvec = totvec + bigmodel[h]
        testvectors.append(totvec)

# Pairwise distances between every test vector and every training vector.
# NOTE: cdist is called without a `metric`, so SciPy's default (Euclidean) is
# used -- this is not cosine distance; pass metric='cosine' if that is wanted.
sarcasmdistances = cdist(testvectors, sarcasmvectors)
notsarcasmdistances = cdist(testvectors, notsarcasmvectors)

# Distance from each test comment to its nearest neighbour in each class.
sarcasmmins = sarcasmdistances.min(axis=1)
notsarcasmmins = notsarcasmdistances.min(axis=1)

# Nearest-neighbour prediction: a SMALLER distance means the vectors are closer.
# Predict 0 (not sarcastic) only when the nearest non-sarcastic comment is
# strictly closer than the nearest sarcastic one; ties are labelled 1.
predictedknn = np.where(notsarcasmmins < sarcasmmins, 0, 1).tolist()

print(metrics.classification_report(testtargets, predictedknn))

              precision    recall  f1-score   support

           0       0.53      0.54      0.54      5001
           1       0.53      0.52      0.52      5000

    accuracy                           0.53     10001
   macro avg       0.53      0.53      0.53     10001
weighted avg       0.53      0.53      0.53     10001



In [None]:
# Training labels: 1.0 for every sarcastic vector followed by 0.0 for every
# non sarcastic vector, in the same order in which allvectors is assembled.
alltargets = np.concatenate([
    np.ones(len(sarcasmvectors)),
    np.zeros(len(notsarcasmvectors)),
])

allvectors = sarcasmvectors + notsarcasmvectors

expected = testtargets

# The three classifiers compared on the dataset, in reporting order:
# Naive Bayes, LinearSVM and Logistic Regression.
model = GaussianNB()
lsvc = LinearSVC(dual=False, max_iter = 1000)
logreg = LogisticRegression(max_iter = 1000)

# Fit each classifier on the training data, predict on the test set and print
# its classification report.
for clf in (model, lsvc, logreg):
    clf.fit(allvectors, alltargets)
    predicted = clf.predict(testvectors)
    print(metrics.classification_report(expected, predicted))

              precision    recall  f1-score   support

           0       0.54      0.09      0.16      5001
           1       0.50      0.92      0.65      5000

    accuracy                           0.51     10001
   macro avg       0.52      0.51      0.41     10001
weighted avg       0.52      0.51      0.41     10001

              precision    recall  f1-score   support

           0       0.59      0.64      0.61      5001
           1       0.61      0.56      0.58      5000

    accuracy                           0.60     10001
   macro avg       0.60      0.60      0.60     10001
weighted avg       0.60      0.60      0.60     10001

              precision    recall  f1-score   support

           0       0.59      0.64      0.61      5001
           1       0.61      0.56      0.58      5000

    accuracy                           0.60     10001
   macro avg       0.60      0.60      0.60     10001
weighted avg       0.60      0.60      0.60     10001

