### Part 1: Data Cleaning

In [1]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import keras
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

2022-12-02 01:59:33.668501: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-12-02 01:59:33.810275: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-02 01:59:33.810296: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-12-02 01:59:34.571902: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-

In [2]:
# Read in data
data = pd.read_json("../Sarcasm_Headlines_Dataset_v2.json", lines=True, nrows=3000) #NOTE: Remove the nrows function to parse all data

In [3]:
# Remove duplicate headlines
data=data.drop(data[data['headline'].duplicated()].index,axis=0)
sarc_cnt = len(data.query('is_sarcastic==1'))
non_sarc_cnt = len(data.query('is_sarcastic==0'))

# Print out summary of sarcastic lines
print(f'There are {sarc_cnt} sarcastic headlines and {non_sarc_cnt} non-sarcastic headlines')

There are 1452 sarcastic headlines and 1547 non-sarcastic headlines


In [4]:
# Import stopwords from nltk
stwrds = set(stopwords.words('english'))
ps = PorterStemmer()

# Define a method to clean a given headline by lowercasing the string, removing spaces, and removing stopwords
def clean_headlines(headline):
    headline = headline.lower()
    headline_split = headline.split()
    cleaned_headline = []
    for word in headline_split:
        if word not in stwrds and word not in string.punctuation:
            cleaned_headline.append(word)
    cleaned_line = " ".join(cleaned_headline)
    return cleaned_line

### Part 2: Creating N-grams of Size 2

In [5]:
# Clean the headlines
data['cleaned'] = data['headline'].apply(clean_headlines)

# Create the N-grams (of size 2) for each headline
cv = CountVectorizer(ngram_range=(2,2))
res = cv.fit_transform(data.iloc[0:2].cleaned)
res = cv.fit_transform(data.cleaned)

features = pd.DataFrame(res.toarray(),columns=cv.get_feature_names_out())

### Part 3: Creating and Training Gaussian Naive Bayes model using N-grams of size 2

In [6]:
# Split the data
X_train, X_test, y_train, y_test= train_test_split(features, data.is_sarcastic, test_size = 0.33)

#Calling the Class
naive_bayes = GaussianNB()
 
#Fitting the data to the classifier
naive_bayes.fit(X_train , y_train)
 
#Predict on test data
y_predicted = naive_bayes.predict(X_test)

### Part 4: Display Model Metrics for N-grams of size 2

In [7]:
# Give classification report

print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.56      0.93      0.70       528
           1       0.70      0.18      0.29       462

    accuracy                           0.58       990
   macro avg       0.63      0.56      0.49       990
weighted avg       0.63      0.58      0.51       990



In [8]:
# Give the confusion matrix for the training dataset

#confusion_matrix(y_train, naive_bayes.predict(X_train))

In [9]:
# Give the confusion matrix for the testing dataset

confusion_matrix(y_test, y_predicted)

array([[492,  36],
       [379,  83]])

### Part 5: Creating N-grams of Size 3

In [10]:
# Clean the headlines
data['cleaned'] = data['headline'].apply(clean_headlines)

# Create the N-grams (of size 2) for each headline
cv = CountVectorizer(ngram_range=(3,3))
res = cv.fit_transform(data.iloc[0:2].cleaned)
res = cv.fit_transform(data.cleaned)

features = pd.DataFrame(res.toarray(),columns=cv.get_feature_names_out())

### Part 6: Creating and Training Gaussian Naive Bayes model using N-grams of size 3

In [11]:
# Split the data
X_train, X_test, y_train, y_test= train_test_split(features, data.is_sarcastic, test_size = 0.33)

#Calling the Class
naive_bayes = GaussianNB()
 
#Fitting the data to the classifier
naive_bayes.fit(X_train , y_train)
 
#Predict on test data
y_predicted = naive_bayes.predict(X_test)

### Part 7: Display Model Metrics for N-grams of size 3

In [12]:
# Give classification report

print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.54      1.00      0.70       531
           1       1.00      0.01      0.03       459

    accuracy                           0.54       990
   macro avg       0.77      0.51      0.36       990
weighted avg       0.75      0.54      0.39       990



In [13]:
# Give the confusion matrix for the training dataset

#confusion_matrix(y_train, naive_bayes.predict(X_train))

In [14]:
# Give the confusion matrix for the testing dataset

confusion_matrix(y_test, y_predicted)

array([[531,   0],
       [453,   6]])