In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import keras
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
import string

In [57]:
data = pd.read_json("./Sarcasm_Headlines_Dataset_v2.json",lines=True)

In [58]:
# remove duplicate headlines
data=data.drop(data[data['headline'].duplicated()].index,axis=0)
sarc_cnt = len(data.query('is_sarcastic==1'))
non_sarc_cnt = len(data.query('is_sarcastic==0'))

# Summary of sarcastic lines
print(f'There are {sarc_cnt} sarcastic headlines and {non_sarc_cnt} non-sarcastic headlines')

There are 13552 sarcastic headlines and 14951 non-sarcastic headlines


## Part 2: Data Processing/Cleaning

In [14]:
# import stopwords from nltk
stwrds = set(stopwords.words('english'))
ps = PorterStemmer()
# method to clean a given headline by lowercasing the string, removing spaces, and removing stopwords
def clean_headlines(headline):
    headline = headline.lower()
    headline_split = headline.split()
    cleaned_headline = []
    for word in headline_split:
        if word not in stwrds and word not in string.punctuation:
            cleaned_headline.append(ps.stem(word))
    cleaned_line = " ".join(cleaned_headline)
    return cleaned_line

In [15]:
data['cleaned'] = data['headline'].apply(clean_headlines)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2,2))

In [23]:
labels = data.is_sarcastic

In [18]:
res = cv.fit_transform(data.iloc[0:2].cleaned)
res = cv.fit_transform(data.cleaned)
res.shape

(28503, 148333)

In [19]:
df = pd.DataFrame(res.toarray(),columns=cv.get_feature_names_out())

In [88]:
df

Unnamed: 0,00 probabl,000 000,000 100,000 115,000 15,000 acr,000 actual,000 adopt,000 airlin,000 american,...,zuckerberg regret,zuckerberg sell,zuckerberg senat,zuckerberg tout,zuckerberg trump,zuckerberg watch,zuckerberg wish,zuckerberg you,zz top,ünite stäte
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28500,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [22]:
model = LogisticRegression(solver='liblinear', random_state=0)

In [24]:
 test_split = train_test_split(df,labels, test_size=.3)

In [32]:
feature_train, feature_test, labels_train, labels_test= test_split

In [33]:
model.fit(feature_train,labels_train)

In [37]:
model.predict_proba(feature_train)

array([[0.81269162, 0.18730838],
       [0.21131565, 0.78868435],
       [0.83364155, 0.16635845],
       ...,
       [0.73603691, 0.26396309],
       [0.78623822, 0.21376178],
       [0.71691896, 0.28308104]])

In [38]:
confusion_matrix(labels_train, model.predict(feature_train))

array([[10454,     5],
       [    8,  9485]])

In [39]:
confusion_matrix(labels_test, model.predict(feature_test))

array([[4063,  429],
       [2412, 1647]])

In [41]:
print(classification_report(labels_test, model.predict(feature_test)))

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.41      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.64      8551



In [42]:
model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)

In [43]:
model.fit(feature_train,labels_train)

In [44]:
print(classification_report(labels_test, model.predict(feature_test)))

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.41      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.65      8551



In [51]:
print ([i for i in range(2,15,2)])

[2, 4, 6, 8, 10, 12, 14]


In [52]:
from tqdm import tqdm

In [53]:
for i in tqdm (range (2,15,2)):
    if i in[0,1,10]: continue
    model = LogisticRegression(solver='liblinear', C=i, random_state=0)
    model.fit(feature_train,labels_train)
    print("C = ", i)
    print(classification_report(labels_test, model.predict(feature_test)))

  0%|                                                     | 0/7 [00:00<?, ?it/s]

C =  2


 14%|██████▍                                      | 1/7 [00:51<05:08, 51.35s/it]

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.41      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.65      8551

C =  4


 29%|████████████▊                                | 2/7 [01:40<04:09, 49.94s/it]

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.42      0.55      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.65      8551

C =  6


 43%|███████████████████▎                         | 3/7 [02:31<03:22, 50.70s/it]

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.42      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.65      8551

C =  8


 57%|█████████████████████████▋                   | 4/7 [03:20<02:29, 49.95s/it]

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.41      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.65      8551

C =  12


 86%|██████████████████████████████████████▌      | 6/7 [04:09<00:36, 36.51s/it]

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.41      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.64      8551

C =  14


100%|█████████████████████████████████████████████| 7/7 [04:58<00:00, 42.67s/it]

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.41      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.65      0.64      8551
weighted avg       0.71      0.67      0.64      8551






## Part 3: Data Processing/Cleaning 3


In [60]:
# import stopwords from nltk
stwrds = set(stopwords.words('english'))
ps = PorterStemmer()
# method to clean a given headline by lowercasing the string, removing spaces, and removing stopwords
def clean_headlines(headline):
    headline = headline.lower()
    headline_split = headline.split()
    cleaned_headline = []
    for word in headline_split:
        if word not in stwrds and word not in string.punctuation:
            cleaned_headline.append(ps.stem(word))
    cleaned_line = " ".join(cleaned_headline)
    return cleaned_line

In [61]:
data['cleaned'] = data['headline'].apply(clean_headlines)

In [62]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(3,3))

In [63]:
labels = data.is_sarcastic

In [64]:
res = cv.fit_transform(data.iloc[0:2].cleaned)
res = cv.fit_transform(data.cleaned)
res.shape

(28503, 150155)

In [65]:
df = pd.DataFrame(res.toarray(),columns=cv.get_feature_names_out())

In [66]:
df

Unnamed: 0,00 probabl upstand,000 000 115,000 000 decis,000 000 robot,000 100 000,000 115 000,000 15 000,000 acr area,000 acr feder,000 acr nation,...,zuckerberg regret reject,zuckerberg sell person,zuckerberg senat testimoni,zuckerberg tout complet,zuckerberg trump time,zuckerberg watch engag,zuckerberg wish old,zuckerberg you grate,zz top reveal,ünite stäte toughen
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28498,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28499,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28500,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28501,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

In [68]:
model = LogisticRegression(solver='liblinear', random_state=0)

In [None]:
 test_split = train_test_split(df,labels, test_size=.3)

In [None]:
feature_train, feature_test, labels_train, labels_test= test_split

In [33]:
model.fit(feature_train,labels_train)

In [37]:
model.predict_proba(feature_train)

array([[0.81269162, 0.18730838],
       [0.21131565, 0.78868435],
       [0.83364155, 0.16635845],
       ...,
       [0.73603691, 0.26396309],
       [0.78623822, 0.21376178],
       [0.71691896, 0.28308104]])

In [38]:
confusion_matrix(labels_train, model.predict(feature_train))

array([[10454,     5],
       [    8,  9485]])

In [39]:
confusion_matrix(labels_test, model.predict(feature_test))

array([[4063,  429],
       [2412, 1647]])

In [41]:
print(classification_report(labels_test, model.predict(feature_test)))

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.41      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.64      8551



In [42]:
model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)

In [43]:
model.fit(feature_train,labels_train)

In [44]:
print(classification_report(labels_test, model.predict(feature_test)))

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.41      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.65      8551



In [51]:
print ([i for i in range(2,15,2)])

[2, 4, 6, 8, 10, 12, 14]


In [52]:
from tqdm import tqdm

In [53]:
for i in tqdm (range (2,15,2)):
    if i in[0,1,10]: continue
    model = LogisticRegression(solver='liblinear', C=i, random_state=0)
    model.fit(feature_train,labels_train)
    print("C = ", i)
    print(classification_report(labels_test, model.predict(feature_test)))

  0%|                                                     | 0/7 [00:00<?, ?it/s]

C =  2


 14%|██████▍                                      | 1/7 [00:51<05:08, 51.35s/it]

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.41      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.65      8551

C =  4


 29%|████████████▊                                | 2/7 [01:40<04:09, 49.94s/it]

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.42      0.55      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.65      8551

C =  6


 43%|███████████████████▎                         | 3/7 [02:31<03:22, 50.70s/it]

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.42      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.65      8551

C =  8


 57%|█████████████████████████▋                   | 4/7 [03:20<02:29, 49.95s/it]

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.41      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.65      8551

C =  12


 86%|██████████████████████████████████████▌      | 6/7 [04:09<00:36, 36.51s/it]

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.41      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.66      0.64      8551
weighted avg       0.71      0.67      0.64      8551

C =  14


100%|█████████████████████████████████████████████| 7/7 [04:58<00:00, 42.67s/it]

              precision    recall  f1-score   support

           0       0.63      0.90      0.74      4492
           1       0.79      0.41      0.54      4059

    accuracy                           0.67      8551
   macro avg       0.71      0.65      0.64      8551
weighted avg       0.71      0.67      0.64      8551






In [59]:
## Part 2: Data Processing/Cleaning

# import stopwords from nltk
stwrds = set(stopwords.words('english'))
ps = PorterStemmer()
# method to clean a given headline by lowercasing the string, removing spaces, and removing stopwords
def clean_headlines(headline):
    headline = headline.lower()
    headline_split = headline.split()
    cleaned_headline = []
    for word in headline_split:
        if word not in stwrds and word not in string.punctuation:
            cleaned_headline.append(ps.stem(word))
    cleaned_line = " ".join(cleaned_headline)
    return cleaned_line

data['cleaned'] = data['headline'].apply(clean_headlines)

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(2,2))

labels = data.is_sarcastic

res = cv.fit_transform(data.iloc[0:2].cleaned)
res = cv.fit_transform(data.cleaned)
res.shape

df = pd.DataFrame(res.toarray(),columns=cv.get_feature_names_out())

df

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

model = LogisticRegression(solver='liblinear', random_state=0)

 test_split = train_test_split(df,labels, test_size=.3)

feature_train, feature_test, labels_train, labels_test= test_split

model.fit(feature_train,labels_train)

model.predict_proba(feature_train)

confusion_matrix(labels_train, model.predict(feature_train))

confusion_matrix(labels_test, model.predict(feature_test))

print(classification_report(labels_test, model.predict(feature_test)))

model = LogisticRegression(solver='liblinear', C=10.0, random_state=0)

model.fit(feature_train,labels_train)

print(classification_report(labels_test, model.predict(feature_test)))

print ([i for i in range(2,15,2)])

from tqdm import tqdm

for i in tqdm (range (2,15,2)):
    if i in[0,1,10]: continue
    model = LogisticRegression(solver='liblinear', C=i, random_state=0)
    model.fit(feature_train,labels_train)
    print("C = ", i)
    print(classification_report(labels_test, model.predict(feature_test)))

IndentationError: unexpected indent (1524009363.py, line 37)