In [1]:
import pandas as pd
import re 
import nltk
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense,Embedding,Dropout,Input,Flatten,LSTM
from keras.models import Sequential
from sklearn.metrics import classification_report,confusion_matrix
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by the
# libraries imported above — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
from sklearn import metrics
import numpy as np


In [3]:
# Load the labelled training set and preview its first rows.
# NOTE(review): the absolute '/content/...' path ties the notebook to one
# environment (looks like Colab — confirm); a configurable data directory
# would make it portable.
train_data=pd.read_csv('/content/Train-Data- Sarcasm.csv')
train_data.head()

Unnamed: 0,Id,article_link,headline,is_sarcastic
0,2347,https://www.huffingtonpost.com/entry/best-song...,the 23 best songs of 2014,0
1,26516,https://www.theonion.com/sesame-street-mourns-...,sesame street mourns death of original letter k,1
2,2576,https://www.huffingtonpost.com/entry/super-bow...,san francisco vandals keep messing with super ...,0
3,25464,https://local.theonion.com/area-man-cant-remem...,area man can't remember whether he rented mimi...,1
4,9030,https://local.theonion.com/fourth-grade-teache...,fourth-grade teacher receives dark portent of ...,1


In [4]:
# Load the labelled hold-out set and preview its first rows.
# The CSV carries an 'Unnamed: 0' column (presumably an index saved by an
# earlier to_csv — confirm). It is dropped as part of the load expression
# instead of mutating the frame in place, so test_data is bound exactly once
# and the cell stays idempotent.
test_data = pd.read_csv('/content/test_data.csv').drop(columns='Unnamed: 0')
test_data.head()

Unnamed: 0,Id,article_link,headline,is_sarcastic
0,18504,https://www.theonion.com/woman-relieved-soulma...,woman relieved soulmate turned out to be in sa...,1
1,20811,https://www.huffingtonpost.com/entry/end-the-i...,end the international drug war to control the ...,0
2,20419,https://www.huffingtonpost.com/entry/donald-tr...,who said it: renowned racist george wallace or...,0
3,20631,https://local.theonion.com/struggling-local-th...,struggling local theater space put out of its ...,1
4,1921,https://www.theonion.com/graffiti-artist-no-lo...,graffiti artist no longer putting his heart in it,1


In [5]:
def clean_text(text):
    """Normalise an iterable of headlines for tokenisation.

    Each entry is converted to a string, stripped of every character that is
    neither a word character nor whitespace, lower-cased, and has its runs of
    whitespace collapsed to single spaces.

    Parameters
    ----------
    text : iterable
        Headlines to clean (for example a pandas Series). Missing entries
        (``None`` or a float NaN) are treated as empty headlines rather than
        being turned into the literal words "none" / "nan".

    Returns
    -------
    tuple[list[str], list[int]]
        ``(clean_data, word_len)``: the cleaned headlines and the number of
        whitespace-separated words in each one, in input order.
    """
    # \w already covers digits, so [^\w\s] is the same class as [^\w\s\d].
    # Compiled once here instead of being re-resolved on every iteration.
    not_word_or_space = re.compile(r'[^\w\s]')
    word_len = []
    clean_data = []
    for sent in tqdm(text):
        # NaN is the only float that is not equal to itself.
        if sent is None or (isinstance(sent, float) and sent != sent):
            sent = ''
        words = not_word_or_space.sub('', str(sent)).lower().split()
        word_len.append(len(words))
        clean_data.append(' '.join(words))
    return clean_data, word_len

In [6]:
# Clean the training headlines. clean_text returns (cleaned strings, word counts),
# which are unpacked into two new columns of train_data.
train_data['cleaned_headline'] , train_data['word_count'] = clean_text(train_data['headline'])

100%|██████████| 21367/21367 [00:00<00:00, 165928.90it/s]


In [7]:
# Length, in words, of the longest cleaned training headline.
train_data['word_count'].max()

39

In [8]:
# Apply the same cleaning to the labelled hold-out headlines, storing the
# cleaned strings and their word counts as two new columns of test_data.
test_data['cleaned_headline'] , test_data['word_count'] = clean_text(test_data['headline'])

100%|██████████| 5342/5342 [00:00<00:00, 153768.89it/s]


In [9]:
# Build the word index from the training headlines only; the hold-out text is
# encoded with that same index further down.
tokenizer=Tokenizer()
tokenizer.fit_on_texts(train_data['cleaned_headline'])
# Fixed sequence length passed to pad_sequences and to the model's Input layer.
# NOTE(review): the recorded maximum training headline length is 39 words, so
# headlines longer than 30 words are shortened by pad_sequences — confirm
# that this is intended.
pad_len = 30

# Encode each headline as a sequence of integer word ids, then pad on the left
# ('pre') so every row has exactly pad_len entries.
train_headlines=tokenizer.texts_to_sequences(train_data['cleaned_headline'])
train_headlines_padded=pad_sequences(train_headlines,maxlen=pad_len,padding='pre')
test_headlines=tokenizer.texts_to_sequences(test_data['cleaned_headline'])
test_headlines_padded=pad_sequences(test_headlines,maxlen=pad_len,padding='pre')

In [10]:
# Number of distinct words known to the tokenizer. The embedding table gets one
# extra row (vocab_size + 1) because Keras Tokenizer ids start at 1, leaving
# index 0 for the padding value.
vocab_size = len(tokenizer.word_index)

# Embedding -> flatten -> two small ReLU layers with dropout -> sigmoid output,
# declared as a single layer list.
model = Sequential([
    Input(shape=(pad_len,)),
    Embedding(vocab_size + 1, 20),
    Flatten(),
    Dropout(0.3),
    Dense(8, activation='relu'),
    Dropout(0.2),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Binary classification: Adam with a small learning rate, cross-entropy loss.
my_opt = keras.optimizers.Adam(learning_rate=0.0001)
model.compile(loss='binary_crossentropy', optimizer=my_opt, metrics=['accuracy'])

# Train for 10 epochs, reporting loss/accuracy on the labelled hold-out set
# after each epoch.
model.fit(
    train_headlines_padded,
    train_data['is_sarcastic'],
    epochs=10,
    validation_data=(test_headlines_padded, test_data['is_sarcastic']),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f558f4ff450>

In [11]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 20)            510020    
_________________________________________________________________
flatten (Flatten)            (None, 600)               0         
_________________________________________________________________
dropout (Dropout)            (None, 600)               0         
_________________________________________________________________
dense (Dense)                (None, 8)                 4808      
_________________________________________________________________
dropout_1 (Dropout)          (None, 8)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 8)                 72        
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 9

In [12]:
# Loss and accuracy on the labelled hold-out set.
model.evaluate(test_headlines_padded,test_data['is_sarcastic'])
# Sequential.predict_classes was removed in TensorFlow 2.6. For a model ending
# in a single sigmoid unit it was defined as thresholding the predicted
# probability at 0.5, so this produces the same (n_samples, 1) int32 array.
predictions = (model.predict(test_headlines_padded) > 0.5).astype('int32')



In [13]:
# Per-class precision / recall / F1 of the predicted labels against the true
# labels of the hold-out set.
print(
    metrics.classification_report(
        y_true=test_data['is_sarcastic'],
        y_pred=predictions,
    )
)

              precision    recall  f1-score   support

           0       0.88      0.86      0.87      2997
           1       0.83      0.85      0.84      2345

    accuracy                           0.86      5342
   macro avg       0.86      0.86      0.86      5342
weighted avg       0.86      0.86      0.86      5342



In [14]:
# Load the submission set whose predictions are written to the output frame.
# NOTE(review): this is a relative path, while the other CSVs are read from
# absolute '/content/...' paths — confirm the working directory.
test_d=pd.read_csv('Test-Data- Sarcasm.csv')

In [15]:
# Display the loaded submission frame (Id, article_link, headline).
test_d

Unnamed: 0,Id,article_link,headline
0,18504,https://www.theonion.com/woman-relieved-soulma...,woman relieved soulmate turned out to be in sa...
1,20811,https://www.huffingtonpost.com/entry/end-the-i...,end the international drug war to control the ...
2,20419,https://www.huffingtonpost.com/entry/donald-tr...,who said it: renowned racist george wallace or...
3,20631,https://local.theonion.com/struggling-local-th...,struggling local theater space put out of its ...
4,1921,https://www.theonion.com/graffiti-artist-no-lo...,graffiti artist no longer putting his heart in it
...,...,...,...
5337,9516,https://www.theonion.com/man-s-whole-job-undoi...,man's whole job undoing handiwork of self-chec...
5338,16044,https://www.huffingtonpost.com/entry/ndaa-inde...,house vote maintains military ability to jail ...
5339,22535,https://www.huffingtonpost.com/entry/john-west...,fearless veteran celebrates 90th birthday on t...
5340,18545,https://www.huffingtonpost.com/entry/ukraine-a...,ukraine at a crossroads: could putin lose his ...


In [16]:
# Clean the submission headlines the same way as the training data, storing the
# cleaned strings and their word counts as two new columns of test_d.
test_d['cleaned_headline'] , test_d['word_count'] = clean_text(test_d['headline'])

100%|██████████| 5342/5342 [00:00<00:00, 153325.89it/s]


In [17]:
# Display test_d again, now including the cleaned_headline and word_count columns.
test_d

Unnamed: 0,Id,article_link,headline,cleaned_headline,word_count
0,18504,https://www.theonion.com/woman-relieved-soulma...,woman relieved soulmate turned out to be in sa...,woman relieved soulmate turned out to be in sa...,11
1,20811,https://www.huffingtonpost.com/entry/end-the-i...,end the international drug war to control the ...,end the international drug war to control the ...,10
2,20419,https://www.huffingtonpost.com/entry/donald-tr...,who said it: renowned racist george wallace or...,who said it renowned racist george wallace or ...,14
3,20631,https://local.theonion.com/struggling-local-th...,struggling local theater space put out of its ...,struggling local theater space put out of its ...,9
4,1921,https://www.theonion.com/graffiti-artist-no-lo...,graffiti artist no longer putting his heart in it,graffiti artist no longer putting his heart in it,9
...,...,...,...,...,...
5337,9516,https://www.theonion.com/man-s-whole-job-undoi...,man's whole job undoing handiwork of self-chec...,mans whole job undoing handiwork of selfchecko...,8
5338,16044,https://www.huffingtonpost.com/entry/ndaa-inde...,house vote maintains military ability to jail ...,house vote maintains military ability to jail ...,10
5339,22535,https://www.huffingtonpost.com/entry/john-west...,fearless veteran celebrates 90th birthday on t...,fearless veteran celebrates 90th birthday on t...,10
5340,18545,https://www.huffingtonpost.com/entry/ukraine-a...,ukraine at a crossroads: could putin lose his ...,ukraine at a crossroads could putin lose his j...,13


In [18]:
# Length, in words, of the longest cleaned submission headline.
test_d['word_count'].max()

38

In [19]:
# Encode and left-pad the submission headlines with the tokenizer fitted on the
# training data.
# NOTE(review): this rebinds test_headlines / test_headlines_padded, which were
# first built from test_data; any earlier cell re-run after this one will see
# the test_d versions instead.
test_headlines=tokenizer.texts_to_sequences(test_d['cleaned_headline'])
test_headlines_padded=pad_sequences(test_headlines,maxlen=pad_len,padding='pre')

In [20]:
# Sequential.predict_classes was removed in TensorFlow 2.6. For a model ending
# in a single sigmoid unit it was defined as thresholding the predicted
# probability at 0.5, so this produces the same (n_samples, 1) int32 array.
pred = (model.predict(test_headlines_padded) > 0.5).astype('int32')

In [21]:
# Materialise the predictions as a NumPy array before attaching them to the
# output frame. NOTE(review): pred is presumably already an ndarray, which
# would make this just a copy — verify.
predict=np.array(pred)

In [22]:
# Start the output frame from the Ids of test_d — the frame whose headlines
# were actually scored — so each prediction is paired with the Id of its own
# row instead of relying on test_data listing the same rows in the same order.
dataSet = test_d[['Id']].copy()

In [23]:
# Attach the predicted class (0 or 1 in the recorded output) for each row as the
# 'Output' column; the assignment is positional, so predict must be in the same
# row order as dataSet.
dataSet['Output']=predict

In [24]:
# Display the final Id / Output table.
dataSet

Unnamed: 0,Id,Output
0,18504,1
1,20811,0
2,20419,0
3,20631,1
4,1921,1
...,...,...
5337,9516,1
5338,16044,0
5339,22535,0
5340,18545,0


In [None]:
# Export step, currently disabled: uncomment to write the Id / Output table to disk.
# NOTE(review): to_csv also writes the index column unless index=False is passed.
#dataSet.to_csv('sarcasm_output.csv')