# Sarcasm Detection


[Prashant Brahmbhatt](https://github.com/hashbanger)

___

In [1]:
import re
import nltk
import warnings
import numpy as np
import pandas as pd
from plotly import tools
import plotly.offline as pyo
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [2]:
# Silence warnings so they do not clutter the cell outputs.
warnings.filterwarnings(action='ignore')

# Enable rendering of offline plotly figures inside the notebook.
pyo.init_notebook_mode(connected=True)

### Importing the Data

In [3]:
# lines=True: the file is read as one JSON record per line.
df = pd.read_json(path_or_buf='Sarcasm_Headlines_Dataset.json', lines=True)

In [4]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
# Summary statistics; only `is_sarcastic` is numeric, so it is the only column reported.
df.describe()

Unnamed: 0,is_sarcastic
count,26709.0
mean,0.438953
std,0.496269
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [6]:
# Column dtypes, non-null counts and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 3 columns):
article_link    26709 non-null object
headline        26709 non-null object
is_sarcastic    26709 non-null int64
dtypes: int64(1), object(2)
memory usage: 626.1+ KB


In [7]:
# Print the number of missing values in each column, one count per line.
for column_name in df.columns:
    missing_count = df[column_name].isnull().sum()
    print(missing_count)

0
0
0


So we have **no missing values**, that's good!

#### Adding the source of the article to the data

We can use a regular expression to extract the source of each article from its URL.

In [8]:
# Sample URL used to try out the tokenising regular expression below.
x = "https://www.github.com"

In [9]:
# Split the URL into its alphanumeric runs (scheme, sub-domain, domain, TLD).
# A raw string is used because '\w' in a plain literal is an invalid escape
# sequence, which Python reports as a warning.
re.findall(r'\w+', x)

['https', 'www', 'github', 'com']

In [10]:
# The third alphanumeric run of each link is taken as the publisher name,
# e.g. 'https://www.huffingtonpost.com/...' -> 'huffingtonpost'.
# NOTE(review): this assumes every link has exactly one sub-domain label
# before the publisher (www., local., politics., ...) - confirm for new data.
# Raw string: '\w' in a plain literal is an invalid escape sequence.
df['source'] = df['article_link'].apply(lambda x: re.findall(r'\w+', x)[2])

In [11]:
# Check that the derived `source` column was added.
df.head()

Unnamed: 0,article_link,headline,is_sarcastic,source
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,huffingtonpost
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,huffingtonpost
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,theonion
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,theonion
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,huffingtonpost


First of all, let's explore what share of the dataset is sarcastic and what kinds of articles they are.

In [12]:
# Count the headlines in each target class.
sarcastic_total = df['is_sarcastic'].sum()
non_sarcastic_total = len(df) - sarcastic_total

labels = ['Sarcasm', 'Not Sarcasm']
values = [sarcastic_total, non_sarcastic_total]

# One single-bar trace per class so each gets its own legend entry and colour.
trace0 = go.Bar(x=[labels[1]], y=[values[1]], name='No Sarcasm')
trace1 = go.Bar(
    x=[labels[0]],
    y=[values[0]],
    marker=dict(color='#00cc66'),
    name='Sarcasm',
)

data = [trace0, trace1]

layout = go.Layout(
    title='Number of Sarcastic Articles',
    width=800,
    height=500,
    yaxis={'title': 'Number of articles'},
)

fig = go.Figure(data=data, layout=layout)

pyo.iplot(fig)

In [13]:
# Look at the first ten rows, including the derived `source` column.
df.head(10)

Unnamed: 0,article_link,headline,is_sarcastic,source
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,huffingtonpost
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,huffingtonpost
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,theonion
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,theonion
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,huffingtonpost
5,https://www.huffingtonpost.com/entry/advancing...,advancing the world's women,0,huffingtonpost
6,https://www.huffingtonpost.com/entry/how-meat-...,the fascinating case for eating lab-grown meat,0,huffingtonpost
7,https://www.huffingtonpost.com/entry/boxed-col...,"this ceo will send your kids to school, if you...",0,huffingtonpost
8,https://politics.theonion.com/top-snake-handle...,top snake handler leaves sinking huckabee camp...,1,theonion
9,https://www.huffingtonpost.com/entry/fridays-m...,friday's morning email: inside trump's presser...,0,huffingtonpost


In [14]:
# Number of articles per source, most frequent source first.
df['source'].value_counts().tolist()

[14985, 11724]

In [15]:
# Take labels and heights from the same Series. The previous version paired
# `unique()` (order of first appearance) with `value_counts()` (descending
# count), which only lines up by coincidence.
source_counts = df['source'].value_counts()

# The trace shows all articles per source, so it is no longer named 'Sarcasm'.
trace0 = go.Bar(
    x=list(source_counts.index),
    y=list(source_counts),
    marker={'color': '#9900e6'},
    name='Articles',
)

data = [trace0]

layout = go.Layout(title = 'Sources of Articles',
                   width = 800,
                   height = 500,
                   yaxis = dict(title = 'Number of articles'),)

fig = go.Figure(data, layout)

pyo.offline.iplot(fig)

So there's a peculiar observation: all the articles in the data from ***theonion*** are sarcastic, while all the others are from ***huffingtonpost*** and are not sarcastic.  

Given this observation we won't use the `source` column at all: it is effectively the target variable itself, so any model trained on it would be biased.

#### Removing punctuation from the headlines

In [16]:
# Demonstration string: letters and spaces buried in punctuation.
x = 'F*&!@#$%&*(U)+>"{}C<>!@#$K%&*()+>"{}<> T%&!@H#$%&*()+E>"{}<S>!@#$%&*()E+>"{}<> *&P(($UN#$$@!CT%$&UA&&*T()I$%@(O*&N!S'
print(x)

# Keep only ASCII letters, digits and whitespace.
# The range is `A-Z`, not `A-z`: `A-z` also covers the symbols that sit between
# 'Z' and 'a' in ASCII ([ \ ] ^ _ `), so those would not be removed.
# Raw string: '\s' in a plain literal is an invalid escape sequence.
punctuation_pattern = r'[^a-zA-Z0-9\s]'
print('\n' + re.sub(punctuation_pattern, '', x))

F*&!@#$%&*(U)+>"{}C<>!@#$K%&*()+>"{}<> T%&!@H#$%&*()+E>"{}<S>!@#$%&*()E+>"{}<> *&P(($UN#$$@!CT%$&UA&&*T()I$%@(O*&N!S

FUCK THESE PUNCTUATIONS


In [17]:
# Strip everything except ASCII letters, digits and whitespace from each headline.
# The range is `A-Z`, not `A-z`: `A-z` also matches [ \ ] ^ _ and the backtick,
# which would leave those symbols in the cleaned text.
# Raw string: '\s' in a plain literal is an invalid escape sequence.
df['headlineNoPunc'] = df['headline'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

In [18]:
# Show the cleaned version of the first headline, then the raw one for comparison.
first_cleaned = df['headlineNoPunc'][0]
print(first_cleaned)
df['headline'][0]

former versace store clerk sues over secret black code for minority shoppers


"former versace store clerk sues over secret 'black code' for minority shoppers"

Seems like it worked

In [19]:
# Split every cleaned headline into words, flatten the result into one long
# Series and count how often each word occurs (most frequent first).
word_frequencies = df['headlineNoPunc'].str.split(expand = True).unstack().value_counts()
allWords_df = pd.DataFrame(word_frequencies).reset_index()

# Pin the column labels to the ones the plotting cell indexes ('index' and 0).
# Depending on the pandas version the count column is otherwise labelled
# 0 or 'count', and `allWords_df[0]` would fail in the latter case.
allWords_df.columns = ['index', 0]

In [20]:
# Only the 50 most frequent words are drawn, so slice once and reuse the slice.
top_words = allWords_df[0:50]

# The colour array is limited to the same 50 rows as the bars. Previously the
# whole count column was passed, which shipped every word's count into the
# figure and scaled the colours against words that are not displayed.
trace0 = go.Bar(x = top_words['index'], y = top_words[0],
                marker = dict(color = top_words[0], colorscale = 'Portland'),
                name = 'Top 50 Frequent Headline Words')

data = [trace0]

layout = go.Layout(title = 'Most Frequent Words',
                   width = 950,
                   height = 600,
                   xaxis = dict(title = 'Words', nticks = 50),
                   yaxis = dict(title = 'Count'),)

fig = go.Figure(data, layout)

pyo.offline.iplot(fig)

So it is pretty obvious that the most frequent words are the *stopwords*. We had better remove these from the data to get to the real stuff.

In [21]:
# Fetch the NLTK stopword corpus needed by the following cells
# (the recorded output shows it was already up to date on this machine).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
from nltk.corpus import stopwords

In [23]:
# Flatten all headline tokens into one Series, then tabulate their frequencies.
headline_tokens = df['headlineNoPunc'].str.split(expand=True).unstack()
word_counts = headline_tokens.value_counts().to_frame().reset_index()

Separating out the word counts based on their target class.

In [24]:
# Import the corpus reader under a private alias before binding the word list
# to `stopwords`. The previous form overwrote the imported reader with a list,
# so running the cell a second time raised AttributeError ('list' has no 'words').
from nltk.corpus import stopwords as _stopwords_corpus

stopwords = _stopwords_corpus.words('english') # let's save us some hassle

In [25]:
def _split_headline_words(label):
    """Return every token of the cleaned headlines whose `is_sarcastic` equals `label`, as one flat Series."""
    class_headlines = df[df['is_sarcastic'] == label]['headlineNoPunc']
    return class_headlines.str.split(expand=True).unstack()


noSarcasmWords_all = _split_headline_words(0)
sarcasmWords_all = _split_headline_words(1)

In [26]:
# Build the lookup set once: membership tests against a set are constant time,
# whereas the stopword list was scanned for every single token.
stopword_set = set(stopwords)

# Keep every token that is not a stopword.
# NOTE(review): missing-value padding produced by split(expand=True) presumably
# passes through this filter unchanged, exactly as before - verify downstream.
noSarcasmWords = [wo for wo in noSarcasmWords_all if wo not in stopword_set]
sarcasmWords = [wo for wo in sarcasmWords_all if wo not in stopword_set]

Now we get the counts of the words

In [30]:
from collections import Counter

In [59]:
# Tally the tokens of each class and store the tallies as (word, count) tables.
count_columns = ['word', 'count']

no_sarcasm_tally = Counter(noSarcasmWords)
sarcasm_tally = Counter(sarcasmWords)

noSarcasmWordCount = pd.DataFrame(list(no_sarcasm_tally.items()), columns=count_columns)
sarcasmWordCount = pd.DataFrame(list(sarcasm_tally.items()), columns=count_columns)

In [103]:
# Top 30 words per class. The old `[1:31]` slice skipped the first row on the
# assumption that it was the missing-value placeholder left over from padding
# the split headlines. Dropping missing words explicitly gives the same 30 rows
# when that holds, and does not discard a real word when it does not.
temp1 = (noSarcasmWordCount.dropna(subset = ['word'])
         .sort_values(by = ['count'], ascending = False)[0:30])
temp2 = (sarcasmWordCount.dropna(subset = ['word'])
         .sort_values(by = ['count'], ascending = False)[0:30])

In [108]:
# Horizontal bars: words on the y axis, their counts on the x axis.
trace0 = go.Bar(y = temp1['word'] , x = temp1['count'], marker = dict(color = '#ff3333'), opacity= 0.6, orientation = 'h' ,name = 'Frequent words in Non Sarcastic articles')
trace1 = go.Bar(y = temp2['word'] , x = temp2['count'], marker = dict(color = '#33cc33'), opacity= 0.6, orientation = 'h' ,name = 'Frequent words in Sarcastic articles')

# Two stacked panels: non-sarcastic words on top, sarcastic words below.
fig = tools.make_subplots(rows= 2, cols= 1)
fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 2, 1)

# The second panel uses its own axis (y2), so the tick setting is applied to
# both `yaxis` and `yaxis2`; previously only the top panel received it.
fig['layout'].update(height=1200, width=800, title = 'Frequent words',
                     yaxis = dict(nticks = 30), yaxis2 = dict(nticks = 30))

pyo.offline.iplot(fig)

This is the format of your plot grid:
[ (1,1) x1,y1 ]
[ (2,1) x2,y2 ]

