# Sarcasm Detection


[Prashant Brahmbhatt](https://github.com/hashbanger)

___

In [256]:
import re
import nltk
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from plotly import tools
import plotly.offline as pyo
import plotly.graph_objs as go
import plotly.figure_factory as ff

In [257]:
# Notebook-wide display setup.
# NOTE: this silences every warning category for the rest of the session,
# including deprecation notices raised by pandas / plotly.
warnings.filterwarnings('ignore')

# Dark grid theme for any seaborn / matplotlib figure drawn later.
sns.set_style('darkgrid')

# Enable inline plotly rendering. `pyo` already is `plotly.offline`, so call its
# public function directly instead of reaching into the `offline` submodule.
pyo.init_notebook_mode(connected = True)

### Importing the Data

In [258]:
# Load the headlines dataset; lines=True tells pandas the file holds one JSON object per line.
df = pd.read_json('Sarcasm_Headlines_Dataset.json', lines= True)

In [259]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,article_link,headline,is_sarcastic
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0


In [260]:
# Summary statistics of the numeric column(s).
df.describe()

Unnamed: 0,is_sarcastic
count,26709.0
mean,0.438953
std,0.496269
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [261]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26709 entries, 0 to 26708
Data columns (total 3 columns):
article_link    26709 non-null object
headline        26709 non-null object
is_sarcastic    26709 non-null int64
dtypes: int64(1), object(2)
memory usage: 626.1+ KB


In [262]:
# Print the number of missing entries for every column, one number per line.
for column_name in df.columns:
    missing_count = df[column_name].isnull().sum()
    print(missing_count)

0
0
0


So we have **no missing values**, that's good!

#### Adding the source of the article to the data

We can use a regular expression to extract the source of each article from its URL.

In [263]:
# Sample URL used to try out the tokenising regex below.
x = 'https://www.github.com'

In [264]:
# Split the sample URL into runs of word characters.
# Raw string: in a normal string literal `\w` is an invalid escape sequence.
re.findall(r'\w+', x)

['https', 'www', 'github', 'com']

In [265]:
# For links shaped like scheme://subdomain.site.tld/..., the word-character tokens are
# [scheme, subdomain, site, tld, ...], so token 2 is the site name.
# NOTE(review): a link without a subdomain would yield the TLD instead — confirm the data never has one.
# Raw string: `\w` is an invalid escape sequence in a normal string literal.
df['source'] = df['article_link'].apply(lambda link: re.findall(r'\w+', link)[2])

In [266]:
# Check that the new `source` column was filled in.
df.head()

Unnamed: 0,article_link,headline,is_sarcastic,source
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,huffingtonpost
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,huffingtonpost
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,theonion
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,theonion
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,huffingtonpost


First of all, let's explore what share of the headlines in the dataset are sarcastic and where they come from.

In [267]:
# Class balance: headlines flagged in `is_sarcastic` versus the rest.
labels = ['Sarcasm', 'Not Sarcasm']
sarcastic_total = df['is_sarcastic'].sum()
values = [sarcastic_total, len(df) - sarcastic_total]

# One trace per class so each bar gets its own legend entry; the legend names reuse
# `labels` so they always match the x-axis ticks.
trace0 = go.Bar(x = [labels[1]], y = [values[1]], name = labels[1])
trace1 = go.Bar(x = [labels[0]], y = [values[0]], marker = {'color': '#00cc66'}, name = labels[0])


data = [trace0, trace1]

layout = go.Layout(title = 'Number of Sarcastic Articles',
                   width = 800,
                   height = 500,
                   yaxis = dict(title = 'Number of articles'))

fig = go.Figure(data, layout)

# `pyo` is plotly.offline; use its public iplot rather than the inner submodule path.
pyo.iplot(fig)

In [268]:
# Look at a few more rows to compare `source` against `is_sarcastic`.
df.head(10)

Unnamed: 0,article_link,headline,is_sarcastic,source
0,https://www.huffingtonpost.com/entry/versace-b...,former versace store clerk sues over secret 'b...,0,huffingtonpost
1,https://www.huffingtonpost.com/entry/roseanne-...,the 'roseanne' revival catches up to our thorn...,0,huffingtonpost
2,https://local.theonion.com/mom-starting-to-fea...,mom starting to fear son's web series closest ...,1,theonion
3,https://politics.theonion.com/boehner-just-wan...,"boehner just wants wife to listen, not come up...",1,theonion
4,https://www.huffingtonpost.com/entry/jk-rowlin...,j.k. rowling wishes snape happy birthday in th...,0,huffingtonpost
5,https://www.huffingtonpost.com/entry/advancing...,advancing the world's women,0,huffingtonpost
6,https://www.huffingtonpost.com/entry/how-meat-...,the fascinating case for eating lab-grown meat,0,huffingtonpost
7,https://www.huffingtonpost.com/entry/boxed-col...,"this ceo will send your kids to school, if you...",0,huffingtonpost
8,https://politics.theonion.com/top-snake-handle...,top snake handler leaves sinking huckabee camp...,1,theonion
9,https://www.huffingtonpost.com/entry/fridays-m...,friday's morning email: inside trump's presser...,0,huffingtonpost


In [269]:
# Number of articles per source, as a plain list.
list(df['source'].value_counts())

[14985, 11724]

In [270]:
# Take BOTH the bar labels and the bar heights from the same value_counts() result.
# unique() lists sources in order of first appearance while value_counts() orders them
# by frequency, so mixing the two can attach a count to the wrong source.
source_counts = df['source'].value_counts()

trace0 = go.Bar(x = list(source_counts.index), y = list(source_counts),
                marker = {'color': '#9900e6'}, name = 'Articles')

data = [trace0]

layout = go.Layout(title = 'Sources of Articles',
                   width = 800,
                   height = 500,
                   yaxis = dict(title = 'Number of articles'))

fig = go.Figure(data, layout)

# `pyo` is plotly.offline; use its public iplot rather than the inner submodule path.
pyo.iplot(fig)

So there is a peculiar observation: all the articles from ***theonion*** are sarcastic, while all the others come from ***huffingtonpost*** and are not sarcastic.  

Given this observation, we won't use the `source` column at all: it is effectively the target variable itself, so a model trained on it would be biased.

#### Removing punctuation from the headlines

In [271]:
# Demo: strip everything except ASCII letters, digits and whitespace from a noisy string.
x = 'F*&!@#$%&*(U)+>"{}C<>!@#$K%&*()+>"{}<> T%&!@H#$%&*()+E>"{}<S>!@#$%&*()E+>"{}<> *&P(($UN#$$@!CT%$&UA&&*T()I$%@(O*&N!S'

# The upper-case range must be `A-Z`: `A-z` also spans the characters [ \ ] ^ _ ` that sit
# between 'Z' and 'a' in ASCII, so those would survive the cleaning.
# Raw string so that `\s` reaches the regex engine unchanged.
punctuation_pattern = r'[^a-zA-Z0-9\s]'
cleaned = re.sub(punctuation_pattern, '', x)

print(x)
print('\n' + cleaned)

F*&!@#$%&*(U)+>"{}C<>!@#$K%&*()+>"{}<> T%&!@H#$%&*()+E>"{}<S>!@#$%&*()E+>"{}<> *&P(($UN#$$@!CT%$&UA&&*T()I$%@(O*&N!S

FUCK THESE PUNCTUATIONS


In [272]:
# Keep only ASCII letters, digits and whitespace in each headline.
# `A-Z` (not `A-z`) so that [ \ ] ^ _ ` are stripped as well; raw string so `\s` is passed
# to the regex engine unchanged.
df['headline_cleaned'] = df['headline'].apply(lambda headline: re.sub(r'[^a-zA-Z0-9\s]', '', headline))

In [273]:
# Compare the first headline after cleaning (printed) with its raw form (cell output).
print(df.loc[0, 'headline_cleaned'])
df.loc[0, 'headline']

former versace store clerk sues over secret black code for minority shoppers


"former versace store clerk sues over secret 'black code' for minority shoppers"

Seems like it worked

In [314]:
# Word-frequency table over all cleaned headlines: split on whitespace (one column per
# word position), flatten, then count; value_counts() sorts by descending frequency.
# The output columns are named explicitly ('index' for the word, 0 for the count) because
# the implicit labels differ between pandas versions (pandas >= 2.0 calls the count
# column 'count'), which would break the later `frequent_words[0]` lookups.
frequent_words = (df['headline_cleaned']
                  .str.split(expand = True)
                  .unstack()
                  .value_counts()
                  .rename_axis('index')
                  .reset_index(name = 0))

In [316]:
# Plot only the 50 most frequent words; frequent_words is sorted by descending count.
top_50 = frequent_words.head(50)

# Colour each bar by its own count. Passing the whole counts column would ship one colour
# value per vocabulary word for a chart that only has 50 bars.
trace0 = go.Bar(x = top_50['index'], y = top_50[0],
                marker = dict(color = top_50[0], colorscale = 'Portland'),
                name = 'Top 50 Frequent Headline Words')

data = [trace0]

layout = go.Layout(title = 'Most Frequent Words',
                   width = 950,
                   height = 600,
                   xaxis = dict(title = 'Words', nticks = 50),
                   yaxis = dict(title = 'Count'))

fig = go.Figure(data, layout)

# `pyo` is plotly.offline; use its public iplot rather than the inner submodule path.
pyo.iplot(fig)

So it is pretty obvious that the most frequent words are the *stopwords*. So we better remove these from the data to get to the real stuff.

In [317]:
# Fetch NLTK's stopword corpus (reports "already up-to-date" when it is present locally).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [318]:
from nltk.corpus import stopwords

In [319]:
# Rebuild the word-frequency table: split on whitespace (one column per word position),
# flatten, then count; value_counts() sorts by descending frequency.
# The output columns are named explicitly ('index' for the word, 0 for the count) because
# the implicit labels differ between pandas versions (pandas >= 2.0 calls the count
# column 'count'), which would break the `frequent_words[0]` lookups used below.
frequent_words = (df['headline_cleaned']
                  .str.split(expand = True)
                  .unstack()
                  .value_counts()
                  .rename_axis('index')
                  .reset_index(name = 0))

In [301]:
# Build the stopword set once: evaluating `list(stopwords.words('english'))` inside the
# filter rebuilds the list for every vocabulary word and tests membership linearly.
english_stopwords = set(stopwords.words('english'))

# frequent_words is ordered by descending count, so the first 50 words that survive the
# filter are the 50 most frequent non-stopwords.
frequent_50 = [word for word in frequent_words['index'] if word not in english_stopwords][:50]

In [345]:
# Map every word to its count once, then read off the counts of the selected words.
# This replaces a scan of the whole vocabulary (plus a DataFrame filter) per selected word,
# and a missing word now raises KeyError instead of silently reusing the previous count.
counts_by_word = dict(zip(frequent_words['index'], frequent_words[0]))

word_list = list(frequent_50)
count_list = [int(counts_by_word[word]) for word in word_list]

Plotting the most frequent words again

In [348]:
# Colour each bar by the count it actually displays. `frequent_words[0]` still starts with
# the stopword counts, so using it here would colour the bars with unrelated values.
trace0 = go.Bar(x = word_list, y = count_list,
                marker = dict(color = count_list, colorscale = 'Portland'),
                name = 'Top 50 Frequent Headline Words')

data = [trace0]

layout = go.Layout(title = 'Most Frequent Words',
                   width = 950,
                   height = 600,
                   xaxis = dict(title = 'Words', nticks = 50),
                   yaxis = dict(title = 'Count'))

fig = go.Figure(data, layout)

# `pyo` is plotly.offline; use its public iplot rather than the inner submodule path.
pyo.iplot(fig)