## AI/ML Challenge
In this challenge, you will work with a dataset of news headlines, some of which were deliberately written in a sarcastic manner by their authors. The goal is to build NLP models that predict whether a given headline is sarcastic or not.

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
%matplotlib inline

## Loading Data and Checking

In [3]:
# Read the labelled training split and the unlabelled test split from disk.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('Train_Data.csv', 'Test_Data.csv')
)

In [4]:
train_df.head()

Unnamed: 0,headline,is_sarcastic
0,supreme court votes 7-2 to legalize all worldl...,1
1,hungover man horrified to learn he made dozens...,1
2,emily's list founder: women are the 'problem s...,0
3,send your kids back to school with confidence,0
4,watch: experts talk pesticides and health,0


In [5]:
test_df.head()

Unnamed: 0,headline
0,area stand-up comedian questions the deal with...
1,dozens of glowing exit signs mercilessly taunt...
2,perfect response to heckler somewhere in prop ...
3,gop prays for ossoff lossoff
4,trevor noah says the scary truth about trump's...


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44262 entries, 0 to 44261
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   headline      44262 non-null  object
 1   is_sarcastic  44262 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 691.7+ KB


In [7]:
train_df.describe()

Unnamed: 0,is_sarcastic
count,44262.0
mean,0.458723
std,0.498299
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [8]:
train_df.is_sarcastic.value_counts()

0    23958
1    20304
Name: is_sarcastic, dtype: int64

### Text Preprocessing

Remove leading/trailing whitespace from the headlines and expand contractions.

In [9]:
# Trim leading and trailing whitespace from every headline in both splits.
for frame in (train_df, test_df):
    frame['headline'] = frame['headline'].str.strip()

In [12]:
import nltk
import contractions
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [13]:
stop_words = nltk.corpus.stopwords.words('english')
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

# Negations and contrast markers are taken out of the stop-word list so that
# they are NOT stripped from the headlines later on.
for kept_word in ('no', 'not', 'never', 'yes', 'but', 'though', 'although'):
    if kept_word not in stop_words:
        # Nothing to remove; list.remove() would raise ValueError otherwise.
        continue
    stop_words.remove(kept_word)


list(contractions.contractions_dict.items())[:6]

txt = "I won't send y'all kids back to school you've been with confidence"

contractions.fix(txt)

Expand contractions in the headlines, then check whether each headline contains a number.

In [16]:
# Expand contractions in both splits; the library function is handed to
# apply() directly instead of being wrapped in a lambda.
train_df['headline'] = train_df['headline'].apply(contractions.fix)
test_df['headline'] = test_df['headline'].apply(contractions.fix)

def find_alpnum(x):
    """Return 1 if ``x`` contains at least one digit character, else 0.

    Parameters
    ----------
    x : str or iterable of str
        Usually a single headline. An iterable of strings is also accepted,
        in which case every character of every string is inspected.

    Returns
    -------
    int
        ``1`` when any character satisfies ``str.isdigit()``, otherwise ``0``.
        Non-iterable input (e.g. ``None`` or a NaN from a missing headline)
        yields ``0`` instead of raising ``TypeError``.
    """
    if isinstance(x, str):
        # A string is already a sequence of characters: one pass is enough.
        return int(any(character.isdigit() for character in x))
    try:
        items = iter(x)
    except TypeError:
        # Missing values (None / float NaN) cannot contain a digit.
        return 0
    return int(any(character.isdigit() for item in items for character in item))

In [17]:
# Add a 0/1 indicator column telling whether each headline contains a digit.
for frame in (train_df, test_df):
    frame['contains_num'] = frame['headline'].apply(find_alpnum)

# Spot-check one flagged training headline (row label 99).
has_number = train_df['contains_num'] == 1
print(train_df.loc[has_number, 'headline'].loc[99])

inspirational disabled horse crosses preakness finish line after 11 hours


In [18]:
train_df[(train_df['contains_num'] == 1) ]['is_sarcastic'].value_counts()

0    3567
1    2906
Name: is_sarcastic, dtype: int64

In [19]:
train_df[train_df['contains_num'] == 1].loc[28]['headline']

"lauren graham just dropped a clue about those final 4 'gilmore girls' words"

In [20]:
def remove_stop_words(x, stop_word_list=None):
    """Drop stop words from a whitespace-separated string.

    Parameters
    ----------
    x : str
        Text to filter. It is split on whitespace; matching is exact and
        case-sensitive.
    stop_word_list : iterable of str, optional
        Words to remove. When omitted, the notebook-level ``stop_words``
        list is used, so existing one-argument calls behave as before.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if stop_word_list is None:
        stop_word_list = stop_words
    # A set gives constant-time membership tests instead of scanning the list
    # once per token.
    excluded = set(stop_word_list)
    return ' '.join(token for token in x.split() if token not in excluded)
remove_stop_words("lauren graham just dropped a clue about those final 4 'gilmore girls' words")


"lauren graham dropped clue final 4 'gilmore girls' words"

In [21]:
# Filter stop words out of the headlines of both splits.
for frame in (train_df, test_df):
    frame['headline'] = frame['headline'].apply(remove_stop_words)

from nltk.stem import WordNetLemmatizer, PorterStemmer

lem = WordNetLemmatizer()
ps = PorterStemmer()

In [22]:
def stemmer(x):
    """Return ``x`` with every token replaced by its stem.

    The text is tokenised with NLTK's ``word_tokenize``, each token is run
    through the notebook-level ``ps`` stemmer, and the stems are joined back
    together with single spaces.
    """
    tokens = nltk.tokenize.word_tokenize(x)
    return ' '.join(map(ps.stem, tokens))
stemmer('area loser blissfully unaffected whims stock market')

'area loser bliss unaffect whim stock market'

In [23]:
# Replace every headline with its stemmed form in both splits.
for frame in (train_df, test_df):
    frame['headline'] = frame['headline'].apply(stemmer)

## Feature Extraction

In [24]:
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
import textblob

textblob.TextBlob('This is an AMAZING pair of Jeans!').sentiment

Sentiment(polarity=0.7500000000000001, subjectivity=0.9)

In [26]:
cv = CountVectorizer()
tf_idf = TfidfVectorizer()

In [27]:
train_df['headline'][:20000]

0                 suprem court vote 7-2 legal worldli vice
1        hungov man horrifi learn made dozen plan last ...
2        emili 's list founder : women 'problem solver ...
3                              send kid back school confid
4                      watch : expert talk pesticid health
                               ...                        
19995    bunch star wrap ava duvernay 's ' a wrinkl time '
19996    elizabeth warren slam pat toomey tri let bank ...
19997    parent not rememb enough color help kindergart...
19998         atlanta man indict pour boil water gay coupl
19999    cb news chief bob schieffer 's return : 'how c...
Name: headline, Length: 20000, dtype: object

In [28]:
# Score each training headline with TextBlob and keep both sentiment parts.
# NOTE(review): 'headline' has already been stop-word-filtered and stemmed by
# this point, so sentiment is computed on the processed text — confirm that
# this is intended rather than scoring the raw headline.
df_train_sent = train_df['headline'].apply(
    lambda text: textblob.TextBlob(text).sentiment
)
train_df['Polarity'] = [sentiment.polarity for sentiment in df_train_sent]
train_df['Subjectivity'] = [sentiment.subjectivity for sentiment in df_train_sent]

In [29]:
# Score each test headline with TextBlob and keep both sentiment parts.
# NOTE(review): 'headline' has already been stop-word-filtered and stemmed by
# this point, so sentiment is computed on the processed text — confirm that
# this is intended rather than scoring the raw headline.
df_test_sent = test_df['headline'].apply(
    lambda text: textblob.TextBlob(text).sentiment
)
test_df['Polarity'] = [sentiment.polarity for sentiment in df_test_sent]
test_df['Subjectivity'] = [sentiment.subjectivity for sentiment in df_test_sent]

In [30]:
# Learn the bag-of-words vocabulary from the training headlines only, then
# encode the test headlines with that same vocabulary so both matrices share
# identical columns. Counts are cast to int8 and densified via .toarray().
# NOTE(review): a dense (documents x vocabulary) array is memory-hungry; the
# estimators fitted later presumably accept the sparse matrix directly —
# confirm before changing, since later cells wrap these arrays in DataFrames.
X_train = cv.fit_transform(train_df['headline']).astype('int8').toarray()
X_test = cv.transform(test_df['headline']).astype('int8').toarray()

X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int8)

In [31]:
X_train.shape,X_test.shape

((44262, 18092), (11066, 18092))

In [32]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has
# it and fall back to the legacy method otherwise.
_vocab = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
# Convert to plain str so the preview renders as an ordinary list of strings.
[str(term) for term in _vocab[:10]]



['00', '000', '00000000001', '00003', '000th', '025', '03', '047', '071', '10']

In [33]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when available. The vocabulary
# is fetched once and shared by both frames so their columns always match.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
new_X = pd.DataFrame(X_train, columns=feature_names, dtype=np.int8)
X_test_data = pd.DataFrame(X_test, columns=feature_names, dtype=np.int8)



In [34]:
new_X.head()

Unnamed: 0,00,000,00000000001,00003,000th,025,03,047,071,10,...,zooland,zoologist,zoom,zoroastrian,zsa,zucker,zuckerberg,zz,éclair,ünite
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
# Target vector taken from the 'is_sarcastic' label column.
y = train_df['is_sarcastic'].values

# Hold out 30% of the rows for validation; stratifying on y keeps the class
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    new_X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [37]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((30983, 18092), (13279, 18092), (30983,), (13279,))

## Building the Model

In [38]:
from sklearn.linear_model import LogisticRegression

In [39]:
lr = LogisticRegression(max_iter=200)
lr.fit(X_train,y_train)

LogisticRegression(max_iter=200)

In [40]:
y_pred = lr.predict(X_val)

from sklearn.metrics import accuracy_score, classification_report

lr.score(X_train, y_train)

0.9321886195655682

In [41]:
from sklearn.metrics import accuracy_score, classification_report

In [45]:
lr.score(X_train, y_train)

0.9321886195655682

In [47]:
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, so the value is unchanged, but the documented order keeps this
# call consistent with the other metric calls in the notebook.
accuracy_score(y_val, y_pred)

0.8512689208524739

In [48]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.89      0.87      7188
           1       0.87      0.80      0.83      6091

    accuracy                           0.85     13279
   macro avg       0.85      0.85      0.85     13279
weighted avg       0.85      0.85      0.85     13279



In [50]:
from sklearn.metrics import confusion_matrix

In [51]:
confusion_matrix(y_val, y_pred)

array([[6428,  760],
       [1215, 4876]])

In [52]:
from sklearn.naive_bayes import BernoulliNB

In [53]:
clf = BernoulliNB()

clf.fit(X_train,y_train)

BernoulliNB()

In [55]:
clf.score(X_train, y_train)

0.8952005938740599

In [56]:
y_pred = clf.predict(X_val)
accuracy_score(y_val, y_pred)

0.8393704345206717

In [57]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.88      0.86      7188
           1       0.85      0.79      0.82      6091

    accuracy                           0.84     13279
   macro avg       0.84      0.84      0.84     13279
weighted avg       0.84      0.84      0.84     13279



In [58]:
print(confusion_matrix(y_val, y_pred))

[[6330  858]
 [1275 4816]]


## Test Prediction

y_pred = lr.predict(X_test_data)

In [60]:
# Wrap the predicted labels in a single-column frame for the submission file.
test_data = pd.DataFrame({'prediction': y_pred})

In [61]:
test_data.to_csv('submission.csv',index=False)