In [1]:
import re
import pandas as pd
import numpy as np
import nltk
import seaborn as sb
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords

In [2]:
df = pd.read_csv('../DataSets/news.csv')

In [3]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
id        20800 non-null int64
title     20242 non-null object
author    18843 non-null object
text      20761 non-null object
label     20800 non-null int64
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [5]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [6]:
df = df.dropna(subset=['title','text'])

In [7]:
df.isnull().sum()

id           0
title        0
author    1918
text         0
label        0
dtype: int64

In [8]:
df['text'][10]

'Organizing for Action, the activist group that morphed from Barack Obama’s first presidential campaign, has partnered with the   Indivisible Project for “online trainings” on how to protest President Donald Trump’s agenda. [Last week, Breitbart News extensively reported that Indivisible leaders are openly associated with groups financed by billionaire George Soros.  Politico earlier this month profiled Indivisible in an article titled, “Inside the protest movement that has Republicans reeling. ”  The news agency not only left out the Soros links, but failed to note that the organizations cited in its article as helping to amplify Indivisible’s message are either financed directly by Soros or have close ties to groups funded by the billionaire, as Breitbart News documented. Organizing for Action (OFA) is a   community organizing project that sprung from Obama’s 2012 campaign organization, Organizing for America, becoming a nonprofit described by the Washington Post as “advocate[ing] fo

In [9]:
x = df.drop(columns=['label','id'])
y = df['label']

In [10]:
x.shape, y.shape

((20203, 3), (20203,))

In [11]:
copy = x.copy()

copy.reset_index(inplace=True)

In [12]:
ws = WordNetLemmatizer()
list_titles = []
for i in range(0, len(copy)):
  headline = re.sub('[^a-zA-Z]', ' ', copy['title'][i])   
  headline = headline.lower()
  headline = word_tokenize(headline)
  headline = [ws.lemmatize(word) for word in headline if word not in stopwords.words("english")]
  headline = ' '.join(headline)
  list_titles.append(headline) 

In [13]:
list_titles[:4]

['house dem aide even see comey letter jason chaffetz tweeted',
 'flynn hillary clinton big woman campus breitbart',
 'truth might get fired',
 'civilian killed single u airstrike identified']

In [14]:
vocab = 10000
hot_title = [one_hot(i, vocab) for i in list_titles]
hot_title[:4]

[[2474, 5620, 8004, 9858, 2600, 6968, 7749, 7470, 4667, 6424],
 [3461, 1477, 855, 8943, 3947, 9638, 4392],
 [5247, 1565, 9776, 8814],
 [4925, 4951, 2331, 1249, 9404, 4074]]

In [15]:
longest = len(max(list_titles, key = len))
longest

356

In [16]:
max_length = 356
embed_input = pad_sequences(hot_title, maxlen = max_length, padding='pre')
print(embed_input)

[[   0    0    0 ... 7470 4667 6424]
 [   0    0    0 ... 3947 9638 4392]
 [   0    0    0 ... 1565 9776 8814]
 ...
 [   0    0    0 ... 3120 8678 6998]
 [   0    0    0 ... 6004 9569 6092]
 [   0    0    0 ... 1874 3514   65]]


In [17]:
model = Sequential() #creating the sequential model incrementally vi the add() method
model.add(Embedding(input_dim=vocab, output_dim= 40, input_length=356))
model.add(LSTM(150))
model.add(Dense(1, activation='sigmoid'))#sigmoid returns a value close to zero
#config the model with losses and metrics - compile()
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 356, 40)           400000    
_________________________________________________________________
lstm (LSTM)                  (None, 150)               114600    
_________________________________________________________________
dense (Dense)                (None, 1)                 151       
Total params: 514,751
Trainable params: 514,751
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
len(embed_input),y.shape

(20203, (20203,))

In [19]:
x_final = np.array(embed_input)
y_final = np.array(y)
x_final.shape, y_final.shape 

((20203, 356), (20203,))

In [20]:
from sklearn.model_selection import train_test_split 
x_train, x_test, y_train, y_test = train_test_split(x_final, y_final, test_size=0.33, random_state=42)

In [21]:
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=10, batch_size=64) 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2cac824acc0>

In [22]:
y_pred = model.predict(x_test)

In [23]:
y_pred

array([[9.9777544e-01],
       [9.9977338e-01],
       [3.7813813e-06],
       ...,
       [9.9999404e-01],
       [6.3971144e-01],
       [9.9999964e-01]], dtype=float32)

In [24]:
y_pred1 = []
for i in y_pred:
    if i < 0.5:
        y_pred1.append(0)
    else:
        y_pred1.append(1)

In [26]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred1)

0.9149542522873856

In [28]:
 from sklearn.metrics import confusion_matrix
 confusion_matrix(y_test, y_pred1)

array([[3148,  272],
       [ 295, 2952]], dtype=int64)

In [29]:
model.evaluate(embed_input, y)



[0.1921045184135437, 0.9719348549842834]