In [45]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import re, os

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

from keras.layers import Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot


In [46]:
test_data = pd.read_csv("/home/hasan/DATA SET/Fake News/test.csv")

In [47]:
test_data.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


### Handling nan value

In [48]:
test_data.isnull().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [49]:
test_data['title'].fillna(method='ffill', inplace=True)
test_data['author'].fillna(method='bfill', inplace=True)
test_data['text'].fillna(method='ffill', inplace=True)

In [50]:
test_data['title'][0]

'Specter of Trump Loosens Tongues, if Not Purse Strings, in Silicon Valley - The New York Times'

In [51]:
#copy the dataset
X = test_data.copy()
print('Shape of X is :',X.shape)

Shape of X is : (5200, 4)


In [52]:
#resetting index
X.reset_index(inplace=True)
X.drop('index', axis=1, inplace=True)

In [53]:
X.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,Common Dreams,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


### Data Cleaning

In [54]:
ps = PorterStemmer()

In [55]:
corpus = []
for i in range(0, len(X)):
    review = re.sub('[^a-zA-Z]', ' ',X['title'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)
    

In [56]:
corpus[3]

'tim tebow attempt anoth comeback time basebal new york time'

### One Hot Encoding

In [57]:
one_hot_repre = [one_hot(words, 500) for words in corpus]

In [58]:
#printing 
for i in range(0,5):
    print(one_hot_repre[i])

[365, 213, 279, 130, 479, 473, 51, 1, 168, 128, 77]
[376, 285, 71, 5, 261, 108, 413]
[90, 463, 97, 173, 39, 128, 89, 400, 85, 480]
[366, 493, 272, 488, 442, 77, 328, 168, 128, 77]
[10, 159, 209, 121, 489]


### Embedding Representation

In [59]:
embedded = pad_sequences(one_hot_repre, padding='post', maxlen=20)

In [60]:
print(embedded)
print('\n\nShape of embedded is :',embedded.shape)

[[365 213 279 ...   0   0   0]
 [376 285  71 ...   0   0   0]
 [ 90 463  97 ...   0   0   0]
 ...
 [261 151 361 ...   0   0   0]
 [404 203 210 ...   0   0   0]
 [389 130 121 ...   0   0   0]]


Shape of embedded is : (5200, 20)


In [61]:
embedded[2]

array([ 90, 463,  97, 173,  39, 128,  89, 400,  85, 480,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0], dtype=int32)

### Converting to Array

In [62]:
X_final = np.array(embedded)
print('Shape of X_final is :',X_final.shape)

Shape of X_final is : (5200, 20)


In [64]:
df = pd.DataFrame(X_final)

In [66]:
df.to_csv('/home/hasan/DATA SET/Fake News/processed test data.csv')