In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import random, re

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer


In [24]:
# Location of the raw test split; kept in one named constant so the path is
# easy to spot and change when the notebook runs on another machine.
TEST_CSV_PATH = '/home/hasan/DATA SET/Fake News/test.csv'
test_data = pd.read_csv(TEST_CSV_PATH)

In [25]:
# Preview the first rows to check the columns that were loaded.
test_data.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [26]:
# (number of rows, number of columns) of the loaded frame.
test_data.shape

(5200, 4)

# Feature Engineering

In [27]:
# Per-column dtypes: a quick check of which columns are text (object) and which are numeric.
test_data.dtypes

id         int64
title     object
author    object
text      object
dtype: object

In [28]:
# Count missing values per column (isna is the same check as isnull).
test_data.isna().sum()

id          0
title     122
author    503
text        7
dtype: int64

In [29]:
#total number and percent of null values, one line per text column
total_rows = len(test_data['title'])
for column in ('title', 'author', 'text'):
    null_count = test_data[column].isnull().sum()
    print('null value number = {} and percent = {} '.format(null_count, (null_count * 100) / total_rows))

null value number = 122 and percent = 2.3461538461538463 
null value number = 503 and percent = 9.673076923076923 
null value number = 7 and percent = 0.1346153846153846 


In [30]:
#fill up null data
# The filled column is assigned back instead of calling
# test_data[col].fillna(method=..., inplace=True): that form mutates a
# temporary Series under pandas Copy-on-Write (so the frame can stay
# unchanged), and fillna(method=...) is deprecated in favour of ffill()/bfill().
#
# ffill copies the previous row's value and bfill copies the next row's value.
# The trailing fillna('') only takes effect when the very first row (ffill) or
# the very last row (bfill) is itself null, which directional filling cannot
# repair; without it a leftover NaN would break the later regex cleaning step.
# NOTE(review): borrowing a neighbouring article's title/author/text is kept
# from the original notebook -- confirm it matches the training-set preprocessing.
test_data['title'] = test_data['title'].ffill().fillna('')
test_data['author'] = test_data['author'].bfill().fillna('')
test_data['text'] = test_data['text'].ffill().fillna('')

In [31]:
#verify that no missing values remain after filling
test_data.isna().sum()

id        0
title     0
author    0
text      0
dtype: int64

In [32]:
#resetting index
# drop=False (the default) keeps the previous index as a new 'index' column;
# that helper column is removed again in the following cell.
test_data = test_data.reset_index(drop=False)

In [34]:
# Remove the helper 'index' column created by reset_index().
# errors='ignore' keeps this cell re-runnable: a second execution (when the
# column is already gone) no longer raises KeyError. The result is assigned
# back rather than mutated in place.
test_data = test_data.drop(columns='index', errors='ignore')

# The preview is the cell's last expression so that it is actually rendered
# (a bare head() call in the middle of a cell displays nothing).
test_data.head()

In [38]:
#show the raw text of the row labelled 0, before any cleaning
test_data.loc[0, 'text']

'PALO ALTO, Calif.  —   After years of scorning the political process, Silicon Valley has leapt into the fray. The prospect of a President Donald J. Trump is pushing the tech community to move beyond its traditional role as donors and to embrace a new existence as agitators and activists. A distinguished venture capital firm emblazoned on its corporate home page an earthy   epithet. One prominent tech chieftain says the consequences of Mr. Trump’s election would “range between disastrous and terrible. ” Another compares him to a dictator. And nearly 150 tech leaders signed an open letter decrying Mr. Trump and his campaign of “anger” and “bigotry. ” Not quite all the action is  . Peter Thiel, a founder of PayPal and Palantir who was the first outside investor in Facebook, spoke at the Republican convention in July. The New York Times reported on Saturday that Mr. Thiel is giving $1. 25 million to support Mr. Trump’s candidacy even as other supporters flee. (He also recently gave $1 mil

In [40]:
# Porter stemmer instance, reused for every token in the cleaning loop below.
ps = PorterStemmer()

In [43]:
#Data cleaning and stemming
# Build the stop-word lookup once. Calling stopwords.words('english') inside
# the comprehension re-loaded the list for every single token and then scanned
# it linearly; a set built up front gives the same filtering with O(1) lookups.
english_stopwords = set(stopwords.words('english'))

# Anything that is not an ASCII letter is replaced by a space, so digits and
# punctuation act as word separators.
non_letter_pattern = re.compile('[^a-zA-Z]')

# corpus[i] is the cleaned text of the i-th row: lower-cased, stop words
# removed, every remaining token stemmed, tokens re-joined with single spaces.
corpus = []
for raw_text in test_data['text']:
    tokens = non_letter_pattern.sub(' ', raw_text).lower().split()
    stemmed_tokens = [ps.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(stemmed_tokens))

In [44]:
# Spot-check one cleaned document to see the effect of stop-word removal and stemming.
corpus[3]

'first succeed tri differ sport tim tebow heisman quarterback univers florida unabl hold n f l job pursu career major leagu basebal hold workout l b team month agent told espn news outlet may sound like public stunt noth could truth said brodi van wagenen caa basebal part sport agenc caa sport statement seen tim workout peopl insid outsid industri scout execut player fan impress talent decad sinc tebow play basebal full time mean comeback would easi task former major leagu catcher chad moeller said statement train tebow arizona said beyond impress tim athletic swing see bat speed power real basebal talent moeller said truli believ tim skill set potenti achiev goal play major leagu base seen past two month could happen rel quickli take gari sheffield former outfield news tebow attempt comeback basebal greet skeptic twitter junior neas high pont vedra fla tebow drew attent major leagu scout bat four home run left fielder ditch bat glove favor pigskin lead florida two nation championship 

In [47]:
#TF-IDF features over 1- to 3-grams, limited to 5000 features (max_features)
# NOTE(review): the vectorizer is fitted on the test corpus here. If X is meant
# to be scored by a model trained elsewhere, the vectorizer fitted on the
# training data should be reused with transform() instead -- confirm.
tfidf = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
tfidf_matrix = tfidf.fit_transform(corpus)

# Convert the fit_transform result to a dense numpy array.
X = tfidf_matrix.toarray()
print('Shape of X is :', X.shape)

Shape of X is : (5200, 5000)


In [48]:
#printing some feature names
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older installations
# that do not provide it yet.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# Convert to plain str so the display is the same list of strings regardless
# of whether a list or a numpy array was returned.
[str(name) for name in feature_names[:10]]

['aaron',
 'abandon',
 'abba',
 'abc',
 'abc news',
 'abe',
 'abedin',
 'abil',
 'abl',
 'aboard']

In [49]:
# Show the vectorizer's full configuration: the values set above
# (max_features, ngram_range) together with every default left unchanged.
tfidf.get_params()

{'analyzer': 'word',
 'binary': False,
 'decode_error': 'strict',
 'dtype': numpy.float64,
 'encoding': 'utf-8',
 'input': 'content',
 'lowercase': True,
 'max_df': 1.0,
 'max_features': 5000,
 'min_df': 1,
 'ngram_range': (1, 3),
 'norm': 'l2',
 'preprocessor': None,
 'smooth_idf': True,
 'stop_words': None,
 'strip_accents': None,
 'sublinear_tf': False,
 'token_pattern': '(?u)\\b\\w\\w+\\b',
 'tokenizer': None,
 'use_idf': True,
 'vocabulary': None}

In [53]:
# Display the feature matrix; numpy abbreviates large arrays, so only the
# corner values are shown.
X

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.0350673, 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]])

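`X` is now the processed test data: a dense TF-IDF matrix with one row per article (5200 rows) and one column per retained n-gram feature (5000 columns).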