In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus  import stopwords

In [2]:
fake_news = pd.read_csv("../input/fake-news-detection/Fake.csv")
true_news = pd.read_csv("../input/fake-news-detection/True.csv")
fake_news.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [3]:
true_news.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [4]:
print(fake_news.info())
print(true_news.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB
None


In [5]:
print(fake_news.subject.unique())
print(true_news.subject.unique())

['News' 'politics' 'Government News' 'left-news' 'US_News' 'Middle-east']
['politicsNews' 'worldnews']


In [6]:
fake_news['class'] = 0
true_news['class'] = 1

In [7]:
true_news.shape

(21417, 5)

In [8]:
merge_data = pd.concat([true_news,fake_news],axis=0)

In [9]:
merge_data.shape

(44898, 5)

In [10]:
merge_data= merge_data.sample(frac=1).reset_index()

In [11]:
merge_data.head()

Unnamed: 0,index,title,text,subject,date,class
0,8938,"Trump vows to reopen, or toss, NAFTA pact with...","MONESSEN, Pennsylvania/WASHINGTON (Reuters) - ...",politicsNews,"June 28, 2016",1
1,21654,AIR FORCE WILL EASE POLICY ON DISCHARGING TRAN...,"As a side note, I m just curious when did the ...",left-news,"Jun 11, 2015",0
2,19967,DR. WOLF CALLS OUT HILLARY For Lying About Pne...,Dr. Milton Wolf seems to have take offense at ...,left-news,"Sep 16, 2016",0
3,17030,PRISON PORK BAN: We Have Great News For Prison...,Just start asking questions and the Obama admi...,Government News,"Oct 17, 2015",0
4,624,Republican Senator Johnson opposes Senate tax ...,WASHINGTON (Reuters) - Republican U.S. Senator...,politicsNews,"November 15, 2017",1


In [12]:
merge_data.drop(['index','title','subject','date'],axis=1,inplace=True)
merge_data.head()

Unnamed: 0,text,class
0,"MONESSEN, Pennsylvania/WASHINGTON (Reuters) - ...",1
1,"As a side note, I m just curious when did the ...",0
2,Dr. Milton Wolf seems to have take offense at ...,0
3,Just start asking questions and the Obama admi...,0
4,WASHINGTON (Reuters) - Republican U.S. Senator...,1


In [13]:
import re
from string import punctuation
from nltk.stem import SnowballStemmer

In [14]:
stopword = stopwords.words('english')

In [15]:
len(stopword)

179

In [16]:
def clean_text(text,stop_words =True,stem_words = True):
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub("\\W"," ",text) 
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    
    text = ''.join([c for c in text if c not in punctuation])
    
    if stop_words:
        text = text.split()
        text = [w for w in text if not w in stopword]
        text = " ".join(text)
        
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
        
    return text

In [17]:
merge_data['text'] = merge_data['text'].apply(clean_text)

In [18]:
merge_data

Unnamed: 0,text,class
0,monessen pennsylvania washington reuter republ...,1
1,side note curious world agre forfeit rainbow s...,0
2,dr milton wolf seem take offens diagnosi hilla...,0
3,start ask question obama administr revers cour...,0
4,washington reuter republican u senat ron johns...,1
...,...,...
44893,influenc peddl clinton time could hope consequ,0
44894,better blow doll actual citizen citi l beck ge...,0
44895,shock liber senat dian feinstein call closer l...,0
44896,cairo reuter sudan summon u charg affair sunda...,1


In [19]:
x = merge_data['text']
y = merge_data['class']

In [20]:
test_data= x.iloc[-10:]
train_data = x.iloc[:-10]
y_testl = y.iloc[-10:]
y_train = y.iloc[:-10]

In [21]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(train_data,y_train,test_size=0.25)

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
tiv = TfidfVectorizer()
x_train_vector = tiv.fit_transform(x_train)
x_test_vector = tiv.transform(x_test)


In [23]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train_vector,y_train)

LogisticRegression()

In [24]:
from sklearn.metrics import accuracy_score,classification_report

In [25]:
tst_preds = lr.predict(x_test_vector)

In [26]:
print(lr.score(x_test_vector,y_test))

0.9863660666547852


In [27]:
print('accuracy : {} '.format(accuracy_score(y_test,tst_preds)))
print('\n')
print( 'classification_report : {}'.format(classification_report(y_test,tst_preds)))

accuracy : 0.9863660666547852 


classification_report :               precision    recall  f1-score   support

           0       0.99      0.99      0.99      5915
           1       0.98      0.99      0.99      5307

    accuracy                           0.99     11222
   macro avg       0.99      0.99      0.99     11222
weighted avg       0.99      0.99      0.99     11222



In [28]:
print(tst_preds)

[1 0 0 ... 1 0 0]


In [29]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(x_train_vector,y_train)

GradientBoostingClassifier()

In [30]:
tst1_preds = gbc.predict(x_test_vector)

print('accuracy : {} '.format(accuracy_score(y_test,tst1_preds)))
print('\n')
print( 'classification_report : {}'.format(classification_report(y_test,tst1_preds)))

accuracy : 0.9951880235252183 


classification_report :               precision    recall  f1-score   support

           0       1.00      0.99      1.00      5915
           1       0.99      1.00      0.99      5307

    accuracy                           1.00     11222
   macro avg       1.00      1.00      1.00     11222
weighted avg       1.00      1.00      1.00     11222



In [31]:
test_datav = tiv.transform(test_data)

In [32]:
preds = (lr.predict(test_datav) + gbc.predict(test_datav))/2

In [33]:
preds = preds.astype(int)

In [34]:
accuracy_score(y_testl,preds)

1.0