In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize,sent_tokenize
import re
import string

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df_fake = pd.read_csv('../input/fake-and-real-news-dataset/Fake.csv')
df_true = pd.read_csv('../input/fake-and-real-news-dataset/True.csv')

In [3]:
print(df_fake.shape)
print(df_true.shape)

(23481, 4)
(21417, 4)


In [4]:
print(df_fake.head())

                                               title  \
0   Donald Trump Sends Out Embarrassing New Year’...   
1   Drunk Bragging Trump Staffer Started Russian ...   
2   Sheriff David Clarke Becomes An Internet Joke...   
3   Trump Is So Obsessed He Even Has Obama’s Name...   
4   Pope Francis Just Called Out Donald Trump Dur...   

                                                text subject  \
0  Donald Trump just couldn t wish all Americans ...    News   
1  House Intelligence Committee Chairman Devin Nu...    News   
2  On Friday, it was revealed that former Milwauk...    News   
3  On Christmas day, Donald Trump announced that ...    News   
4  Pope Francis used his annual Christmas Day mes...    News   

                date  
0  December 31, 2017  
1  December 31, 2017  
2  December 30, 2017  
3  December 29, 2017  
4  December 25, 2017  


In [5]:
print(df_true.head())

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   
4  December 29, 2017   


In [6]:
df_fake.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23481 entries, 0 to 23480
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    23481 non-null  object
 1   text     23481 non-null  object
 2   subject  23481 non-null  object
 3   date     23481 non-null  object
dtypes: object(4)
memory usage: 733.9+ KB


In [7]:
df_true.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21417 entries, 0 to 21416
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    21417 non-null  object
 1   text     21417 non-null  object
 2   subject  21417 non-null  object
 3   date     21417 non-null  object
dtypes: object(4)
memory usage: 669.4+ KB


In [8]:
df_fake.describe()

Unnamed: 0,title,text,subject,date
count,23481,23481.0,23481,23481
unique,17903,17455.0,6,1681
top,MEDIA IGNORES Time That Bill Clinton FIRED His...,,News,"May 10, 2017"
freq,6,626.0,9050,46


In [9]:
df_true.describe()

Unnamed: 0,title,text,subject,date
count,21417,21417,21417,21417
unique,20826,21192,2,716
top,Factbox: Trump fills top jobs for his administ...,(Reuters) - Highlights for U.S. President Dona...,politicsNews,"December 20, 2017"
freq,14,8,11272,182


In [10]:
df_fake.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [11]:
df_true.isnull().sum()

title      0
text       0
subject    0
date       0
dtype: int64

In [12]:
df_fake['Label'] = 1
df_true['Label'] = 0
df_total = pd.concat([df_true,df_fake],axis=0,ignore_index=True)

In [13]:
print(df_total.shape)
print(df_total.head())

(44898, 5)
                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  Label  
0  December 31, 2017       0  
1  December 29, 2017       0  
2  December 31, 2017       0  
3  December 30, 2017       0  
4  December 29, 2017       0  


In [14]:
df_total['text'] = df_total['title'] + df_total['text']
df_total = df_total.drop(['title'], axis=1)

In [15]:
def clean_text(text):
    
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('Reuters','',text)
    return text

In [16]:
df_total['text'] = df_total['text'].apply(lambda x:clean_text(x))

In [17]:
stop_words = stopwords.words('english')
df_total['text'] = df_total['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [18]:
def lemmatize_words(text):
    wnl = nltk.stem.WordNetLemmatizer()
    lem = ' '.join([wnl.lemmatize(word) for word in text.split()])    
    return lem

In [19]:
df_total['text'] = df_total['text'].apply(lemmatize_words)

In [20]:
x_ = df_total['text']
y_ = df_total['Label']
x_train, x_test, y_train, y_test = train_test_split(x_, y_,test_size=0.20,random_state=16)

In [21]:
count_vectorizer = CountVectorizer(stop_words='english')
train_count_vectorizer = count_vectorizer.fit_transform(x_train.values)
test_count_vectorizer = count_vectorizer.transform(x_test.values)


In [22]:
print(train_count_vectorizer.shape)
print(test_count_vectorizer.shape)

(35918, 189016)
(8980, 189016)


In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [24]:
clf_mnb = MultinomialNB()
clf_mnb.fit(train_count_vectorizer, y_train)
pred_mnb = clf_mnb.predict(test_count_vectorizer)

In [25]:
print(accuracy_score(y_test, pred_mnb))
print(recall_score(y_test, pred_mnb))
print(precision_score(y_test, pred_mnb))
print(f1_score(y_test, pred_mnb))

0.9525612472160356
0.9532571912013537
0.9564940577249575
0.9548728813559323


In [26]:
from sklearn.linear_model import LogisticRegression
clf_lr = LogisticRegression()
clf_lr.fit(train_count_vectorizer, y_train)
pred_lr = clf_lr.predict(test_count_vectorizer)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [27]:
print(accuracy_score(y_test, pred_lr))
print(recall_score(y_test, pred_lr))
print(precision_score(y_test, pred_lr))
print(f1_score(y_test, pred_lr))

0.9908685968819599
0.9928087986463621
0.9898776887389287
0.9913410770855332
