# Fake news detection

### Importing required libraries
Here we are going to import some of the required libraries, if extra library is required to install It will be installed later on.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import re
import string
from sklearn.model_selection import RandomizedSearchCV

### Inserting fake and real datasets

In [2]:
df_fake = pd.read_csv("fakenew.csv")
df_true = pd.read_csv("truenew.csv")

In [3]:
df_fake.head(5)

Unnamed: 0,Statement
0,"The science says “open the schools, stop weari..."
1,Says Joe Biden didn’t act in response to the e...
2,A Texas energy company billed a customer more ...
3,"“If you make $50,000/year, $36 of your taxes g..."
4,"Donald Trump's second impeachment ""cost $33 mi..."


In [4]:
df_true.head(5)

Unnamed: 0,Statement
0,"“Energy experts and State House Dems, among ot..."
1,"""The Texas power grid is not part of the US po..."
2,Said Rep. Alexandria Ocasio-Cortez and her all...
3,"“There is a consensus among economists left, r..."
4,“The United States was energy independent in 2...


Inserting a column called "class" for fake and real news dataset to categorise fake and true news. 

In [5]:
df_fake["class"] = 0
df_true["class"] = 1

Removing last 10 rows from both the datasets, for manual testing 

In [6]:
df_fake.shape, df_true.shape

((30234, 2), (27167, 2))

In [7]:
df_fake_manual_testing = df_fake.tail(10)
for i in range(23480,23470,-1):
    df_fake.drop([i], axis = 0, inplace = True)
df_true_manual_testing = df_true.tail(5000)
for i in range(21416,21406,-1):
    df_true.drop([i], axis = 0, inplace = True)

In [8]:
df_fake.shape, df_true.shape

((30224, 2), (27157, 2))

Merging the manual testing dataframe in single dataset and save it in a csv file

In [9]:
df_fake_manual_testing["class"] = 0
df_true_manual_testing["class"] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [10]:
df_fake_manual_testing.head(10)

Unnamed: 0,Statement,class
30224,Seven Iranians freed in the prisoner swap have...,0
30225,#Hashtag Hell & The Fake Left,0
30226,Astroturfing: Journalist Reveals Brainwashing ...,0
30227,The New American Century: An Era of Fraud,0
30228,Hillary Clinton: ‘Israel First’ (and no peace ...,0
30229,McPain: John McCain Furious That Iran Treated ...,0
30230,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,0
30231,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,0
30232,How to Blow $700 Million: Al Jazeera America F...,0
30233,10 U.S. Navy Sailors Held by Iranian Military ...,0


In [11]:
df_true_manual_testing.head(10)

Unnamed: 0,Statement,class
22167,Trump leans toward replacing Fed chief if he w...,1
22168,"Obama, Saudi king discuss U.S.-Saudi ties, con...",1
22169,Lawmakers may ask Air Force to look at restart...,1
22170,Oklahoma can consider PTSD in sentencing veter...,1
22171,"Obama, Abu Dhabi Crown Prince discuss Yemen, L...",1
22172,War of words with pope no issue for Trump back...,1
22173,U.S. lawmaker wants hearing on bill to curb sh...,1
22174,'Reality' of 9/11 report less damaging than ru...,1
22175,"Senate passes bill to bolster power grid, spee...",1
22176,Supreme Court upholds Arizona legislative dist...,1


In [12]:
df_manual_testing = pd.concat([df_fake_manual_testing,df_true_manual_testing], axis = 0)
df_manual_testing.to_csv("manual_testing.csv")

Merging the main fake and true dataframe

In [13]:
df_marge = pd.concat([df_fake, df_true], axis =0 )
df_marge.head(10)

Unnamed: 0,Statement,class
0,"The science says “open the schools, stop weari...",0
1,Says Joe Biden didn’t act in response to the e...,0
2,A Texas energy company billed a customer more ...,0
3,"“If you make $50,000/year, $36 of your taxes g...",0
4,"Donald Trump's second impeachment ""cost $33 mi...",0
5,"Says Ted Cruz tweeted, “I’ll believe in climat...",0
6,The snow is government-generated or fake.,0
7,Photo shows an ice mass hanging inside a Houst...,0
8,"""Our wind and our solar got shut down ... and ...",0
9,“FEMA is paying for hotel rooms!!!” for Texas ...,0


In [14]:
df_marge.columns

Index(['Statement', 'class'], dtype='object')

In [15]:
df_marge.isnull().sum()

Statement    0
class        0
dtype: int64

#### Randomly shuffling the dataframe 

In [16]:
df = df_marge.sample(frac = 1)

In [17]:
df.head()

Unnamed: 0,Statement,class
1447,“We have an America where … hard-working famil...,0
9545,Every 1 percent increase in the number of Ohio...,1
24940,France's conservatives choose leader to rattle...,1
2535,"Says Democratic Senators ""demand Supreme Court...",0
25132,Ukraine shelves controversial corruption law a...,1


In [18]:
df.reset_index(inplace = True)
df.drop(["index"], axis = 1, inplace = True)

In [19]:
df.columns

Index(['Statement', 'class'], dtype='object')

In [20]:
df.head()

Unnamed: 0,Statement,class
0,“We have an America where … hard-working famil...,0
1,Every 1 percent increase in the number of Ohio...,1
2,France's conservatives choose leader to rattle...,1
3,"Says Democratic Senators ""demand Supreme Court...",0
4,Ukraine shelves controversial corruption law a...,1


#### Creating a function to convert the text in lowercase, remove the extra space, special chr., ulr and links.

In [21]:
def wordopt(Statement):
    Statement = Statement.lower()
    Statement = re.sub('\[.*?\]', '', Statement)
    Statement = re.sub("\\W"," ",Statement) 
    Statement = re.sub('https?://\S+|www\.\S+', '', Statement)
    Statement = re.sub('<.*?>+', '', Statement)
    Statement= re.sub('[%s]' % re.escape(string.punctuation), '', Statement)
    Statement= re.sub('\n', '', Statement)
    Statement = re.sub('\w*\d\w*', '', Statement)    
    return Statement

In [22]:
df["Statement"] = df["Statement"].apply(wordopt)

#### Defining dependent and independent variable as x and y

In [23]:
x = df["Statement"]
y = df["class"]

#### Splitting the dataset into training set and testing set. 

In [24]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)

#### Convert text to vectors

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [26]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

### Classifiers

### 1. Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
LR = LogisticRegression()
LR.fit(xv_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [29]:
pred_lr=LR.predict(xv_test)

In [30]:
LR.score(xv_test, y_test)

0.849714206050467

In [31]:
print(classification_report(y_test, pred_lr))
print(confusion_matrix(y_test, pred_lr))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85      7568
           1       0.82      0.87      0.84      6778

    accuracy                           0.85     14346
   macro avg       0.85      0.85      0.85     14346
weighted avg       0.85      0.85      0.85     14346

[[6315 1253]
 [ 903 5875]]


### 2. Decision Tree 

In [32]:
from sklearn.tree import DecisionTreeClassifier

In [33]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [34]:
pred_dt = DT.predict(xv_test)

In [35]:
DT.score(xv_test, y_test)

0.7837027742924857

In [36]:
print(classification_report(y_test, pred_dt))
print(confusion_matrix(y_test, pred_dt))

              precision    recall  f1-score   support

           0       0.80      0.79      0.79      7568
           1       0.77      0.77      0.77      6778

    accuracy                           0.78     14346
   macro avg       0.78      0.78      0.78     14346
weighted avg       0.78      0.78      0.78     14346

[[5994 1574]
 [1529 5249]]


### 3. Gradient Boosting

In [37]:
from sklearn.ensemble import GradientBoostingClassifier

In [38]:
GBC = GradientBoostingClassifier(random_state=0)
GBC.fit(xv_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [39]:
pred_gbc = GBC.predict(xv_test)

In [40]:
GBC.score(xv_test, y_test)

0.7711557228495748

In [41]:
print(classification_report(y_test, pred_gbc))
print(confusion_matrix(y_test, pred_gbc))

              precision    recall  f1-score   support

           0       0.85      0.69      0.76      7568
           1       0.71      0.86      0.78      6778

    accuracy                           0.77     14346
   macro avg       0.78      0.78      0.77     14346
weighted avg       0.78      0.77      0.77     14346

[[5241 2327]
 [ 956 5822]]


### 4. Random Forest 

In [42]:
from sklearn.ensemble import RandomForestClassifier

In [43]:
RFC = RandomForestClassifier(random_state=0)
RFC.fit(xv_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [44]:
pred_rfc = RFC.predict(xv_test)

In [45]:
RFC.score(xv_test, y_test)

0.8192527533807333

In [46]:
print(classification_report(y_test, pred_rfc))
print(confusion_matrix(y_test, pred_rfc))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83      7568
           1       0.82      0.79      0.81      6778

    accuracy                           0.82     14346
   macro avg       0.82      0.82      0.82     14346
weighted avg       0.82      0.82      0.82     14346

[[6370 1198]
 [1395 5383]]


# 5. KNeighbors Classifier #

In [47]:
from sklearn.neighbors import KNeighborsClassifier
KN = KNeighborsClassifier(n_neighbors=5)
KN.fit(xv_train, y_train)
pred_kn = KN.predict(xv_test)

In [48]:
KN.score(xv_test, y_test)

0.7971560016729402

In [49]:
print(classification_report(y_test, pred_kn))
print(confusion_matrix(y_test, pred_kn))

              precision    recall  f1-score   support

           0       0.85      0.75      0.80      7568
           1       0.75      0.85      0.80      6778

    accuracy                           0.80     14346
   macro avg       0.80      0.80      0.80     14346
weighted avg       0.80      0.80      0.80     14346

[[5685 1883]
 [1027 5751]]


# Model Testing With Manual Entry

### News

In [50]:
def output_label(n):
    global count
    if n == 0:
        return "Fake News"
    elif n == 1:
        count=count+1
        return "Not A Fake News"
    
def manual_testing(news):
    global count
    testing_news = {"text":[news]}
    new_def_test = pd.DataFrame(testing_news)
    new_x_test = new_def_test["text"]
    new_xv_test = vectorization.transform(new_x_test)
    pred_LR = LR.predict(new_xv_test)
    pred_DT = DT.predict(new_xv_test)
    pred_GBC = GBC.predict(new_xv_test)
    pred_RFC = RFC.predict(new_xv_test)
    pred_KN = KN.predict(xv_test)
    output_label(pred_LR[0])
    output_label(pred_DT[0])
    output_label(pred_GBC[0])
    output_label(pred_RFC[0])
    output_label(pred_KN[0])
    #print(count/5)
    if count/5>0 and count/5<=0.25:
        return print("This news is false")
    elif count/5>0.25 and count/5<=0.5:
        return print("This news might be false")
    elif count/5>0.5 and count/5<=0.75:
        return print("This news might be true")
    if count/5>0.75 and count/5<=1:
        print("This news is true")


    

    '''return print("\n\nLR Prediction: {} \nDT Prediction: {} \nGBC Prediction: {} \nRFC Prediction: {} \nKNN Prediction: {}".format(output_label(pred_LR[0]), 
                                                                                                              output_label(pred_DT[0]), 
                                                                                                              output_label(pred_GBC[0]), 
                                                                                                              output_label(pred_RFC[0]),
                                                                                                              output_label(pred_KN[0])))'''

In [51]:
import pickle

pickle.dump(LR, open("LR.pickle ","wb"))
pickle.dump(DT, open("DT.pickle ","wb"))
pickle.dump(GBC, open("GBC.pickle ","wb"))
pickle.dump(RFC, open("RFC.pickle ","wb"))
pickle.dump(KN, open("KN.pickle ","wb"))

pickle.dump(vectorization, open("vector.pkl", "wb"))

In [52]:
vect = pickle.load(open("vector.pkl","rb"))
RandomFC = pickle.load(open("RFC.pickle ","rb"))

In [53]:
news = str(input())

testing_news = {"text":[news]}
new_def_test = pd.DataFrame(testing_news)
new_x_test = new_def_test["text"]
new_xv_test = vect.transform(new_x_test)
pred_RFC = RandomFC.predict(new_xv_test)

if pred_RFC == 0:
    print("This news is Fake 🙁")
else:
    print("This news is True 😃")

efhqioq
This news is True 😃
