<a href="https://colab.research.google.com/github/ibrahim-saleem/projects/blob/main/Fake_News_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Fake News Detection by *Syed Ibrahim Saleem*

### Use the link below to download the dataset

**Dataset**: https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [None]:
# Upload Fake.csv.zip and True.csv.zip from the local machine (Colab only).
# The result is bound to a name so the returned mapping of uploaded files is
# not echoed as this cell's output.
from google.colab import files
uploaded = files.upload()

Saving Fake.csv.zip to Fake.csv.zip
Saving True.csv.zip to True.csv.zip


In [None]:
# Read both uploaded archives; read_csv's default compression='infer'
# handles the .zip extension, so no manual unzip step is needed.
fake, true = (pd.read_csv(archive) for archive in ('Fake.csv.zip', 'True.csv.zip'))

In [None]:
# (rows, columns) of the fake frame, then of the true frame.
(fake.shape, true.shape)

In [None]:
# Preview the first five rows of the fake-news frame.
fake.head(5)

In [None]:
# Preview the first five rows of the true-news frame.
true.head(5)

In [None]:
# Concatenate dataframes
# `target` is read by the train/test split further down, but no cell in this
# notebook creates it. Label each source here so that a fresh
# Restart & Run All works; assign() returns copies, so `fake` and `true`
# themselves are left untouched.
data = pd.concat(
    [fake.assign(target='fake'), true.assign(target='true')]
).reset_index(drop = True)
data.shape

(44898, 5)

In [None]:
# First five rows of the combined frame.
data.head()

In [None]:
# Last five rows of the combined frame.
data.tail()

In [None]:
# Shuffle the data
from sklearn.utils import shuffle
# A fixed seed makes the row order (and everything printed from it below)
# identical on every run; 42 matches the seed used for the split and the model.
data = shuffle(data, random_state=42).reset_index(drop=True)

In [None]:
# Inspect the first rows after shuffling
data.head(n=5)

In [None]:
# Print column names, non-null counts and dtypes of the combined frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   target   44898 non-null  object
dtypes: object(5)
memory usage: 1.7+ MB


In [None]:
# Remove the title
# Rebind instead of dropping in place, and tolerate an already-removed
# column, so re-running this cell does not raise KeyError.
data = data.drop(columns=["title"], errors="ignore")
data.head()

Unnamed: 0,text,subject,date,target
0,tune in to the alternate current radio network...,US_News,"October 27, 2017",fake
1,how careless are these people clinton s righth...,politics,"Aug 25, 2016",fake
2,beirut reuters a saudi official has visited n...,worldnews,"October 19, 2017",true
3,shades of baltimore how can citizens or visito...,politics,"Mar 18, 2016",fake
4,former gop congressman joe walsh rill has alwa...,News,"May 3, 2017",fake


In [None]:
# Remove the date
# Rebind instead of dropping in place, and tolerate an already-removed
# column, so re-running this cell does not raise KeyError.
data = data.drop(columns=["date"], errors="ignore")
data.head()

Unnamed: 0,text,subject,target
0,tune in to the alternate current radio network...,US_News,fake
1,how careless are these people clinton s righth...,politics,fake
2,beirut reuters a saudi official has visited n...,worldnews,true
3,shades of baltimore how can citizens or visito...,politics,fake
4,former gop congressman joe walsh rill has alwa...,News,fake


In [None]:
# Convert to lower-case
# The vectorised .str accessor replaces the per-row lambda; missing values
# pass through as NaN instead of raising AttributeError.
data['text'] = data['text'].str.lower()
data.head()

Unnamed: 0,text,subject,target
0,tune in to the alternate current radio network...,US_News,fake
1,how careless are these people clinton s righth...,politics,fake
2,beirut reuters a saudi official has visited n...,worldnews,true
3,shades of baltimore how can citizens or visito...,politics,fake
4,former gop congressman joe walsh rill has alwa...,News,fake


In [None]:
# Remove punctuation
import string

# Translation table built once: maps every character in string.punctuation
# to "delete".
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def punctuation_removal(text):
    """Return `text` with every character in `string.punctuation` removed.

    Characters outside `string.punctuation` (letters, digits, whitespace,
    non-ASCII symbols) are kept unchanged and in their original order.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        `text` without ASCII punctuation; an empty string stays empty.
    """
    # A single str.translate pass replaces the per-character membership scan.
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every article, one string at a time.
data['text'] = data['text'].map(punctuation_removal)

In [None]:
# Preview the text after punctuation removal.
data.head(5)

In [None]:
# Remove stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Membership is tested once per word of every article, so use a set (constant
# time) rather than scanning the list each time.
# Punctuation has already been stripped from the text by this point, so list
# entries that contain an apostrophe (e.g. "don't") can only ever match in
# their apostrophe-free form ("dont"); add those forms as well.
stop_words = set(stop) | {word.replace("'", "") for word in stop}

data['text'] = data['text'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words)
)

In [None]:
# Preview the text after stopword removal.
data.head(5)

In [None]:
# Function to plot the confusion matrix
from sklearn import metrics
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are plotted on the axis labelled
        'True label' and columns on the axis labelled 'Predicted label'.
    classes : sequence
        Tick labels, in the same order as the rows/columns of `cm`.
    normalize : bool, default False
        If True, divide every row by its sum before plotting, so each cell
        shows the fraction of that row rather than a raw count.
    title : str, default 'Confusion matrix'
        Title of the plot.
    cmap : matplotlib colormap, default plt.cm.Blues
        Colormap handed to `plt.imshow`.

    Returns
    -------
    None
        The plot is drawn through the pyplot state machine; a one-line note
        saying whether the matrix was normalized is printed.
    """
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row with no samples sums to zero; divide by 1 there so the row
        # stays all-zero instead of turning into NaN.
        cm = cm.astype('float') / np.where(row_totals == 0, 1, row_totals)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    # Draw only after the optional normalization, so the cell colours, the
    # colour bar and the text annotations all describe the same numbers.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Normalized values are shown with two decimals; raw values are
        # passed through unchanged.
        label = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Hold out 20% of the articles for evaluation; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['target'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# First five training texts.
X_train.head(5)

In [None]:
# First five training labels.
y_train.head(5)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Classifier: entropy-based decision tree, capped at depth 20, seeded.
tree_clf = DecisionTreeClassifier(
    criterion='entropy',
    splitter='best',
    max_depth=20,
    random_state=42,
)

# Raw text -> token counts -> TF-IDF weights -> classifier.
# NOTE(review): a 'true' sample row shown earlier in this notebook starts with
# a "reuters" dateline; if that holds across the class, the tree may be
# keying on that token -- confirm before trusting the score.
pipe = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('model', tree_clf),
])

# Train on the training split; `model` is whatever pipe.fit() returns.
model = pipe.fit(X_train, y_train)

# Score on the held-out split and report accuracy as a percentage.
prediction = model.predict(X_test)
accuracy_pct = round(accuracy_score(y_test, prediction) * 100, 2)
print("accuracy: {}%".format(accuracy_pct))

In [None]:
# Pin the label order explicitly so the matrix rows/columns always line up
# with the hard-coded tick labels below ('fake' -> 'Fake', 'true' -> 'Real'),
# and the matrix stays 2x2 even if one class is absent from the test split.
cm = metrics.confusion_matrix(y_test, prediction, labels=['fake', 'true'])
plot_confusion_matrix(cm, classes=['Fake', 'Real'])