In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# NLTK resources used by the preprocessing step: the punkt tokenizer models,
# the English stop-word list, and the WordNet corpus (needed by
# WordNetLemmatizer when text_preprocess is called with technique="lemm").
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Download the dataset archive with the Kaggle command-line client.
# NOTE(review): presumably requires Kaggle API credentials to be configured
# in this environment — confirm before running on a fresh machine.
!kaggle datasets download -d jruvika/fake-news-detection

Dataset URL: https://www.kaggle.com/datasets/jruvika/fake-news-detection
License(s): ODbL-1.0
Downloading fake-news-detection.zip to /content
  0% 0.00/4.89M [00:00<?, ?B/s]
100% 4.89M/4.89M [00:00<00:00, 84.8MB/s]


In [None]:
!unzip fake-news-detection.zip

Archive:  fake-news-detection.zip
  inflating: data.csv                
  inflating: data.h5                 


## Loading the dataset & initial exploration

In [None]:
# Read the extracted CSV into a DataFrame and show its (rows, columns) shape.
df = pd.read_csv('data.csv')
df.shape

(4009, 4)

In [None]:
# Preview the last rows to see the columns and the kind of text in each.
df.tail()

Unnamed: 0,URLs,Headline,Body,Label
4003,http://beforeitsnews.com/u-s-politics/2017/10/...,CNN and Globalist Exposed - Steve Quayle and A...,"Vietnam Is in Great Danger, You Must Publish a...",0
4004,http://beforeitsnews.com/sports/2017/09/trends...,Trends to Watch,Trends to Watch\n% of readers think this story...,0
4005,http://beforeitsnews.com/u-s-politics/2017/10/...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,Trump Jr. Is Soon To Give A 30-Minute Speech F...,0
4007,https://www.reuters.com/article/us-china-pharm...,China to accept overseas trial data in bid to ...,SHANGHAI (Reuters) - China said it plans to ac...,1
4008,http://beforeitsnews.com/u-s-politics/2017/10/...,Vice President Mike Pence Leaves NFL Game Beca...,Vice President Mike Pence Leaves NFL Game Beca...,0


In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0,0
URLs,0
Headline,0
Body,21
Label,0


In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0)

In [None]:
# Features are the article body text; the target is the Label column.
# NOTE(review): Label looks like 1 = reliable source, 0 = fake, judging by the
# sample rows shown above — confirm against the dataset description.
X = df["Body"]
y = df["Label"]

## Text Preprocessing

In [None]:
CLEANR = re.compile('<.*?>')

# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

# English stop words, loaded on first use so that defining the function does
# not itself require the NLTK corpus to be present yet.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))
    return _STOP_WORDS


# main function
def text_preprocess(text, technique="stem"):
    """Normalize a raw document into a space-separated string of reduced tokens.

    The text is lower-cased, stripped of HTML-like tags and punctuation,
    tokenized with NLTK, filtered against the English stop-word list, and
    finally stemmed or lemmatized token by token.

    Parameters
    ----------
    text : str
        Raw document text.
    technique : {"stem", "lemm"}, default "stem"
        "stem" applies ``PorterStemmer``; "lemm" applies ``WordNetLemmatizer``.

    Returns
    -------
    str
        The processed tokens joined by single spaces.

    Raises
    ------
    ValueError
        If ``technique`` is neither "stem" nor "lemm". Previously an unknown
        value silently produced an empty string for every document.
    """
    # Build the normalizer once per call instead of once per token.
    if technique == "stem":
        normalize = PorterStemmer().stem
    elif technique == "lemm":
        normalize = WordNetLemmatizer().lemmatize
    else:
        raise ValueError(
            f"technique must be 'stem' or 'lemm', got {technique!r}"
        )

    text = text.lower()  # convert text to lower case
    text = CLEANR.sub('', text)  # remove html tags

    # removing punctuations & special symbols
    no_punct_text = text.translate(_PUNCT_TABLE)

    # tokenization
    word_tokens = nltk.word_tokenize(no_punct_text)

    # removing stop words (set lookup; the corpus is read only once)
    stop_words = _english_stop_words()

    return " ".join(
        normalize(word) for word in word_tokens if word not in stop_words
    )

In [None]:
# Run the preprocessing function (default technique) over every document.
X = X.apply(text_preprocess)

In [None]:
# Turn each preprocessed document into TF-IDF features over 1- to 3-grams,
# keeping at most 5000 features, then convert the sparse result to a dense array.
# NOTE(review): the vectorizer is fitted on ALL documents here, before the
# train/test split further down, so vocabulary and IDF statistics from the
# test rows leak into the features. Consider splitting first and fitting the
# vectorizer on the training split only.
tfidf_v = TfidfVectorizer(max_features=5000, ngram_range=(1,3))
X = tfidf_v.fit_transform(X).toarray()

In [None]:
# (number of documents, number of TF-IDF features)
X.shape

(3988, 5000)

In [None]:
# Convert the label column to a NumPy array to match the dense feature matrix.
y = np.array(y)

In [None]:
# Sanity check: print the container types of the features and the labels.
print(type(X))
print(type(y))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


## Splitting the Data

In [None]:
# Hold out 20% of the rows for testing; stratify on the labels so both splits
# keep the same class balance, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Building the Model

In [None]:
# The classifier shuffles the training data between epochs by default, so a
# fixed random_state is needed for the reported scores to be reproducible.
# The seed matches the one used for the train/test split.
model = PassiveAggressiveClassifier(n_iter_no_change=50, random_state=42)

In [None]:
# Train the classifier on the training split.
model.fit(X_train, y_train)

## Evaluating the Model

In [None]:
# on training data
X_train_pred = model.predict(X_train)
training_data_acc = accuracy_score(X_train_pred, y_train)
training_data_confusion_metrix = confusion_matrix(X_train_pred, y_train)

In [None]:
# Report the accuracy and confusion matrix computed on the training split.
print("Training Data Acc: ", training_data_acc)
print("Confusion Metrix: \n", training_data_confusion_metrix)

Training Data Acc:  1.0
Confusion Metrix: 
 [[1696    0]
 [   0 1494]]


In [None]:
# on test data
X_test_pred = model.predict(X_test)
test_data_acc = accuracy_score(X_test_pred, y_test)
test_data_confusion_metrix = confusion_matrix(X_test_pred, y_test)

In [None]:
# Report the accuracy and confusion matrix computed on the held-out test split.
print("Test Data Acc: ", test_data_acc)
print("Confusion Metrix: \n", test_data_confusion_metrix)

Test Data Acc:  0.9862155388471178
Confusion Metrix: 
 [[421   8]
 [  3 366]]


- The PassiveAggressiveClassifier achieves high accuracy on both the training data (1.00) and the test data (about 0.986).
