## Training the model

In [38]:
import numpy as np
import pandas as pd
import sklearn

In [22]:
fake = pd.read_csv(r"Dataset/Fake.csv")
true = pd.read_csv(r"Dataset/True.csv")

In [23]:
fake['label'] = 0
true['label'] = 1

In [24]:
data = pd.concat([fake, true], axis = 0)

In [25]:
data = data.sample(frac=1)

In [26]:
data.reset_index(inplace = True, drop =True)

In [31]:
data.drop(['title', 'subject', 'date'], axis=1, inplace = True)

In [32]:
data.head()

Unnamed: 0,text,label
0,A city in Pennsylvania is removing a park benc...,0
1,The idea of a Donald Trump presidency has been...,0
2,21st Century Wire says While America s politic...,0
3,Organized protesters took to the streets outsi...,0
4,UNITED NATIONS (Reuters) - Amid a U.S. push to...,1


In [35]:
import re

def word(text):
    
    # Convert into lowercase
    text = text.lower()
    # Remove urls
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    # Remove HTML tags
    text = re.sub(r"<.*?>", "", text)
    # Remove punctations
    text = re.sub('[^\w\s]', "", text)
    # Remvoe digits
    text = re.sub(r"\d", "", text)
    # Remove newline characters
    text = re.sub(r"\n", " ", text)

    return text

  text = re.sub('[^\w\s]', "", text)


In [36]:
data['text'] = data['text'].apply(word)

In [67]:
x = data['text']
y = data['label']

<class 'pandas.core.series.Series'>


In [40]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3)

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
xt_train = vectorizer.fit_transform(x_train)
xt_test = vectorizer.transform(x_test)

In [55]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(xt_train, y_train)

In [56]:
y_preds = dtc.predict(xt_test)

In [57]:
dtc.score(xt_test, y_test)

0.9953971789161099

In [70]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7064
           1       1.00      0.99      1.00      6406

    accuracy                           1.00     13470
   macro avg       1.00      1.00      1.00     13470
weighted avg       1.00      1.00      1.00     13470



## Testing the model

In [73]:
# Load the vectorizer
with open('vectorizer.pkl', 'rb') as file:
    vectorizer = pickle.load(file)

# Load the model
with open('model.pkl', 'rb') as file:
    model = pickle.load(file)

def manual_testing(news):
    df_test = {"text": pd.Series([news])}
    df_test['text'] = df_test['text'].apply(word)
    df_test['text'] = vectorizer.transform(df_test['text'])
    y_preds = dtc.predict(df_test['text'])
    if y_preds == 0:
        print("It is a fake news")
    else:
        print("It is a genuine news")

news_article = "Top Canada officials admit they leaked info on Amit Shah, Indian ‘interference’ to US daily"
manual_testing(news_article)

It is a fake news


## Saving vectorizer and model

In [72]:
import pickle

# Save the vectorizer
with open('vectorizer.pkl', 'wb') as file:
    pickle.dump(vectorizer, file)

# Save the model
with open('model.pkl', 'wb') as file:
    pickle.dump(dtc, file)