# DeFactify 📰

First, let's import the libraries.

In [136]:
import sklearn
import pandas
import seaborn
import nltk
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Print the installed version of each main dependency; the trailing comments
# record the versions this notebook was last executed with.
print("scikit-learn version:", sklearn.__version__)  # 1.6.1
print("pandas version:", pandas.__version__)         # 2.2.3
print("seaborn version:", seaborn.__version__)       # 0.13.2
print("nltk version:", nltk.__version__)             # 3.9.1

scikit-learn version: 1.6.1
pandas version: 2.2.3
seaborn version: 0.13.2
nltk version: 3.9.1


## Importing stopwords

NLTK's stop-word list will help us filter out common words that add little meaning to the contents of the articles.

In [137]:
# Porter stemmer instance shared by the `stemming` helper defined later on.
stemmer = PorterStemmer()

# Stop words are kept in a set so membership checks during stemming are fast.
stop_words = set(stopwords.words('english'))

# Preview the list. The word list is addressed by the lowercase file id
# 'english' (same as above); the capitalised 'English' only resolves on
# case-insensitive filesystems and fails elsewhere.
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

## Data provisioning 📦

Let's first load the BBC datasets.

In [138]:
# Load the v1 BBC articles CSV (path is relative to the working directory).
bbc_dataset_v1 = pandas.read_csv('scraper/bbc_news_articles_v1.csv')
# Bare last expression: displays (rows, columns).
bbc_dataset_v1.shape

(39, 7)

In [139]:
# Load the v2 BBC articles CSV (path is relative to the working directory).
bbc_dataset_v2 = pandas.read_csv('scraper/bbc_news_articles_v2.csv')
# Bare last expression: displays (rows, columns).
bbc_dataset_v2.shape

(38, 7)

Now let's also add the generated fake news data.

In [140]:
# Only these six columns are read from the Hugging Face CSV.
columns = ['id','title','text','subject','date','classification']
# low_memory=False parses the file in a single pass instead of in chunks,
# which avoids per-chunk mixed-type inference.
huggingface_dataset = pandas.read_csv('scraper/huggingface_dataset.csv', usecols=columns, low_memory=False)
# Bare last expression: displays (rows, columns).
huggingface_dataset.shape

(30016, 6)

## Combine both datasets

In [155]:
# Stack the three frames row-wise; ignore_index=True renumbers the rows from 0.
# Columns that a frame does not have are filled with NaN for that frame's rows.
combined_dataset = pandas.concat([bbc_dataset_v1, bbc_dataset_v2, huggingface_dataset], ignore_index=True)

combined_dataset.shape

(30093, 9)

## Sample the data 📃

In [156]:
# Show 10 random rows; no random_state is passed, so the selection differs on every run.
combined_dataset.sample(10)

Unnamed: 0,title,link,source,author,date,text,classification,id,subject
26921,Russia says Pyongyang wants direct talks with ...,,,,7-Dec-17,Russian Foreign Minister Sergei Lavrov said on...,1,12825,worldnews
29119,Timeline: Zika's origin and global spread,,,,11-Aug-16,80s: Zika detected in mosquitoes and monkeys a...,1,8482,politicsNews
18528,Rick Scott Was So Butthurt By The Woman Who Ca...,,,,9-Apr-16,Florida Governor Rick Scott isn t up for re-el...,0,7013,News
7790,“Trumpocalypse”: HACKED EMAIL EXPOSES DEMOCRAT...,,,,6-Jul-16,Let that sink in A major party in America is p...,0,20328,left-news
11738,NO RESPECT! US DIPLOMAT ATTACKED By Russian Gu...,,,,10-Jul-16,If you haven t seethe viral video of a takedow...,0,16503,Government News
10886,Rep. Maxine Waters Just Nailed Ben Carson To T...,,,,3-Jul-17,Rep. Maxine Waters just called Housing and Urb...,0,941,News
23752,Ukraine leader welcomes EU's extension of sanc...,,,,14-Dec-17,Ukrainian President Petro Poroshenko said on T...,1,12264,worldnews
18964,DELEGATES FOR DUMMIES: How They’re Awarded…And...,,,,2-Mar-16,Stop counting the votes! Your candidates nomin...,0,20929,left-news
5662,Iranian political activist shot dead in Nether...,,,,9-Nov-17,A political activist who founded an Arab natio...,1,15253,worldnews
10780,"Trump says concerns about Iran driving Israel,...",,,,22-May-17,U.S. President Donald Trump said on Monday tha...,1,3647,politicsNews


In [158]:
# Keep only the three columns the later cells work with: the two text fields and the label.
cleaned_columns = ['title','text','classification']
focused_data = combined_dataset.loc[:,cleaned_columns]

# Show 10 random rows; no random_state is passed, so the selection differs on every run.
focused_data.sample(10)

Unnamed: 0,title,text,classification
8272,Brazil Congress advances bill to curb party pr...,Brazil s lower house of Congress on Tuesday ga...,1
25358,FINALLY: Two Members Of Bundy Militia Arrested,"For the last two weeks, armed domestic terrori...",0
27923,"Emirates, Etihad boarding as usual after secon...",Passengers from six mainly Muslim countries wh...,1
20492,KATIE COURIC TWEETS Disgust After Two Of Her P...,Katie Couric took to Twitter to call out Charl...,0
19015,U.S. candidate Rubio: conditions not right for...,U.S. Republican presidential candidate Marco R...,1
5155,"Police evacuate Bonn Christmas market, probe s...",Police brought in experts and an explosives ro...,1
2590,OPEN-BORDER LIBERALS Put Entire Nation On High...,Thank you Angela Merkel German security servic...,0
21106,"Ugandan MPs get $8,000 each for work on extend...",Ugandan legislators have each pocketed 29 mill...,1
19287,AMBASSADOR JOHN BOLTON: Susan Rice Has “Real L...,Ambassador John Bolton was on Lou Dobbs tonigh...,0
11685,"China confirms will amend party constitution, ...",China s ruling Communist Party has agreed to a...,1


In [159]:
# Count the missing values in each column.
focused_data.isnull().sum()

title              0
text               3
classification    16
dtype: int64

In [161]:
# Drop every row that has a missing value in any of the three columns.
cleaned_dataset = focused_data.dropna()

cleaned_dataset.shape

(30074, 3)

In [168]:
# Filter rows where classification is exactly the string '0' or '1'; rows with
# any other value in that column are dropped.
# The trailing .copy() makes the result an independent frame, so the later
# `cleaned_dataset['content'] = ...` assignment does not write to a slice of
# the previous frame (which triggers pandas' SettingWithCopyWarning).
cleaned_dataset = cleaned_dataset[cleaned_dataset['classification'].isin(['0', '1'])].copy()

cleaned_dataset.shape

(30058, 3)

In [169]:
# Build one text field per article in the form "<title> : <text>".
cleaned_dataset['content'] = cleaned_dataset['title'] + ' : ' + cleaned_dataset['text']
# Printing the Series shows a truncated preview of the combined text.
print(cleaned_dataset['content'])

0        Istanbul mayor arrested ahead of selection to ...
1        More than 50,000 killed in Gaza since Israel o...
2        Ferrari's Hamilton disqualified from Chinese G...
3        A life spent waiting - and searching rows of u...
4        Istanbul mayor arrested ahead of selection to ...
                               ...                        
30088    U.S. aerospace industry urges Trump to help Ex...
30089    Highlights: Hong Kong leader Carrie Lam delive...
30090    Obama Literally LAUGHS At Claims That Brexit M...
30091    Syrian army takes full control of Deir al-Zor ...
30092    U.S., Israel sign $38 billion military aid pac...
Name: content, Length: 30058, dtype: object


## Stemming

In [170]:
def stemming(content):
    """Reduce a piece of text to a space-separated string of word stems.

    The text is lowercased, every character that is not an ASCII letter is
    replaced by a space, and the result is tokenized. Tokens found in the
    module-level ``stop_words`` set are discarded; the remaining tokens are
    stemmed with the module-level ``stemmer``.

    Args:
        content: The text to process (must support ``.lower()``).

    Returns:
        The surviving stems joined by single spaces.
    """
    # Lowercase first, then blank out anything that is not a letter.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', content.lower())

    kept_stems = []
    for token in word_tokenize(letters_only):
        # Stop words are matched against the unstemmed token.
        if token in stop_words:
            continue
        kept_stems.append(stemmer.stem(token))

    return ' '.join(kept_stems)

In [171]:
# Replace each row's 'content' with its stemmed form.
# NOTE: the column is overwritten in place, so re-running this cell feeds
# already-stemmed text through `stemming` again.
cleaned_dataset.loc[:, 'content'] = cleaned_dataset['content'].apply(stemming)

In [172]:
# Preview the stemmed column; the double brackets select a one-column DataFrame.
print(cleaned_dataset[['content']])

                                                 content
0      istanbul mayor arrest ahead select run erdogan...
1      kill gaza sinc israel offens began hama run mi...
2      ferrari hamilton disqualifi chines gp lewi ham...
3      life spent wait search row unclaim bodi saira ...
4      istanbul mayor arrest ahead select run erdogan...
...                                                  ...
30088  u aerospac industri urg trump help ex im bank ...
30089  highlight hong kong leader carri lam deliv mai...
30090  obama liter laugh claim brexit mean trump win ...
30091  syrian armi take full control deir al zor isla...
30092  u israel sign billion militari aid packag unit...

[30058 rows x 1 columns]


In [173]:
# Raw (unstemmed) article text and the labels as NumPy arrays, for the preview cells below.
# NOTE: X is rebound to the TF-IDF features in the vectorizer cell further down,
# and Y is not used after its preview in the visible notebook (the model cells use lowercase `y`).
X = cleaned_dataset['text'].values
Y = cleaned_dataset['classification'].values

In [174]:
# Preview the raw text array (this is the 'text' column, not the stemmed 'content').
print(X)

['The main rival to Turkey\'s President Recep Tayyip Erdogan has been formally arrested and charged with corruption.\r\n\r\nEkrem Imamoglu, the mayor of Istanbul, is expected to be selected as the opposition Republican People\'s Party\'s (CHP) 2028 presidential candidate in a ballot on Sunday.\r\n\r\nHe denies the allegations and says they are politically motivated. "I will never bow," he wrote on X before he was remanded in custody.\r\n\r\nHis detention sparked some of Turkey\'s largest protests in more than a decade. Erdogan has condemned the demonstrations and accused the CHP of trying to "disturb the peace and polarise our people".\r\n\r\nImamoglu was one of more than 100 people, including other politicians, journalists and businessmen, detained as part of an investigation on Wednesday, triggering four consecutive nights of demonstrations.\r\n\r\nOn Sunday, he was formally arrested and charged with "establishing and managing a criminal organisation, taking bribes, extortion, unlawf

In [175]:
# Preview the label array.
print(Y)

['1' '1' '1' ... '0' '1' '1']


In [176]:
# Convert the stemmed text into numerical TF-IDF features.
# fit_transform returns a SciPy sparse matrix and it is kept sparse on purpose:
# a dense copy stores a float for every (document, term) pair and can need many
# gigabytes of memory. train_test_split and KNeighborsClassifier both accept
# sparse input.
# NOTE(review): the vectorizer is fitted on all rows before the train/test
# split, so the IDF weights also see the test articles; consider fitting on
# the training rows only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cleaned_dataset['content'])
y = cleaned_dataset['classification']

In [177]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [183]:
# k-nearest-neighbours classifier using the 5 closest training samples
# (no metric is passed, so the library default applies).
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

In [184]:
# Predict a label for every held-out row.
y_pred = knn.predict(X_test)

## Evaluation

In [185]:
# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)

# Report it as a percentage with two decimals.
print(f"Accuracy of KNN model: {accuracy * 100:.2f}%")

Accuracy of KNN model: 85.99%
