# DeFactify 📰

First, lets import the libraries.

In [1]:
import sklearn
import pandas
import seaborn
import nltk

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

print("scikit-learn version:", sklearn.__version__)  # 1.6.1
print("pandas version:", pandas.__version__)         # 2.2.3
print("seaborn version:", seaborn.__version__)       # 0.13.2
print("nltk version:", nltk.__version__)             # 3.9.1

scikit-learn version: 1.6.1
pandas version: 2.2.3
seaborn version: 0.13.2
nltk version: 3.9.1


## Importing stopwords

This package will help us filter out the unnecessary words in the contents of the articles.

In [2]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivetk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('English'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

## Data provisioning 📦

Let's first insert the BBC dataset.

In [4]:
bbc_dataset = pandas.read_csv('scraper/bbc_news_articles_v1.csv')
bbc_dataset.shape

(39, 7)

# Sample the data 📃

In [5]:
bbc_dataset.sample(10)

Unnamed: 0,title,link,source,author,date,text,classification
36,Archer records worst bowling figures in IPL hi...,https://www.bbc.com/sport/cricket/articles/cgj...,BBC,,2025-03-23T12:35:01.697Z,Jofra Archer has returned to his former side R...,1
12,'Wonderful teenagers helped my son on Hallowee...,https://www.bbc.com/news/articles/ckg14rxnw8jo,BBC,George Sandeman,2025-03-23T02:05:57.482Z,Readers have told the BBC about strangers' ran...,1
25,Watch: Moment police chase ends in four car pi...,https://www.bbc.com/news/videos/c89ynj20p9po,BBC,,2025-03-22T15:55:21.268Z,,1
1,"More than 50,000 killed in Gaza war, Hamas-run...",https://www.bbc.com/news/articles/clyz4nnqgvdo,BBC,Tom Bennett,2025-03-23T13:18:23.851Z,"More than 50,000 Palestinians have been killed...",1
22,'Longest running pop band in history' to end t...,https://www.bbc.com/news/articles/cvg1w93x9ggo,BBC,,2025-03-22T18:04:46.177Z,The Searchers will end nearly 70 years of tour...,1
17,Five key moments in the battle for Khartoum,https://www.bbc.com/news/articles/ckgy341v680o,BBC,Peter Mwai,2025-03-22T13:01:35.753Z,The Sudanese army has regained control of key ...,1
18,'My husband is a fighter pilot in Ukraine. Her...,https://www.bbc.com/news/articles/c70wgq7y11qo,BBC,Zhanna Bezpiatchuk,2025-03-23T01:54:53.193Z,Maria's life has been reduced to waiting for t...,1
34,Tuchel 'not afraid' to make changes as injured...,https://www.bbc.com/sport/football/articles/cw...,BBC,,2025-03-23T10:33:56.935Z,Anthony Gordon replaced Marcus Rashford to ear...,1
20,Are Nigerians abroad widening the class divide...,https://www.bbc.com/news/articles/cvg1p5ek72vo,BBC,Danai Nesta Kupemba,2025-03-23T02:21:11.613Z,Scenes playing out in Nigeria during holiday p...,1
9,South Africa envoy expelled from US 'has no re...,https://www.bbc.com/news/articles/cg4k9v0vpv6o,BBC,Wedaeli Chibelushi,2025-03-23T12:41:45.789Z,The South African ambassador who was expelled ...,1


In [6]:
bbc_dataset.isnull().sum()

title              0
link               0
source             0
author            16
date               2
text               1
classification     0
dtype: int64

In [7]:
cleaned_dataset = bbc_dataset.dropna()

In [8]:
cleaned_dataset['content'] = cleaned_dataset['author'] + ' : ' + cleaned_dataset['title']
print(cleaned_dataset['content'])

0     Emily Wither : Istanbul mayor arrested ahead o...
1     Tom Bennett : More than 50,000 killed in Gaza ...
3     Farhat Javed : A life spent waiting - and sear...
4     Emily Wither : Istanbul mayor arrested ahead o...
5     Bethany Bell : Pope Francis is discharged from...
6     James Landale : Trump envoy dismisses Starmer ...
8     Tom Bennett : More than 50,000 killed in Gaza ...
9     Wedaeli Chibelushi : South Africa envoy expell...
10    Guy Lambert : One of the last journalists to i...
11    Lara Lewington, Liv McMahon & Tom Gerken : The...
12    George Sandeman : 'Wonderful teenagers helped ...
13    Ana Faguy : Columbia University agrees to Trum...
14    Zhanna Bezpiatchuk : 'My husband is a fighter ...
15    Sarah Rainsford : Fear and anger mount as 'bat...
17    Peter Mwai : Five key moments in the battle fo...
18    Zhanna Bezpiatchuk : 'My husband is a fighter ...
20    Danai Nesta Kupemba : Are Nigerians abroad wid...
21    Sarah Rainsford : Fear and anger mount as 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_dataset['content'] = cleaned_dataset['author'] + ' : ' + cleaned_dataset['title']


## Stemming

In [9]:
stemmer = PorterStemmer()

def apply_stemming(text):
    words = word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

cleaned_dataset['content_stemmed'] = cleaned_dataset['content'].apply(apply_stemming)

print(cleaned_dataset[['content_stemmed']])

                                      content_stemmed
0   emili wither : istanbul mayor arrest ahead of ...
1   tom bennett : more than 50,000 kill in gaza wa...
3   farhat jave : a life spent wait - and search r...
4   emili wither : istanbul mayor arrest ahead of ...
5   bethani bell : pope franci is discharg from ro...
6   jame landal : trump envoy dismiss starmer plan...
8   tom bennett : more than 50,000 kill in gaza wa...
9   weda chibelushi : south africa envoy expel fro...
10  guy lambert : one of the last journalist to in...
11  lara lewington , liv mcmahon & tom gerken : th...
12  georg sandeman : 'wonder teenag help my son on...
13  ana faguy : columbia univers agre to trump adm...
14  zhanna bezpiatchuk : 'mi husband is a fighter ...
15  sarah rainsford : fear and anger mount as 'bat...
17  peter mwai : five key moment in the battl for ...
18  zhanna bezpiatchuk : 'mi husband is a fighter ...
20  danai nesta kupemba : are nigerian abroad wide...
21  sarah rainsford : fear a

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_dataset['content_stemmed'] = cleaned_dataset['content'].apply(apply_stemming)


In [10]:
X = cleaned_dataset['text'].values
Y = cleaned_dataset['classification'].values

print(X)
print(Y)

['The main rival to Turkey\'s President Recep Tayyip Erdogan has been formally arrested and charged with corruption.\n\nEkrem Imamoglu, the mayor of the Turkish city of Istanbul, is expected to be selected as the opposition Republican People\'s Party\'s (CHP) 2028 presidential candidate in a ballot on Sunday.\n\nHe denies the allegations and says they are politically motivated. "I will never bow", he was quoted as saying after being remanded in custody pending trial.\n\nHis detention has sparked off some of the largest protests in more than a decade. Erdogan has condemned the demonstrations and accused the CHP of trying to "disturb the peace and polarise our people".\n\nImamoglu was one of more than 100 people, including other politicians, journalists and businessmen, detained as part of an investigation on Wednesday.\n\nOn Sunday, he was formally arrested and charged with "establishing and managing a criminal organisation, taking bribes, extortion, unlawfully recording personal data a

In [11]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X = vectorizer.transform(X)

print(X)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8581 stored elements and shape (23, 3435)>
  Coords	Values
  (0, 3)	0.0318195951612149
  (0, 4)	0.0318195951612149
  (0, 48)	0.0636391903224298
  (0, 50)	0.026541279490594247
  (0, 68)	0.0318195951612149
  (0, 70)	0.024657367083807197
  (0, 101)	0.028847001930702883
  (0, 102)	0.019379051413186542
  (0, 103)	0.011712678915205211
  (0, 113)	0.021684773853295185
  (0, 121)	0.026541279490594247
  (0, 125)	0.04336954770659037
  (0, 131)	0.017495139006399495
  (0, 164)	0.0318195951612149
  (0, 173)	0.039916512478511405
  (0, 175)	0.020467732236911507
  (0, 176)	0.07751620565274617
  (0, 181)	0.0318195951612149
  (0, 191)	0.0318195951612149
  (0, 193)	0.019379051413186542
  (0, 205)	0.030378833132581714
  (0, 206)	0.024657367083807197
  (0, 207)	0.026541279490594247
  (0, 209)	0.028847001930702883
  (0, 237)	0.046850715660820844
  :	:
  (22, 3335)	0.04744894071515649
  (22, 3338)	0.01581631357171883
  (22, 3342)	0.0232083902834175

In [12]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, Y_train)

Y_pred = knn.predict(X_test)

Accuracy of KNN model: 1.0000


## Evaluation

In [14]:
accuracy = accuracy_score(Y_test, Y_pred)
print(f'Accuracy of KNN model: {accuracy:.4f}')

Accuracy of KNN model: 1.0000
