# Classifying News Articles with Naive Bayes

### Loading Libraries

In [2]:
# Numerical Computing
import numpy as np

# Data Manipulation
import pandas as pd

# Path
from pathlib import Path


# Scikit-Learn
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

### News article classification

#### Read BBC articles

In [3]:
DATA_DIR = Path('..', 'data')

In [4]:
path = DATA_DIR / 'bbc'

files = sorted(list(path.glob('**/*.txt')))

doc_list = []

for i, file in enumerate(files):
    topic = file.parts[-2]
    article = file.read_text(encoding='latin1').split('\n')
    heading = article[0].strip()
    body = ' '.join([l.strip() for l in article[1:]])
    doc_list.append([topic, heading, body])

In [5]:
docs = pd.DataFrame(doc_list, columns=['topic', 'heading', 'body'])

docs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    0 non-null      object
 1   heading  0 non-null      object
 2   body     0 non-null      object
dtypes: object(3)
memory usage: 132.0+ bytes


#### Creating Stratified Train-Test Split

In [7]:
y = pd.factorize(docs.topic)[0]

X = docs.body

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

#### Vectorize Text Data

In [8]:
vectorizer = CountVectorizer()

X_train_dtm = vectorizer.fit_transform(X_train)
X_test_dtm = vectorizer.transform(X_test)

In [9]:
X_train_dtm.shape, X_test_dtm.shape

### Training Multi-class Naive Bayes Model

In [10]:
nb = MultinomialNB()

nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm)

### Evaluating Results

#### Accuracy

In [14]:
accuracy_score(y_test, y_pred_class)

### Confusion Matrix

In [15]:
pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=y_pred_class))