# Q2: News Categorization with Naive Bayes

Building three Multinomial Naive Bayes classifiers to categorize news articles.

Assignment parts:
- Parse RDF/XML ontology
- Train models using headlines, descriptions, and combined features
- Evaluate and compare performance
- Save best model

In [2]:
import rdflib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

## Load and Parse RDF Data

In [5]:
graph = rdflib.Graph()
graph.parse('News_Categorizer_RDF.xml')

<Graph identifier=N15a2b9779f4a4f7e986ab897203f513e (<class 'rdflib.graph.Graph'>)>

In [24]:
# Extract article data correctly by grouping by subject
articles = {}

# Extract article data from RDF triples
for subject, predicate, obj in graph:
    pred_str = str(predicate)
    subj_str = str(subject)

    # Group all properties by article subject 
    if subj_str not in articles:
        articles[subj_str] = {}

    # Match exact predicate URIs
    if pred_str.endswith('#headline'):
        articles[subj_str]['headline'] = str(obj)
    elif pred_str.endswith('#short_description'):
        articles[subj_str]['description'] = str(obj)
    elif pred_str.endswith('#category'):
        articles[subj_str]['category'] = str(obj)
    elif pred_str.endswith('#place'):
        articles[subj_str]['location'] = str(obj)

# Convert dictionary to lists
headlines = []
descriptions = []
categories = []
locations = []

for article_id, article_data in articles.items():
    # Only include complete articles with all four fields
    if all(key in article_data for key in ['headline', 'description', 'category', 'location']):
        headlines.append(article_data['headline'])
        descriptions.append(article_data['description'])
        categories.append(article_data['category'])
        locations.append(article_data['location'])

print(f"Extracted {len(headlines)} articles")
print(f"Sample headline: {headlines[0] if headlines else 'None'}")
print(f"Sample category: {categories[0] if categories else 'None'}")

KeyError: 'descriptions'

In [20]:
df = pd.DataFrame({
    'headline': headlines,
    'description': descriptions,
    'category': categories,
    'location': locations
})

print(f"Total articles: {len(df)}")
print(f"\nCategory distribution:")
print(df['category'].value_counts())
print(f"\nFirst 5 rows:")
df.head()

Total articles: 0

Category distribution:
Series([], Name: count, dtype: int64)

First 5 rows:


Unnamed: 0,headline,description,category,location


## Model 1: Headlines Only

In [11]:
print("=" * 60)
print("Model 1: Headlines Only")
print("=" * 60)

# Convert headlines to numerical features
vectorizer1 = CountVectorizer()
X1 = vectorizer1.fit_transform(df['headline'])
y = df['category']

# Split into training (70%) and testing (30%) sets
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y, test_size=0.3, random_state=42)

Model 1: Headlines Only


In [21]:
# Train Multinomial Naive Bayes classifier
model1 = MultinomialNB()
model1.fit(X1_train, y1_train)

# Make predictions on test set
y1_pred = model1.predict(X1_test)
accuracy1 = accuracy_score(y1_test, y1_pred)

# Display results
print(f"\nAccuracy: {accuracy1:.4f} ({accuracy1*100:.2f}%)")
print(f"\nClassification Report:")
print(classification_report(y1_test, y1_pred))


Accuracy: 0.1000 (10.00%)

Classification Report:
                precision    recall  f1-score   support

      BUSINESS       0.09      0.09      0.09       296
 ENTERTAINMENT       0.10      0.13      0.11       286
  FOOD & DRINK       0.10      0.08      0.09       313
     PARENTING       0.10      0.10      0.10       305
      POLITICS       0.10      0.14      0.12       280
        SPORTS       0.10      0.09      0.10       295
STYLE & BEAUTY       0.11      0.09      0.10       307
        TRAVEL       0.14      0.10      0.11       323
      WELLNESS       0.09      0.10      0.10       294
    WORLD NEWS       0.08      0.08      0.08       301

      accuracy                           0.10      3000
     macro avg       0.10      0.10      0.10      3000
  weighted avg       0.10      0.10      0.10      3000



## Model 2: Descriptions Only

In [15]:
print("=" * 60)
print("Model 2: Descriptions Only")
print("=" * 60)

# Convert descriptions to numerical features
vectorizer2 = CountVectorizer()
X2 = vectorizer2.fit_transform(df['description'])

# Split into training and testing sets
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.3, random_state=42)

Model 2: Descriptions Only


In [22]:
# Train multinomial Naive Bayes classifier
model2 = MultinomialNB()
model2.fit(X2_train, y2_train)

# Make predictions on test set
y2_pred = model2.predict(X2_test)
accuracy2 = accuracy_score(y2_test, y2_pred)

# Check description content
print("Sample descriptions")
for i in range(5):
    print(f"\n{i+1}. Category: {df['category'].iloc[i]}")
    print(f" Description: {df['description'].iloc[i][:200]}...")

# Display results
print(f"\nAccuracy: {accuracy2:.4f} ({accuracy2*100:.2f}%)")
print(f"\nClassification Report:")
print(classification_report(y2_test, y2_pred))

Sample descriptions


IndexError: single positional indexer is out-of-bounds

## Model 3: Combined Features

In [None]:
df['combined'] = df['headline'] + " " + df['description']

vectorizer3 = CountVectorizer()
X3 = vectorizer3.fit_transform(df['combined'])

X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y, test_size=0.3, random_state=42)

In [None]:
model3 = MultinomialNB()
model3.fit(X3_train, y3_train)

y3_pred = model3.predict(X3_test)
accuracy3 = accuracy_score(y3_test, y3_pred)

## Model Comparison

In [None]:
# Compare accuracy scores

In [None]:
# Plot confusion matrices

## Save Best Model

In [None]:
# Determine which model performed best and save it