# IMPORTS

In [None]:
!pip install requests-html

Collecting requests-html
  Downloading requests_html-0.10.0-py3-none-any.whl.metadata (15 kB)
Collecting pyquery (from requests-html)
  Downloading pyquery-2.0.0-py3-none-any.whl.metadata (9.0 kB)
Collecting fake-useragent (from requests-html)
  Downloading fake_useragent-1.5.1-py3-none-any.whl.metadata (15 kB)
Collecting parse (from requests-html)
  Downloading parse-1.20.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting bs4 (from requests-html)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting w3lib (from requests-html)
  Downloading w3lib-2.2.1-py3-none-any.whl.metadata (2.1 kB)
Collecting pyppeteer>=0.0.14 (from requests-html)
  Downloading pyppeteer-2.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting appdirs<2.0.0,>=1.4.3 (from pyppeteer>=0.0.14->requests-html)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting pyee<12.0.0,>=11.0.0 (from pyppeteer>=0.0.14->requests-html)
  Downloading pyee-11.1.0-py3-none-any.whl.metadata (2.8

In [None]:
!pip install joblib



In [None]:
from requests_html import HTMLSession
from datetime import datetime
from dateutil.relativedelta import relativedelta
import csv
import os
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np
import re
import joblib

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping

import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
# Fetch the NLTK stop-word corpus; stopwords.words('english') is used in the pre-processing section.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# CRAWL DATA FROM GOOGLE NEWS

In [None]:
def _first_text(element, selector):
    """Return the text of the first `selector` match under `element`, or '' if there is none."""
    match = element.find(selector, first=True)
    return match.text if match else ''


def get_news_articles(queries, start_date, end_date):
    """Crawl Google News RSS search results one week at a time and save them to CSV.

    For every one-week window starting at ``start_date`` (while the window start is
    before ``end_date``) and every query, the RSS search feed is fetched and each
    ``<item>`` is turned into a record.

    Args:
        queries: iterable of search terms; each term, upper-cased, becomes the
            record's ``category``.
        start_date: datetime marking the start of the first weekly window.
        end_date: datetime; windows are generated while their start is before it.

    Returns:
        list[dict]: one dict per article with the keys ``headline``, ``date``,
        ``short_description`` and ``category``. The same records are written to
        ``news_articles.csv`` in the current working directory.
    """
    base_url = "https://news.google.com/rss/search?q="
    date_format = "%Y-%m-%d"

    articles = []
    current_date = start_date

    while current_date < end_date:
        next_date = current_date + relativedelta(weeks=1)
        window = (
            f"after:{current_date.strftime(date_format)}"
            f"+before:{next_date.strftime(date_format)}"
        )

        # One session per weekly window: connections are reused across that
        # window's queries and are always closed, even if a request raises.
        with HTMLSession() as session:
            for query in queries:
                response = session.get(f"{base_url}{query}+{window}")

                for item in response.html.find('item'):
                    # The RSS description is an HTML fragment; the text of its
                    # first <a> element is used as the short description.
                    description_html = _first_text(item, 'description')
                    link = BeautifulSoup(description_html, 'html.parser').find('a')

                    articles.append({
                        'headline': _first_text(item, 'title'),
                        'date': _first_text(item, 'pubDate'),
                        'short_description': link.text if link else '',
                        'category': query.upper()
                    })

        current_date = next_date

    print(f'Total number of articles: {len(articles)}')

    # Save articles to CSV
    with open('news_articles.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['headline', 'date', 'short_description', 'category']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader()
        writer.writerows(articles)

    # Return the records so callers that assign the result get data, not None.
    return articles

In [None]:
# Crawl configuration: one search term per target category, and the date range to cover.
queries = [
    "war",
    "science",
    "entertainment",
]
start_date = datetime(year=2024, month=1, day=1)
end_date = datetime.now()  # naive local "now", so the range depends on when the cell is run

In [None]:
# Run the crawl for the configured queries and date range. The function prints the
# article count and writes news_articles.csv to the working directory.
articles = get_news_articles(queries, start_date, end_date)

  k = self.parse_starttag(i)


Total number of articles: 9000


# LOAD THE DATASET

In [None]:
# Read the crawler's output from the working directory -- the same relative path
# get_news_articles writes to -- rather than a Colab-only absolute path.
# The publication date is not used as a feature, so it is dropped on load.
dataset = pd.read_csv('news_articles.csv').drop(columns=['date'])
dataset.head()

Unnamed: 0,headline,short_description,category
0,National Park Service Announces New American W...,National Park Service Announces New American W...,WAR
1,Is the Middle East on the Verge of a Wider War...,Is the Middle East on the Verge of a Wider War?,WAR
2,Remaking Mistakes in Gaza - War On The Rocks,Remaking Mistakes in Gaza,WAR
3,Opinion: I am half Israeli and half Palestinia...,Opinion: I am half Israeli and half Palestinia...,WAR
4,10 Conflicts to Watch in 2024 - Crisis Group,10 Conflicts to Watch in 2024,WAR


In [None]:
# Number of distinct labels; reused later as the width of the one-hot targets.
cat_nums = dataset['category'].nunique()

In [None]:
# Report how many distinct categories exist and how many rows each one has.
print(f"We have a total of {cat_nums} categories")
dataset['category'].value_counts()

We have a total of 3 categories


category
WAR              3000
SCIENCE          3000
ENTERTAINMENT    3000
Name: count, dtype: int64

# PRE-PROCESSING

In [None]:
# Clean a copy so the loaded `dataset` frame stays unchanged.
df = dataset.copy()

In [None]:
# Count rows that are identical, across all columns, to an earlier row.
df.duplicated().sum()

887

In [None]:
# Remove fully duplicated rows, keeping the last occurrence of each.
df = df.drop_duplicates(keep='last')

In [None]:
# Count rows whose (short_description, headline) pair repeats an earlier row,
# even when the remaining columns differ.
df.duplicated(subset=['short_description','headline']).sum()

4

In [None]:
# Remove rows with a repeated (short_description, headline) pair, keeping the last one.
df = df.drop_duplicates(subset=['short_description', 'headline'], keep='last')

In [None]:
# Count unusable headlines: empty strings plus missing values. pandas.read_csv parses
# empty CSV fields as NaN by default, so testing == "" alone would always report 0.
print(int(((df['headline'] == "") | df['headline'].isna()).sum()))

0


In [None]:
# Mark empty headlines as missing, drop every row without a headline, then
# confirm that no empty-string headlines remain.
empty_headline = df['headline'] == ""
df.loc[empty_headline, 'headline'] = np.nan
df.dropna(subset=['headline'], inplace=True)
print(int((df['headline'] == "").sum()))

0


In [None]:
# Preview the frame after de-duplication and the headline clean-up.
df.head()

Unnamed: 0,headline,short_description,category
0,National Park Service Announces New American W...,National Park Service Announces New American W...,WAR
3,Opinion: I am half Israeli and half Palestinia...,Opinion: I am half Israeli and half Palestinia...,WAR
4,10 Conflicts to Watch in 2024 - Crisis Group,10 Conflicts to Watch in 2024,WAR
5,JBSA-Fort Sam Houston street renamed after Civ...,JBSA-Fort Sam Houston street renamed after Civ...,WAR
6,"90 days in, California politicians keep trying...","90 days in, California politicians keep trying...",WAR


In [None]:
# Count unusable descriptions: empty strings plus missing values. pandas.read_csv parses
# empty CSV fields as NaN by default, so testing == "" alone would always report 0.
print(int(((df['short_description'] == "") | df['short_description'].isna()).sum()))

0


In [None]:
# Mark empty descriptions as missing, drop every row without a description, then
# confirm that no empty-string descriptions remain.
empty_description = df['short_description'] == ""
df.loc[empty_description, 'short_description'] = np.nan
df.dropna(subset=['short_description'], inplace=True)
print(int((df['short_description'] == "").sum()))

0


In [None]:
# Preview the frame after the short_description clean-up.
df.head()

Unnamed: 0,headline,short_description,category
0,National Park Service Announces New American W...,National Park Service Announces New American W...,WAR
3,Opinion: I am half Israeli and half Palestinia...,Opinion: I am half Israeli and half Palestinia...,WAR
4,10 Conflicts to Watch in 2024 - Crisis Group,10 Conflicts to Watch in 2024,WAR
5,JBSA-Fort Sam Houston street renamed after Civ...,JBSA-Fort Sam Houston street renamed after Civ...,WAR
6,"90 days in, California politicians keep trying...","90 days in, California politicians keep trying...",WAR


In [None]:
# Shuffle the rows (the CSV is grouped by query) and renumber the index.
# A fixed seed keeps the row order -- and therefore the later split and every
# reported metric -- reproducible across kernel restarts.
df = shuffle(df, random_state=42).reset_index(drop=True)

In [None]:
# Preview the shuffled, re-indexed frame.
df.head()

Unnamed: 0,headline,short_description,category
0,The Division of Science - The City College of ...,The Division of Science,SCIENCE
1,Monumental Sports & Entertainment and Monument...,Monumental Sports & Entertainment and Monument...,ENTERTAINMENT
2,AMC Entertainment (AMC) Enters $250M ATM Agree...,AMC Entertainment (AMC) Enters $250M ATM Agree...,ENTERTAINMENT
3,Indian Ocean temperature anomalies predict lon...,Indian Ocean temperature anomalies predict lon...,SCIENCE
4,Amex Platinum is Losing Another Way to Use Mon...,Amex Platinum is Losing Another Way to Use Mon...,ENTERTAINMENT


In [None]:
# Merge headline and short description into one text field. The separator is
# padded with spaces so the last word of the headline and the first word of the
# description stay separate tokens; a bare "-" produced fused tokens such as
# "news-the", which the whitespace-based stop-word filter could not match.
df['desc'] = df['headline'].astype(str) + " - " + df['short_description'].astype(str)

# Drop the two source columns and make every remaining column a string.
# The original astype(str) call discarded its result and had no effect.
df = df.drop(columns=['headline', 'short_description']).astype(str)
df.head()

Unnamed: 0,category,desc
0,SCIENCE,The Division of Science - The City College of ...
1,ENTERTAINMENT,Monumental Sports & Entertainment and Monument...
2,ENTERTAINMENT,AMC Entertainment (AMC) Enters $250M ATM Agree...
3,SCIENCE,Indian Ocean temperature anomalies predict lon...
4,ENTERTAINMENT,Amex Platinum is Losing Another Way to Use Mon...


In [None]:
# Normalise case so that "War" and "war" become the same token.
df['desc'] = df['desc'].map(str.lower)
df.head()

Unnamed: 0,category,desc
0,SCIENCE,the division of science - the city college of ...
1,ENTERTAINMENT,monumental sports & entertainment and monument...
2,ENTERTAINMENT,amc entertainment (amc) enters $250m atm agree...
3,SCIENCE,indian ocean temperature anomalies predict lon...
4,ENTERTAINMENT,amex platinum is losing another way to use mon...


In [None]:
stop = set(stopwords.words('english'))


def _drop_stopwords(text):
    """Return `text` with every whitespace-separated token found in `stop` removed."""
    return ' '.join(token for token in text.split() if token not in stop)


df['desc'] = df['desc'].apply(_drop_stopwords)
df.head()

Unnamed: 0,category,desc
0,SCIENCE,division science - city college new york news-...
1,ENTERTAINMENT,monumental sports & entertainment monumental s...
2,ENTERTAINMENT,amc entertainment (amc) enters $250m atm agree...
3,SCIENCE,indian ocean temperature anomalies predict lon...
4,ENTERTAINMENT,amex platinum losing another way use monthly e...


In [None]:
# Replace every non-word character (punctuation, symbols) with a single space.
non_word = re.compile(r'\W')
df['desc'] = df['desc'].apply(lambda text: non_word.sub(' ', str(text)))
df.head()

Unnamed: 0,category,desc
0,SCIENCE,division science city college new york news ...
1,ENTERTAINMENT,monumental sports entertainment monumental s...
2,ENTERTAINMENT,amc entertainment amc enters 250m atm agree...
3,SCIENCE,indian ocean temperature anomalies predict lon...
4,ENTERTAINMENT,amex platinum losing another way use monthly e...


In [None]:
stemmer = PorterStemmer()


def _stem_text(text):
    """Porter-stem each whitespace-separated token and re-join with single spaces."""
    return ' '.join(map(stemmer.stem, text.split()))


df['desc'] = df['desc'].apply(_stem_text)
df.head()

Unnamed: 0,category,desc
0,SCIENCE,divis scienc citi colleg new york news the div...
1,ENTERTAINMENT,monument sport entertain monument sport networ...
2,ENTERTAINMENT,amc entertain amc enter 250m atm agreement str...
3,SCIENCE,indian ocean temperatur anomali predict long t...
4,ENTERTAINMENT,amex platinum lose anoth way use monthli enter...


# DATA SPLITTING

In [None]:
X, Y = df['desc'], df['category']

# 80% train; the held-out 20% is then halved into validation and test (10% each).
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [None]:
# Tokenisation and padding settings used by the next cell.
vocab_size = 20_000     # passed to Tokenizer(num_words=...)
max_length = 150        # pad_sequences(maxlen=...)
trunc_type = 'post'     # pad_sequences(truncating=...)
padding_type = 'post'   # pad_sequences(padding=...)
oov_tok = "<OOV>"       # Tokenizer(oov_token=...)

In [None]:
# Fit the vocabulary on the training texts only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index

# Keep the raw test texts before X_test is replaced by integer sequences.
X_test_words = X_test


def _to_padded(texts):
    """Encode `texts` with the fitted tokenizer and pad/truncate to `max_length`."""
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )


# pad_sequences already returns NumPy arrays, so no extra np.array() copy is needed.
X_train = _to_padded(X_train)
X_val = _to_padded(X_val)
X_test = _to_padded(X_test)

# Fit the label encoder once, on the training labels, and reuse that mapping for
# validation and test. Re-fitting per split (fit_transform) would silently give a
# different label -> integer mapping if a split lacked one of the classes.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)
y_test = le.transform(y_test)

# One-hot targets for the softmax output trained with categorical_crossentropy.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=cat_nums)
y_val = tf.keras.utils.to_categorical(y_val, num_classes=cat_nums)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=cat_nums)

print(X_train.shape)
print(y_train.shape)

print(X_val.shape)
print(y_val.shape)

print(X_test.shape)
print(y_test.shape)

(6487, 150)
(6487, 3)
(811, 150)
(811, 3)
(811, 150)
(811, 3)


# EMBEDDING

In [None]:
# Download and unpack the GloVe 6B archive only when glove.6B.100d.txt is not
# already present in the working directory.
path_to_glove_file =  'glove.6B.100d.txt'
if not os.path.exists(path_to_glove_file):
  !wget http://nlp.stanford.edu/data/glove.6B.zip #downloading GloVe word embeddings
  !unzip -q glove.6B.zip #unzipping

--2024-07-26 06:42:33--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-07-26 06:42:33--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-07-26 06:42:34--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Initialise the embedding matrix with pre-trained GloVe vectors.
# Row i holds the vector for the token whose tokenizer index is i; the matrix is
# sized from the tokenizer vocabulary plus two spare rows.
num_tokens = len(tokenizer.word_index) + 2
embedding_dim = 100
hits = 0
misses = 0

# Each line of the GloVe file is "<word> <coef> <coef> ...". The encoding is given
# explicitly so parsing does not depend on the platform's default locale.
embeddings_index = {}
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Words not found in the embedding index keep their all-zeros row.
        misses += 1
        continue
    embedding_matrix[i] = embedding_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))

Found 400000 word vectors.
Converted 6824 words (2865 misses)


# LSTM MODEL

In [None]:
# Stop training once val_loss has not improved by at least min_delta for 3
# consecutive epochs, and roll the model back to the best epoch's weights.
# Without restore_best_weights the model would keep the weights from the last
# (already degraded) epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    min_delta=0.0001,
    restore_best_weights=True
)

In [None]:
embed_size = 100

# Frozen embedding layer initialised from embedding_matrix; mask_zero=True makes
# downstream layers skip the zero padding.
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    mask_zero=True,
    input_shape=[None],
    trainable=False,
)
# Bidirectional LSTM encoder followed by a softmax over the label columns.
recurrent_layer = Bidirectional(LSTM(256, dropout=0.4))
output_layer = Dense(y_train.shape[1], activation="softmax")

model = Sequential([embedding_layer, recurrent_layer, output_layer])

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         969100    
                                                                 
 bidirectional (Bidirection  (None, 512)               731136    
 al)                                                             
                                                                 
 dense (Dense)               (None, 3)                 1539      
                                                                 
Total params: 1701775 (6.49 MB)
Trainable params: 732675 (2.79 MB)
Non-trainable params: 969100 (3.70 MB)
_________________________________________________________________


In [None]:
# Adam optimiser with categorical cross-entropy, matching the one-hot targets.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=opt,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for at most 100 epochs in batches of 32, evaluating on the validation
# split after each epoch; the early_stop callback may end training sooner.
history = model.fit(
    X_train, y_train,
    batch_size=32,
    validation_data=(X_val, y_val),
    epochs=100,
    callbacks=[early_stop]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [None]:
# Persist the trained network to lstm.h5 in the working directory.
model.save('lstm.h5')

  saving_api.save_model(


In [None]:
# Class probabilities for every test sequence.
preds = model.predict(X_test, verbose=1)
# Index of the most probable class per row.
pred_classes = preds.argmax(axis=1)



In [None]:
# Class names in the order the label encoder assigned integer codes.
encoded_classes = le.classes_
# Turn predicted indices and one-hot ground truth back into category names.
predicted_category = list(encoded_classes[pred_classes])
true_category = list(encoded_classes[np.argmax(y_test, axis=1)])

In [None]:
# Overall accuracy on the test split.
accuracy = accuracy_score(true_category, predicted_category)
print(f"Accuracy: {accuracy}")

# Support-weighted precision, recall and F1, computed with the same arguments.
precision, recall, f1 = (
    metric(true_category, predicted_category, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

# Per-class breakdown.
report = classification_report(true_category, predicted_category)
print("Classification Report:\n", report)

Accuracy: 0.968019680196802
Precision: 0.9685804332790802
Recall: 0.968019680196802
F1-Score: 0.968157944353166
Classification Report:
                precision    recall  f1-score   support

ENTERTAINMENT       0.99      0.96      0.98       278
      SCIENCE       0.94      0.96      0.95       256
          WAR       0.97      0.97      0.97       279

     accuracy                           0.97       813
    macro avg       0.97      0.97      0.97       813
 weighted avg       0.97      0.97      0.97       813



# RANDOM FOREST

In [None]:
# Flatten the sequences to use with traditional ML models
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_val_flat = X_val.reshape(X_val.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

In [None]:
# y_train / y_test are one-hot matrices built for the Keras model. Passed as-is,
# scikit-learn treats them as multilabel targets: the forest can predict zero or
# several classes per sample and the metrics become multilabel metrics (the
# "micro avg" / "samples avg" rows). Convert them back to one class index per sample.
y_train_labels = np.argmax(y_train, axis=1)
y_test_labels = np.argmax(y_test, axis=1)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_flat, y_train_labels)

# Predict on test set
y_test_pred_rf = rf_model.predict(X_test_flat)

# Evaluation
accuracy_rf = accuracy_score(y_test_labels, y_test_pred_rf)
precision_rf = precision_score(y_test_labels, y_test_pred_rf, average='weighted')
recall_rf = recall_score(y_test_labels, y_test_pred_rf, average='weighted')
f1_rf = f1_score(y_test_labels, y_test_pred_rf, average='weighted')

print("Random Forest Classification Report:")
print(classification_report(y_test_labels, y_test_pred_rf))
print(f"Accuracy: {accuracy_rf}")
print(f"Precision: {precision_rf}")
print(f"Recall: {recall_rf}")
print(f"F1-Score: {f1_rf}")

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.41      0.57       278
           1       0.95      0.48      0.64       256
           2       0.51      0.94      0.66       279

   micro avg       0.65      0.61      0.63       813
   macro avg       0.79      0.61      0.62       813
weighted avg       0.79      0.61      0.62       813
 samples avg       0.61      0.61      0.61       813

Accuracy: 0.6125461254612546
Precision: 0.7902288085587709
Recall: 0.6125461254612546
F1-Score: 0.6221786212741318


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Serialise the fitted random forest to random_forest_model.pkl in the working directory.
joblib.dump(rf_model, 'random_forest_model.pkl')

['random_forest_model.pkl']

# K-Nearest Neighbors (KNN)

In [None]:
# Train KNN
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_flat, y_train)

# Predict on test set
y_test_pred_knn = knn_model.predict(X_test_flat)

# Evaluation
accuracy_knn = accuracy_score(y_test, y_test_pred_knn)
precision_knn = precision_score(y_test, y_test_pred_knn, average='weighted')
recall_knn = recall_score(y_test, y_test_pred_knn, average='weighted')
f1_knn = f1_score(y_test, y_test_pred_knn, average='weighted')

print("K-Nearest Neighbors Classification Report:")
print(classification_report(y_test, y_test_pred_knn))
print(f"Accuracy: {accuracy_knn}")
print(f"Precision: {precision_knn}")
print(f"Recall: {recall_knn}")
print(f"F1-Score: {f1_knn}")

K-Nearest Neighbors Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.27      0.35       278
           1       0.39      0.30      0.34       256
           2       0.45      0.42      0.43       279

   micro avg       0.44      0.33      0.38       813
   macro avg       0.45      0.33      0.37       813
weighted avg       0.45      0.33      0.38       813
 samples avg       0.33      0.33      0.33       813

Accuracy: 0.3296432964329643
Precision: 0.4502669210365573
Recall: 0.3296432964329643
F1-Score: 0.3753443480564826


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Serialise the fitted KNN model to knn_model.pkl in the working directory.
joblib.dump(knn_model, 'knn_model.pkl')

['knn_model.pkl']