<a href="https://colab.research.google.com/github/huynqcharles/News_Classification_NLP/blob/master/News_Classification_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DEPENDENCIES

In [None]:
import pandas as pd
import numpy as np
import re
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from google.colab import drive
import os

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# LOAD THE DATASET

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the line-delimited JSON dump, then discard the columns
# (authors, link, date) that are not used further in this notebook.
dataset = pd.read_json(
    '/content/drive/MyDrive/Datasets/News_Category_Dataset_v3.json',
    lines=True,
)
dataset.drop(columns=['authors', 'link', 'date'], inplace=True)
dataset.head()

Unnamed: 0,headline,category,short_description
0,Over 4 Million Americans Roll Up Sleeves For O...,U.S. NEWS,Health experts said it is too early to predict...
1,"American Airlines Flyer Charged, Banned For Li...",U.S. NEWS,He was subdued by passengers and crew when he ...
2,23 Of The Funniest Tweets About Cats And Dogs ...,COMEDY,"""Until you have a dog you don't understand wha..."
3,The Funniest Tweets From Parents This Week (Se...,PARENTING,"""Accidentally put grown-up toothpaste on my to..."
4,Woman Who Called Cops On Black Bird-Watcher Lo...,U.S. NEWS,Amy Cooper accused investment firm Franklin Te...


In [None]:
# Column dtypes and non-null counts of the raw dataset.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209527 entries, 0 to 209526
Data columns (total 3 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   headline           209527 non-null  object
 1   category           209527 non-null  object
 2   short_description  209527 non-null  object
dtypes: object(3)
memory usage: 4.8+ MB


In [None]:
# Per-column summary (count / unique / top / freq) of the raw dataset.
dataset.describe()

Unnamed: 0,headline,category,short_description
count,209527,209527,209527.0
unique,207996,42,187022.0
top,Sunday Roundup,POLITICS,
freq,90,35602,19712.0


In [None]:
# Number of distinct category labels in the raw (uncleaned) dataset;
# used later as the width of the one-hot label vectors.
cat_nums = dataset['category'].nunique()

In [None]:
# Report the number of categories, then show how many rows each one has.
print(f"We have a total of {cat_nums} categories")
dataset['category'].value_counts()

We have a total of 42 categories


category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI

# PRE-PROCESSING

In [None]:
# Clean a copy so the raw `dataset` frame is left untouched by the steps below.
df = dataset.copy()

In [None]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
df.duplicated().sum()

471

In [None]:
# Remove exact duplicate rows in place, keeping the last occurrence of each.
df.drop_duplicates(keep='last', inplace=True)

In [None]:
# Count rows whose (short_description, headline) pair repeats an earlier row,
# regardless of the category they were filed under.
df.duplicated(subset=['short_description','headline']).sum()

18

In [None]:
# Drop rows with a repeated (short_description, headline) pair in place,
# keeping the last occurrence of each pair.
df.drop_duplicates(subset=['short_description','headline'],keep='last',inplace=True)

In [None]:
# How many rows have an empty-string headline?
print(len(df[df['headline'] == ""]))

2


In [None]:
# Mark empty-string headlines as missing so dropna can remove those rows,
# then re-count to confirm none are left.
df.loc[df['headline'] == "", 'headline'] = np.nan
df.dropna(subset=['headline'], inplace=True)
print(len(df[df['headline'] == ""]))

0


In [None]:
# How many rows have an empty-string short description?
print(len(df[df['short_description'] == ""]))

19610


In [None]:
# Mark empty-string descriptions as missing so dropna can remove those rows,
# then re-count to confirm none are left.
df.loc[df['short_description'] == "", 'short_description'] = np.nan
df.dropna(subset=['short_description'], inplace=True)
print(len(df[df['short_description'] == ""]))

0


In [None]:
# Shuffle the rows with a fixed seed. The train/validation/test split further
# down is seeded, but it only selects row positions, so an unseeded shuffle
# here would still put different articles in each split on every run.
df = shuffle(df, random_state=42)
# Renumber the index 0..n-1 and discard the pre-shuffle index.
df.reset_index(inplace=True, drop=True)

In [None]:
# Peek at the first rows of the shuffled, re-indexed frame.
df.head()

Unnamed: 0,headline,category,short_description
0,Noise,IMPACT,We had a wonderful time and enjoyed all the at...
1,"The Week In Arts & Culture: Shia Quits, Marina...",CULTURE & ARTS,We've been waiting for the day when Carrie Bro...
2,Anti-Hate Group Condemns Donald Trump's Closin...,POLITICS,"""This needs to stop.”"
3,Let's Make Puglia the New Umbria,TRAVEL,You know how certain regions of certain countr...
4,Bill Striking Derogatory Terms From Federal La...,POLITICS,"Congress passed the bill, which would remove w..."


In [None]:
# Merge headline and short description into one text field ("desc"), joined
# by a hyphen, and drop the two source columns.
df['desc'] = df['headline'].astype(str) + "-" + df['short_description']
df.drop(columns=['headline', 'short_description'], inplace=True)
# astype returns a new frame rather than converting in place, so the result
# must be assigned back for the cast to take effect.
df = df.astype(str)
df.head()

Unnamed: 0,category,desc
0,IMPACT,Noise-We had a wonderful time and enjoyed all ...
1,CULTURE & ARTS,"The Week In Arts & Culture: Shia Quits, Marina..."
2,POLITICS,Anti-Hate Group Condemns Donald Trump's Closin...
3,TRAVEL,Let's Make Puglia the New Umbria-You know how ...
4,POLITICS,Bill Striking Derogatory Terms From Federal La...


# DATA SPLITTING

In [None]:
# Model input is the merged text; the target is the category label.
X = df['desc']
Y = df['category']

# 80% train / 10% validation / 10% test:
# hold out 20% of the rows first, then cut that hold-out in half.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42
)

In [None]:
# Text-vectorisation settings shared by the tokenizer and padding calls.
vocab_size = 20000      # passed to the tokenizer as num_words
max_length = 150        # every sequence is padded/truncated to this length
padding_type = 'post'   # pad at the end of a sequence
trunc_type = 'post'     # truncate from the end of a sequence
oov_tok = "<OOV>"       # token substituted for out-of-vocabulary words

In [None]:
# Build the vocabulary from the training texts only, so nothing from the
# validation/test texts influences the tokenizer.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)

word_index = tokenizer.word_index


def _to_padded_ids(texts):
    """Convert raw texts to integer-id sequences of length ``max_length``.

    Uses the fitted ``tokenizer`` and the padding/truncation settings defined
    in the configuration cell. Returns the array produced by ``pad_sequences``.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=max_length,
        padding=padding_type,
        truncating=trunc_type,
    )


# Keep the raw test texts; the evaluation section shows them next to the labels.
X_test_words = X_test

X_train = _to_padded_ids(X_train)
X_val = _to_padded_ids(X_val)
X_test = _to_padded_ids(X_test)

# Fit the label encoder ONCE, on the training labels, and reuse that mapping
# for validation and test. Re-fitting on each split rebuilds the
# label -> integer mapping from whichever labels that split contains, and
# leaves `le.classes_` (used later to decode predictions) describing only the
# last split it was fitted on. `transform` raises on a label that was absent
# from training, which surfaces the problem instead of silently mis-encoding.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)
y_test = le.transform(y_test)

# One-hot encode the integer labels for categorical cross-entropy.
y_train = to_categorical(y_train, num_classes=cat_nums)
y_val = to_categorical(y_val, num_classes=cat_nums)
y_test = to_categorical(y_test, num_classes=cat_nums)

# Sanity check: feature and label shapes of each split.
for features, labels in ((X_train, y_train), (X_val, y_val), (X_test, y_test)):
    print(features.shape)
    print(labels.shape)

(151540, 150)
(151540, 42)
(18943, 150)
(18943, 42)
(18943, 150)
(18943, 42)


# EMBEDDING

In [None]:
# Download and unpack the GloVe 6B archive only when the 100-dimensional
# vectors file is not already present in the working directory.
path_to_glove_file =  'glove.6B.100d.txt'
if not os.path.exists(path_to_glove_file):
  !wget http://nlp.stanford.edu/data/glove.6B.zip #downloadingu glove vec word embeddings
  !unzip -q glove.6B.zip #unzipping

--2024-06-06 11:49:51--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-06-06 11:49:52--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-06-06 11:49:52--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# Initialise the embedding matrix with pre-trained GloVe vectors.
# Tokenizer word indices start at 1 (0 is reserved for padding), so
# len(word_index) + 1 rows are required; the + 2 used here leaves one spare row.
num_tokens = len(tokenizer.word_index) + 2
embedding_dim = 100
hits = 0
misses = 0

# Parse the GloVe text file: each line is "<word> <v1> <v2> ... <vN>".
# Decode explicitly as UTF-8 rather than relying on the platform's default
# encoding, which can fail on non-ASCII tokens outside a UTF-8 locale.
embeddings_index = {}
with open(path_to_glove_file, encoding="utf-8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

# Prepare embedding matrix: row i receives the GloVe vector of the word whose
# tokenizer index is i.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        # Words not found in the GloVe index keep their all-zero row. The
        # padding row (index 0) is never written, so it stays all-zero too.
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Found 400000 word vectors.
Converted 66562 words (39694 misses)


# BUILD THE LSTM MODEL

In [None]:
# Stop training once validation loss has failed to improve by at least
# `min_delta` for `patience` consecutive epochs. `restore_best_weights=True`
# rolls the model back to the best epoch's weights on stopping; without it the
# model that gets saved and evaluated carries the weights of the final epoch,
# i.e. `patience` epochs past the best one.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,
    min_delta=0.0001,
    restore_best_weights=True,
)

In [None]:
# Not referenced by the layers below, which size the embedding via `embedding_dim`.
embed_size = 100

# Frozen embedding layer initialised from the pre-computed GloVe matrix;
# index 0 is masked so padded positions are skipped by the recurrent layer.
glove_embedding = Embedding(
    input_dim=num_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    mask_zero=True,
    input_shape=[None],
    trainable=False,
)
# Bidirectional LSTM encoder, then a softmax with one unit per label column.
sequence_encoder = Bidirectional(LSTM(256, dropout=0.4))
category_head = Dense(y_train.shape[1], activation="softmax")

model = Sequential([glove_embedding, sequence_encoder, category_head])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         10625800  
                                                                 
 bidirectional (Bidirection  (None, 512)               731136    
 al)                                                             
                                                                 
 dense (Dense)               (None, 42)                21546     
                                                                 
Total params: 11378482 (43.41 MB)
Trainable params: 752682 (2.87 MB)
Non-trainable params: 10625800 (40.53 MB)
_________________________________________________________________


In [None]:
# Adam with an explicit learning rate; the labels are one-hot encoded, hence
# categorical cross-entropy as the loss.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=opt,
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for at most 100 epochs; the early-stopping callback decides when to halt.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=32,
    callbacks=[early_stop],
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


In [None]:
# Persist the trained model to 'lstm.h5' in the current working directory.
model.save('lstm.h5')

  saving_api.save_model(


# MODEL EVALUATION

In [None]:
# Class probabilities for the test set, one row per sample.
preds = model.predict(X_test, verbose=1)
# Collapse each probability row to the index of its highest-scoring class.
pred_classes = preds.argmax(axis=1)



In [None]:
# Class names in encoder order: position i is the label encoded as integer i.
encoded_classes = le.classes_
# Decode the predicted indices and the one-hot ground truth back to names.
predicted_category = list(encoded_classes[pred_classes])
true_category = list(encoded_classes[y_test.argmax(axis=1)])

In [None]:
# Side-by-side table: raw test text, true label, predicted label.
result_columns = {
    'description': X_test_words,
    'true_category': true_category,
    'predicted_category': predicted_category,
}
result_df = pd.DataFrame(result_columns)
result_df

Unnamed: 0,description,true_category,predicted_category
2788,Ronald McDonald and Friends Sue Seattle to Sto...,POLITICS,POLITICS
127236,Silence = Death-When is silence lethal? Thirty...,QUEER VOICES,QUEER VOICES
86139,South Korea's Olympic Stadium Will Host Just 4...,SPORTS,SPORTS
108884,Goji Berry Could Protect Eyes From Diabetic Re...,WELLNESS,WELLNESS
151644,100 Millionaires And Billionaires Sign Open Le...,U.S. NEWS,BUSINESS
...,...,...,...
107868,Muslim Clothing Gets Chic With 'Hijab Couture'...,STYLE & BEAUTY,STYLE & BEAUTY
90109,7 Must-Haves for Effective Meetings-Running ef...,BUSINESS,WELLNESS
2268,Texas Senate Candidate Beto O'Rourke Says He's...,POLITICS,POLITICS
29451,D.C. Judge Approves Government Warrant Seeking...,POLITICS,POLITICS


In [None]:
# Keep only the rows where the prediction disagrees with the true label.
is_misclassified = result_df['true_category'].ne(result_df['predicted_category'])
mismatched_df = result_df[is_misclassified]
mismatched_df

Unnamed: 0,description,true_category,predicted_category
151644,100 Millionaires And Billionaires Sign Open Le...,U.S. NEWS,BUSINESS
178309,6 Ways to Overcome Disappointment-Setbacks are...,HEALTHY LIVING,WELLNESS
52970,"Washington Woman Quits Job, Takes 57 Days To F...",U.S. NEWS,WOMEN
24480,FBI Director Blames Crime On Police Misconduct...,POLITICS,CRIME
148765,Why We Must Help Bridge the Gap For Women In T...,WOMEN,BUSINESS
...,...,...,...
113,The Government Made Me Buy a Catalytic Convert...,MONEY,POLITICS
118287,Every Girl At This High School Got A Flower Fo...,GOOD NEWS,PARENTS
108855,Cute Clogs That Will Take You From Summer Into...,STYLE & BEAUTY,TRAVEL
165398,28 Rules for Fathers of Sons-Being a father --...,PARENTING,DIVORCE


In [None]:
# Spot check: look up one specific headline in the raw dataset and print its row(s).
check_df = dataset.loc[dataset['headline'] == "Elephant Killings: Thefts From Humanity"]

print(check_df)

                                      headline category  \
85881  Elephant Killings: Thefts From Humanity    GREEN   

                                       short_description  
85881  Something was taken from you, me, and all huma...  


In [None]:
# Display the results table again.
result_df

Unnamed: 0,description,true_category,predicted_category
2788,Ronald McDonald and Friends Sue Seattle to Sto...,POLITICS,POLITICS
127236,Silence = Death-When is silence lethal? Thirty...,QUEER VOICES,QUEER VOICES
86139,South Korea's Olympic Stadium Will Host Just 4...,SPORTS,SPORTS
108884,Goji Berry Could Protect Eyes From Diabetic Re...,WELLNESS,WELLNESS
151644,100 Millionaires And Billionaires Sign Open Le...,U.S. NEWS,BUSINESS
...,...,...,...
107868,Muslim Clothing Gets Chic With 'Hijab Couture'...,STYLE & BEAUTY,STYLE & BEAUTY
90109,7 Must-Haves for Effective Meetings-Running ef...,BUSINESS,WELLNESS
2268,Texas Senate Candidate Beto O'Rourke Says He's...,POLITICS,POLITICS
29451,D.C. Judge Approves Government Warrant Seeking...,POLITICS,POLITICS


In [None]:
# Fraction of test rows whose predicted category equals the true category.
# NOTE(review): this notebook imports `sklearn` and a few of its submodules,
# but never `sklearn.metrics` explicitly; the attribute is presumably available
# as a side effect of those other imports. Consider importing it explicitly.
print(f"Accuracy is {sklearn.metrics.accuracy_score(result_df['true_category'], result_df['predicted_category'])}")

Accuracy is 0.6458322335427334


# NEW INPUT DATA

In [None]:
# Define function to preprocess and predict
def predict_category(text, tokenizer, model, le, max_length=150):
    """Predict the category label for a single piece of text.

    Args:
        text: The raw text to classify.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        model: Trained model exposing ``predict``.
        le: Fitted label encoder exposing ``inverse_transform``.
        max_length: Length every token sequence is padded/truncated to.

    Returns:
        The decoded label for ``text``.
    """
    # The tokenizer works on batches, so wrap the text in a one-item batch.
    token_ids = tokenizer.texts_to_sequences([text])
    model_input = pad_sequences(
        token_ids, maxlen=max_length, padding='post', truncating='post'
    )

    # Score the batch and take the highest-probability class index per row.
    probabilities = model.predict(model_input, verbose=0)
    best_index = np.argmax(probabilities, axis=1)

    # Translate the index back to its label and unwrap the single result.
    return le.inverse_transform(best_index)[0]

In [None]:
new_text = 'The Times: Manchester City is taking legal action against the English Premier League over commercial rules'
# Store the result under its own name: `predicted_category` already holds the
# list of test-set predictions used to build the results table, and
# overwriting it with a single string would break a re-run of that cell.
new_text_category = predict_category(new_text, tokenizer, model, le)
print(f"Predicted category: {new_text_category}")

Predicted category: SPORTS


In [None]:
# Row count per category in the raw dataset, for reference.
dataset['category'].value_counts()

category
POLITICS          35602
WELLNESS          17945
ENTERTAINMENT     17362
TRAVEL             9900
STYLE & BEAUTY     9814
PARENTING          8791
HEALTHY LIVING     6694
QUEER VOICES       6347
FOOD & DRINK       6340
BUSINESS           5992
COMEDY             5400
SPORTS             5077
BLACK VOICES       4583
HOME & LIVING      4320
PARENTS            3955
THE WORLDPOST      3664
WEDDINGS           3653
WOMEN              3572
CRIME              3562
IMPACT             3484
DIVORCE            3426
WORLD NEWS         3299
MEDIA              2944
WEIRD NEWS         2777
GREEN              2622
WORLDPOST          2579
RELIGION           2577
STYLE              2254
SCIENCE            2206
TECH               2104
TASTE              2096
MONEY              1756
ARTS               1509
ENVIRONMENT        1444
FIFTY              1401
GOOD NEWS          1398
U.S. NEWS          1377
ARTS & CULTURE     1339
COLLEGE            1144
LATINO VOICES      1130
CULTURE & ARTS     1074
EDUCATI