In [70]:
from google.colab import drive
drive.mount('/content/drive')

# Access the dataset on https://drive.google.com/file/d/1uUXRx6jd-UpxGrakahBy_i14JyXdD6nz/view?usp=drive_link
# Download and upload the dataset to your drive and access it.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
df = pd.read_json('News_Category_Dataset_v3.json',lines = True)
df.drop(['link','authors','date'],axis=1,inplace=True)

exclude_categories = ['U.S. NEWS', 'WORLD NEWS', 'WEIRD NEWS', 'LATINO VOICES', 'IMPACT', 'WEDDINGS', 'COLLEGE', 'PARENTS','THE WORLDPOST', 'GOOD NEWS', 'WORLDPOST','FIFTY', 'DIVORCE',
                      'EDUCATION', 'RELIGION']

df = df[~df['category'].isin(exclude_categories)]
df

In [None]:
#CLEANING UP THE DATA SET
import re
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer


ps = PorterStemmer()

def replace_hyphen(text):
  return text.replace('-', ' ')
def clean_text(text):
  text = text.lower()
  text = re.sub(r'<.*?>', '', text)
  text = re.sub(r'[^a-zA-Z\s]','',text)
  return text

def tokenize(text):
  return word_tokenize(text)

stopwords = set(stopwords.words('english'))
def remove_stopwords(tokens):
  return [word for word in tokens if word not in stopwords]

def lemmatize_words(tokens):
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(word) for word in tokens]

In [21]:
def preprocess(text):
  text = replace_hyphen(text)
  text = clean_text(text)
  tokens = tokenize(text)
  tokens = remove_stopwords(tokens)
  # tokens = stem_words(tokens)
  tokens = lemmatize_words(tokens)
  return ' '.join(tokens)

In [22]:
df['headline'] = df['headline'].apply(preprocess)
df['short_description'] = df['short_description'].apply(preprocess)
df

Unnamed: 0,headline,category,short_description
2,funniest tweet cat dog week sept,COMEDY,dog dont understand could eaten
3,funniest tweet parent week sept,PARENTING,accidentally put grown toothpaste toddler toot...
8,new documentary capture complexity child immig...,CULTURE & ARTS,mija director isabel castro combined music doc...
13,twitch ban gambling site streamer scam folk,TECH,one man claim scammed people platform caused s...
16,reboot clever navel gazey look inside tv reboots,CULTURE & ARTS,starring keegan michael key judy greer johnny ...
...,...,...,...
209522,rim ceo thorsten heins significant plan blackb...,TECH,verizon wireless att already promoting lte dev...
209523,maria sharapova stunned victoria azarenka aust...,SPORTS,afterward azarenka effusive press normal credi...
209524,giant patriot jet colt among improbable super ...,SPORTS,leading super bowl xlvi talked game could end ...
209525,aldon smith arrested er linebacker busted duo,SPORTS,correction earlier version story incorrectly s...


In [None]:
df['news'] = df['headline'] + ' ' + df['short_description']
df.drop(['headline','short_description'],axis=1,inplace=True)
df


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def merge_categories(df, categories_to_merge, new_category_name):
  df['category'] = df['category'].apply(lambda x: new_category_name if x in categories_to_merge else x)
  return df

df = merge_categories(df, ['ARTS & CULTURE', 'CULTURE & ARTS','ARTS'], 'ARTS & CULTURE')
df = merge_categories(df, ['WELLNESS', 'HEALTHY LIVING', 'TASTE'], 'WELLNESS & HEALTH')
df = merge_categories(df, ['COMEDY', 'ENTERTAINMENT'], 'ENTERTAINMENT')
df = merge_categories(df, ['ENVIRONMENT', 'GREEN'], 'ENVIORNMENT')
df = merge_categories(df, ['SCIENCE', 'TECH'], 'TECH')
df = merge_categories(df, ['BUSINESS', 'MONEY'], 'MONEY & BUSINESS')
df = merge_categories(df, ['STYLE & BEAUTY', 'STYLE'], 'STYLE & BEAUTY')
df = merge_categories(df, ['QUEER VOICES','BLACK VOICES','WOMEN', 'HOME'], 'EMPOWERMENT')
df = merge_categories(df, ['PARENTING', 'PARENTS'], 'PARENTING')

plt.figure(figsize=(10,6))
sns.countplot(y='category',data=df)
plt.yticks(rotation=0)
plt.show()

In [57]:

# print(df['news'].str.len().max())
# df = df[df['news'].str.len() <= 200]
# print(df['news'].str.len().max())

1008
200


In [72]:
#MODEL IMPLEMENTATION

from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, GRU, Dense, Dropout, GlobalMaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
import tensorflow as tf

MAX_VOCAB_SIZE = 20000
MAX_SEQUENCE_LENGTH = 1010
EMBEDDING_DIM = 100
FILTERS = 128
KERNEL_SIZE = 5
LSTM_UNITS = 64
DROPOUT_RATE = 0.5
BATCH_SIZE = 32
EPOCHS = 10


#tokenize and pad sequence
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(df['news'])
sequences = tokenizer.texts_to_sequences(df['news'])
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df['category'], test_size=0.3, random_state=42)

In [67]:
#applying SMOTE(Synthetic minority over-sampling technique) on training dataset.
# from imblearn.over_sampling import SMOTE

# smote = SMOTE(random_state=42)
# X_train, y_train = smote.fit_resample(X_train, y_train)


In [73]:
#one hot encoding of categorical values

from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test) #using the same encoder.

y_train = to_categorical(y_train, num_classes=len(df['category'].unique()))
y_test = to_categorical(y_test, num_classes=len(df['category'].unique()))

In [74]:
model = Sequential()
model.add(Embedding(input_dim=MAX_VOCAB_SIZE+1, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)) # Embedding layer

model.add(Conv1D(filters=FILTERS, kernel_size=KERNEL_SIZE, activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(LSTM(LSTM_UNITS, return_sequences=False))
model.add(Dense(len(df['category'].unique()), activation='relu'))
model.add(Dropout(DROPOUT_RATE))
model.add(Dense(len(df['category'].unique()), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=BATCH_SIZE, epochs=EPOCHS, validation_data=(X_test, y_test))







Epoch 1/10
[1m   2/3537[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m24:31:27[0m 25s/step - accuracy: 0.0703 - loss: 2.7742

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np


y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
accuracy = accuracy_score(np.argmax(y_test, axis=1), y_pred)
print("Accuracy:", accuracy)
print(classification_report(np.argmax(y_test, axis=1), y_pred))
print(confusion_matrix(np.argmax(y_test, axis=1), y_pred))
model.summary()


In [None]:
from tensorflow.keras.models import load_model
model.save('news_category_model.h5')

Accuracy: 0.6691789098772499    

LETS TRY DOING HYPERPARAMETER TUNING TO FURTHER IMPROVE THE MODEL.

In [None]:


# for i, class_name in enumerate(label_encoder.classes_):
#   print(f"Class {i}: {class_name}")

'''
  According to the previous report:
  poor precision classes: Class 0, 1, 2, 4, 5, 9, 10, 12, 15, 16, 19, 20, and 23
  good precision classes: Class 6, 8, 13, 14, 17, 18, 21, and 22
'''

Class 0: ARTS & CULTURE
Class 1: BUSINESS
Class 2: COMEDY
Class 3: CRIME
Class 4: CULTURE & ARTS
Class 5: EDUCATION
Class 6: ENTERTAINMENT
Class 7: ENVIRONMENT
Class 8: FOOD & DRINK
Class 9: GREEN
Class 10: HEALTHY LIVING
Class 11: MEDIA
Class 12: MONEY
Class 13: PARENTING
Class 14: POLITICS
Class 15: RELIGION
Class 16: SCIENCE
Class 17: SPORTS
Class 18: STYLE & BEAUTY
Class 19: TASTE
Class 20: TECH
Class 21: TRAVEL
Class 22: WELLNESS
Class 23: WOMEN


'\n  According to the previous report:\n  poor precision classes: Class 0, 1, 2, 4, 5, 9, 10, 12, 15, 16, 19, 20, and 23\n  good precision classes: Class 6, 8, 13, 14, 17, 18, 21, and 22\n'