# News Article Classification

In [None]:
!pip install feedparser 
!pip install newspaper3k

In [None]:
import pandas as pd
import feedparser as fp
from  newspaper import Article
from datetime import datetime
import requests
from bs4 import BeautifulSoup
import numpy as np
import time
import pytz
import random

In [None]:
# RSS/Atom feeds to scrape, keyed by the class label given to every article
# found in them. Each URL is listed once: a repeated feed would scrape the
# same articles twice, and the duplicates could land on both sides of the
# train/validation/test split.
sources = {
    'terrorism': [
        'https://www.state.gov/rss-feed/counterterrorism/feed/',
        'https://globalnews.ca/tag/terrorism/feed/',
        'https://www.9news.com.au/terrorism/rss',
    ],

    'political': [
        'https://globalnews.ca/politics/feed/',
        'https://www.aljazeera.com/xml/rss/all.xml',
    ],

    'protest': [
        'https://www.theguardian.com/world/protest/rss',
        'https://globalnews.ca/tag/protest/feed/',
    ],

    'natural_disaster': [
        # Single hyphen: the Guardian tag is "natural-disasters".
        'https://www.theguardian.com/world/natural-disasters/rss',
        'https://globalnews.ca/tag/natural-disasters/feed/',
    ],

    'positive': [
        'https://www.positive.news/feed/',
        'https://www.goodnewsnetwork.org/feed/',
    ],

    'others': [
        'https://www.theguardian.com/science/rss',
        'https://globalnews.ca/science/feed/',
        'https://www.sciencenews.org/feed',
    ],
}

In [None]:
def get_content(url):
    """Download the article at *url* and return its text as one lowercase line.

    Newlines in the extracted text are replaced by spaces.

    Returns an empty string when newspaper cannot download or parse the
    page, so that one unreachable article does not abort a whole scrape;
    callers can treat "" as "no content available".
    """
    # Local import: ArticleException lives in the same package as Article.
    from newspaper import ArticleException

    article = Article(url)
    try:
        article.download()
        article.parse()
    except ArticleException:
        return ""
    return article.text.replace("\n", " ").lower()

In [None]:
# Smoke test: fetch one article and display the first 10 characters of its text.
get_content("https://globalnews.ca/news/10219172/rcmp-quebec-restauarant-stabbing-terrorism/")[:10]

In [None]:
def fetch_data(sources):
    """Scrape every feed in *sources* into parallel title/content/label lists.

    Parameters
    ----------
    sources : dict
        Maps a class label to a list of feed URLs.

    Returns
    -------
    dict
        ``{"title": [...], "content": [...], "label": [...]}`` with one entry
        per feed item. Titles are lowercased, content comes from
        ``get_content`` and the label is the key the feed was listed under.

    Feeds that feedparser flags as malformed (``bozo``) are skipped and their
    URL is printed. For every usable feed the number of entries is printed.
    """
    main_ = {"title": [], "content": [], "label": []}
    for class_, feed_urls in sources.items():
        for feed_url in feed_urls:
            source_ = fp.parse(feed_url)
            if source_.bozo:
                # Malformed feed: report it and move on to the next one.
                print(feed_url)
                continue

            print(len(source_.entries))
            for data in source_.entries:
                content = get_content(data.link)
                if content == '':
                    # No article text could be extracted: report the article
                    # and fall back to the summary shipped in the feed itself.
                    print(data.link)
                    content = data.get("summary", "").lower()
                main_["title"].append(data.title.lower())
                main_["content"].append(content)
                main_["label"].append(class_)

    return main_

In [None]:
# Scrape all configured feeds into a dict of parallel lists.
data = fetch_data(sources)

In [None]:
# Sanity check: the three lists should have the same length.
len(data["title"]), len(data["content"]), len(data["label"])

In [None]:
# One row per article, with columns title / content / label.
df = pd.DataFrame(data)
df.head()

In [None]:
# Number of articles per label.
value_counts = df.label.value_counts()

In [None]:
import plotly.express as px

# Bar chart of the class distribution computed above.
fig = px.bar(x=value_counts.index, y=value_counts.values, labels={'x':'Category', 'y':'Count'})
fig.update_layout(title='Value Counts of Categories',
                  xaxis_title='Category',
                  yaxis_title='Count')

# Show the plot
fig.show()

In [None]:
import spacy
from nltk.stem.snowball import PorterStemmer , SnowballStemmer
from spacy.lang.en.stop_words import STOP_WORDS


# Stop-word lookup set used by remove_stopwords().
stop_words_set = set(STOP_WORDS)
# spaCy pipeline with the parser and NER components disabled; stemmer() only
# reads each token's lemma.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# English Snowball stemmer applied to the lemmas in stemmer().
stemmer_ = SnowballStemmer("english")

def remove_stopwords(text):
    """Return *text* lowercased with every token in ``stop_words_set`` removed.

    Tokens are obtained by splitting on whitespace and the survivors are
    re-joined with single spaces.
    """
    kept_tokens = (
        token for token in text.lower().split() if token not in stop_words_set
    )
    return " ".join(kept_tokens)


def stemmer(text):
    """Run *text* through the spaCy pipeline and stem each token's lemma.

    Returns the stemmed lemmas joined by single spaces.
    """
    return " ".join(stemmer_.stem(token.lemma_) for token in nlp(text))

In [None]:
from collections import Counter
import string
def word_counts(word_list):
    """Count whitespace-separated tokens over all strings in *word_list*.

    Returns a ``Counter`` mapping each token to its total number of
    occurrences.
    """
    return Counter(token for text in word_list for token in text.split())

def remove_puncs(text):
    """Return *text* with every character of ``string.punctuation`` removed.

    Only ASCII punctuation is stripped; all other characters, including
    whitespace, are left untouched.
    """
    # A translation table with a delete-set removes all the characters in a
    # single pass over the string.
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

In [None]:
# "soup" = title + article body, cleaned for modelling.
# Series.replace() only swaps cells whose *entire* value equals the pattern,
# so the "continue reading" boilerplate inside an article has to be removed
# with the substring-aware .str.replace() (literal match, no regex).
df["soup"] = df["title"] + " " + df["content"].str.replace('continue reading', " ", regex=False)
df["soup"] = df["soup"].apply(remove_stopwords)
df["soup"] = df["soup"].apply(stemmer)
df["soup"] = df["soup"].apply(remove_puncs)

In [None]:
# Five most frequent tokens across the whole corpus.
word_counts(df["soup"].to_list()).most_common()[:5]

In [None]:
# Class labels, in the order they are declared in `sources`.
classes = list(sources.keys())
# Print the six most frequent tokens of each class, followed by the class name.
for i in sources :
    temp = df.query(f" label == '{i}'")
    print(word_counts(temp["soup"].to_list()).most_common()[:6] , i)
    print()

In [None]:
# Token -> frequency over every cleaned document.
vocab = word_counts(df["soup"])
len(vocab)

In [None]:
# Number of distinct whitespace-separated tokens in the corpus.
vocab_len = len(vocab)
vocab_len

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Fit the Keras tokenizer on the cleaned documents; process_text() uses it
# to turn text into integer sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df.soup.to_list())

In [None]:
# Number of distinct tokens the tokenizer indexed.
len(tokenizer.index_word)

In [None]:
# Token count of every document, longest first.
len_ = sorted([len(i.split()) for i in df.soup.to_list()] , reverse = True)

In [None]:
len_ = np.array(len_)

In [None]:
# Five most common document lengths.
Counter(len_).most_common()[:5]

In [None]:
def process_text(texts, maxlen=500):
    """Turn *texts* into fixed-length integer sequences with the fitted tokenizer.

    Parameters
    ----------
    texts : list of str
        Documents to encode.
    maxlen : int, optional
        Length of every returned sequence (default 500). Longer documents
        are cut at the end and shorter ones are padded at the end.

    Returns
    -------
    The padded sequences produced by ``pad_sequences``, one row per text.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=maxlen, truncating='post', padding='post')

In [None]:
# Shuffle all rows (no seed is given, so the order differs between runs) and
# renumber the index from 0.
shuffled_df = df.sample(frac=1).reset_index(drop=True)
shuffled_df.head()

In [None]:
# Padded integer sequences, in the same row order as shuffled_df.
sequences = process_text(shuffled_df.soup.to_list())
sequences

In [None]:
# Length of one padded sequence.
len(sequences[0])

In [None]:
classes

In [None]:
shuffled_df.head()

In [None]:
# Labels as a single-column 2-D array, in the same row order as `sequences`.
labels = np.array(shuffled_df.label.to_list()).reshape(-1 , 1)
labels[:3]

In [None]:
from sklearn.preprocessing import OneHotEncoder , OrdinalEncoder

# The keyword that requests dense output was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was later removed.
# Encoding with the defaults and densifying the result works on every version.
one_hot = OneHotEncoder()
labels = one_hot.fit_transform(labels).toarray()
labels

In [None]:
# Column index of the one-hot matrix -> class name, as learned by the encoder.
dict_ = dict(enumerate(one_hot.categories_[0]))
dict_

In [None]:
from sklearn.model_selection import train_test_split

# 70% train; the remaining 30% is halved into test and validation (15% each).
x_train, x_temp, y_train, y_temp = train_test_split(sequences , labels, test_size=0.3, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=0.5, random_state=42)

In [None]:
# Wrap each split as a tf.data pipeline of (sequence, one-hot label) pairs.
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test))
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val))

In [None]:
# Scratch calculation; the result is only displayed, not stored.
267/8

In [None]:
# Shuffle each pipeline and group it into batches of 32 examples.
batch_size = 32
train_dataset = train_dataset.shuffle(buffer_size=10000).batch(batch_size)
test_dataset = test_dataset.shuffle(buffer_size=10000).batch(batch_size)
val_dataset = val_dataset.shuffle(buffer_size=10000).batch(batch_size)

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D , Dropout , GlobalMaxPooling1D , Input

In [None]:
# Device name reported by TensorFlow's GPU lookup; used as the tf.device()
# scope for building and training the model.
device = tf.test.gpu_device_name()
device

In [None]:
with tf.device(device) :
    # Bag-of-embeddings classifier: embed each token, max-pool over the
    # sequence, then two dropout-regularised dense layers and a softmax.
    model = Sequential([
        # The Keras Tokenizer numbers words from 1 and 0 is the padding
        # value, so the table needs len(word_index) + 1 rows; anything
        # smaller puts the highest word index out of range.
        Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=64, input_length=500),
        GlobalMaxPooling1D(),
        Dense(128, activation='relu'),
        Dropout(0.5),  # Adjust dropout rate as needed
        Dense(24, activation='relu'),
        Dropout(0.5),  # Adjust dropout rate as needed
        # One output unit per one-hot label column.
        Dense(labels.shape[1], activation='softmax'),  # Use 'softmax' for multi-class, 'sigmoid' for multi-label
    ])

    # Adam is referenced through the already-imported `tf` namespace; the
    # bare name `Adam` is not imported anywhere in this notebook.
    model.compile(loss='categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(0.01),
                  metrics=['accuracy'])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping , LearningRateScheduler , ModelCheckpoint , ReduceLROnPlateau , TensorBoard

# Stop once validation accuracy has not improved for 20 epochs and roll the
# model back to its best weights.
early_stopping = EarlyStopping(monitor='val_accuracy', 
                                   patience=20, verbose=1, 
                                   mode='max', restore_best_weights=True)

# Keep only the model with the best validation accuracy in model_article.h5.
model_checkpoint = ModelCheckpoint(filepath='model_article.h5', 
                                       monitor='val_accuracy', verbose=1, 
                                   save_best_only=True, mode='max')
    
# Multiply the learning rate by 0.2 after 2 epochs without improvement in
# validation accuracy, never going below 0.001.
reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.2,
                              patience=2, min_lr=0.001)
    

# Learning rates produced by scheduler() once decay has started, in call order.
lr_ind = []


def scheduler(epoch, lr):
    """Learning-rate schedule: constant for 30 epochs, then exponential decay.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index.
    lr : float
        Current learning rate.

    Returns
    -------
    float
        *lr* unchanged while ``epoch < 30``; afterwards ``lr * exp(-0.2)``.
        Every decayed value is also appended to ``lr_ind``.
    """
    import math  # local import keeps this cell self-contained

    if epoch < 30:
        return lr

    # Return a plain Python float rather than a tensor, since Keras expects
    # the schedule function to return a float.
    decayed = float(lr) * math.exp(-0.2)
    lr_ind.append(decayed)
    return decayed
    
# Keras callback that applies scheduler() to set the learning rate.
lr_schedule = LearningRateScheduler(scheduler)

In [None]:
# Train for up to 300 epochs, validating on val_dataset after each epoch.
# Only early stopping is active; the other callbacks are commented out below.
with tf.device(device):
    model.fit(
        train_dataset ,
        epochs=300 ,
        callbacks=[
            early_stopping #, model_checkpoint  , lr_schedule ,
        ],
    validation_data=val_dataset
    )