<a href="https://colab.research.google.com/github/kilos11/DeeP_Learning_Tensor/blob/main/Classifying_News_Headlines.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classifying News Headlines

This notebook explains the classification of news headlines as sarcastic and non-sarcastic. We are using the same headlines data as used before.


In [1]:
##import the required libraries and APIs
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

print(tf.__version__)

2.15.0


## Downloading the News Headlines data

In [2]:
!wget --no-check-certificate \
    https://storage.googleapis.com/wdd-2-node.appspot.com/x1.json \
    -o /tmp/headlines.json

In [3]:
##read the the json file using pandas
import pandas as pd

data = pd.read_json("./x1.json")
data.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [4]:
##store headlines and labels in respective lists
headlines = list(data['headline'])
labels = list(data['is_sarcastic'])

## Set the parameters

In [5]:
# Define tokenizing and padding parameters

# Vocabulary size, which represents the maximum number of unique words in the dataset
vocab_size = 10000

# Maximum length of sequences after tokenization and padding
max_length = 120

# Embedding dimension, representing the size of the vector space in which words will be embedded
# (The actual value for embedding_dim needs to be specified, it's currently left blank with "__")
embedding_dim = __

# Truncation type for sequences that exceed the specified max_length ('post' truncates from the end)
trunc_type = 'post'

# Padding type for sequences shorter than the specified max_length ('post' pads at the end)
padding_type = 'post'

# Out-of-vocabulary token, used to represent words that are not present in the vocabulary
oov_tok = "<OOV>"

# Training size, representing the number of samples used for training the model
training_size = 20000


## Splitting the training and testing set

In [6]:
# Sentences

# Extract a subset of headlines for training from the 'headlines' dataset
training_sentences = headlines[0:training_size]

# Extract the remaining headlines for testing from the 'headlines' dataset
testing_sentences = headlines[training_size:]

# Labels

# Extract corresponding labels for training from the 'labels' dataset
training_labels = labels[0:training_size]

# Extract corresponding labels for testing from the 'labels' dataset
testing_labels = labels[training_size:]


## Preprocess sentences


In [7]:
# Tokenizer

# Create a Tokenizer object with a specified vocabulary size and out-of-vocabulary token
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)

# Fit the tokenizer on the training sentences to generate the vocabulary
tokenizer.fit_on_texts(training_sentences)

# Retrieve the word index, which is a dictionary mapping words to their numerical indices
word_index = tokenizer.word_index

# Tokenization and Padding for Training Set

# Convert the training sentences into sequences of numerical indices using the trained tokenizer
training_sequences = tokenizer.texts_to_sequences(training_sentences)

# Pad the sequences to ensure uniform length using the specified parameters
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type
)

# Tokenization and Padding for Testing Set

# Convert the testing sentences into sequences of numerical indices using the trained tokenizer
testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

# Pad the sequences to ensure uniform length using the specified parameters
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type
)


In [None]:
# convert lists into numpy arrays to make it work with TensorFlow 2.x
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

## Define the neural network model with the following layers:
1. Embedding layer
2. Global Average pooling layer(1D)
3. Dense layer with 24 nodes
4. Output Dense layer with `sigmoid` activation

In [None]:
# Define a Sequential model using TensorFlow's Keras API

# Create a Sequential model, which is a linear stack of layers
model = tf.keras.Sequential([

    # Embedding Layer
    # Embedding layer for word embeddings with input length set to max_length
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),

    # Global Average Pooling Layer
    # Reduces the spatial dimensions of the embeddings by taking the average over all positions
    tf.keras.layers.global_average_pooling1D(),

    # Dense Layer with ReLU Activation
    # Dense layer with 24 units and ReLU activation function
    tf.keras.layers.Dense(24, activation='relu'),

    # Output Dense Layer with Sigmoid Activation
    # Dense layer with 1 unit and sigmoid activation function for binary classification
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the Model

# Compile the model with binary crossentropy loss, Adam optimizer, and accuracy metric
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
model.summary()

## Train the model

Save it into history

In [None]:
# Number of Training Epochs

# Define the number of training epochs
num_epochs = 30

# Model Training

# Train the model on the training data

#  training_padded: Padded sequences of training sentences
#  training_labels: Corresponding labels for the training sentences
#  epochs: Number of times to iterate through the entire training dataset
#  validation_data: Data on which to evaluate the loss and any model metrics at the end of each epoch
#  verbose: Level of detail in training progress display (2 for more detailed output)
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2
)


## Visualise the train & validation accuracy and loss

In [None]:
import matplotlib.pyplot as plt

##plot the scores from history
def plot_graphs(history, string):
  plt.plot(history.history[string])
  plt.plot(history.history['val_'+string])
  plt.legend([string, 'val_'+string])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.show()

plot_graphs(history, "accuracy")
plot_graphs(history, "loss")

## Classifying a new sentence

In [11]:
sentence = ["the baby boy fears spiders in the garden might be real", "game of thrones season finale showing this sunday night"]

##prepare the sequences of the sentences in question
sequences = tokenizer.____(____)
padded_seqs = ____(____, maxlen=max_length, padding=padding_type, truncating=trunc_type)

print(model.predict(padded_seqs))

AttributeError: 'Tokenizer' object has no attribute '____'