In [1]:
!pip install -U sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 648 kB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 15.4 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 40.6 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 58.0 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 40.3 MB/s 
Building wheels for collected pa

In [2]:
import pandas as pd

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing Dataset

In [22]:
train_Data =  pd.read_csv("/content/drive/MyDrive/SMAI/Data/ClickBait/Raw/ClickBaitTrain.csv")
train_bodies = train_Data[['targetTitle', 'appendedTargetParagraphs']]
train_stances =train_Data[['truthClass']]

test_Data =  pd.read_csv("/content/drive/MyDrive/SMAI/Data/ClickBait/Raw/ClickBaitTest.csv")
test_bodies = train_Data[['targetTitle', 'appendedTargetParagraphs']]
test_stances =train_Data[['truthClass']]

In [7]:
print(train_stances.columns)
train_stances.describe()

Index(['truthClass'], dtype='object')


Unnamed: 0,truthClass
count,21997.0
mean,0.25108
std,0.433644
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [8]:
print(train_bodies.columns)
train_bodies.describe()

Index(['targetTitle', 'appendedTargetParagraphs'], dtype='object')


Unnamed: 0,targetTitle,appendedTargetParagraphs
count,21997,21881
unique,21048,21138
top,CBSN - Live Streaming Video News Channel,Still Watching?Rotate Device
freq,31,32


## Preprocessing and Lemmatization

In [9]:
# Load the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [16]:
import nltk #Import NLTK ---> Natural Language Toolkit
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# create a function to tokenize the data
def preprocess_data(data, column):
  
  # 1. Tokenization
  tk = RegexpTokenizer('\s+', gaps = True)
  text_data = [] # List for storing the tokenized data
  for values in data.loc[ : , column].values.tolist():
    values = str(values)
    tokenized_data = tk.tokenize(values) # Tokenize the news
    text_data.append(tokenized_data) # append the tokenized data

  # 2. Stopword Removal

  # Extract the stopwords
  sw = stopwords.words('english')
  clean_data = [] # List for storing the clean text
  # Remove the stopwords using stopwords
  for data in text_data:
    clean_text = [words.lower() for words in data if words.lower() not in sw]
    clean_data.append(clean_text) # Appned the clean_text in the clean_data list
  
  # 3. Stemming

  # Create a stemmer object
  ps = PorterStemmer()
  stemmed_data = [] # List for storing the stemmed data
  for data in clean_data:
    stemmed_text = [ps.stem(words) for words in data] # Stem the words
    stemmed_text = " ".join(stemmed_text)
    stemmed_data.append(stemmed_text) # Append the stemmed text
  
  return stemmed_data



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
train_titles_ppd=preprocess_data(train_bodies, 'targetTitle')
train_bodies_ppd=preprocess_data(train_bodies, 'appendedTargetParagraphs')

In [23]:
test_titles_ppd=preprocess_data(test_bodies, 'targetTitle')
test_bodies_ppd=preprocess_data(test_bodies, 'appendedTargetParagraphs')

In [24]:
train_titles_ppd_df = pd.DataFrame(train_titles_ppd,columns=["targetTitle"])
train_bodies_ppd_df = pd.DataFrame(train_bodies_ppd,columns=["TargetParagraphs"])
train_data = pd.concat([train_titles_ppd_df,train_bodies_ppd_df,train_stances],axis=1)
train_data.head()

test_titles_ppd_df = pd.DataFrame(test_titles_ppd,columns=["targetTitle"])
test_bodies_ppd_df = pd.DataFrame(test_bodies_ppd,columns=["TargetParagraphs"])
test_data = pd.concat([test_titles_ppd_df,test_bodies_ppd_df,test_stances],axis=1)


In [25]:
train_data.to_csv("/content/drive/MyDrive/SMAI/Data/ClickBait/preprocessed/ClickBaitTrain.csv")
test_data.to_csv("/content/drive/MyDrive/SMAI/Data/ClickBait/preprocessed/ClickBaitTest.csv")

## Vectorization

In [26]:
from sentence_transformers import SentenceTransformer


In [None]:
import csv
from tqdm import tqdm

model = SentenceTransformer('all-mpnet-base-v2')

filename = '/content/drive/MyDrive/SMAI/Data/ClickBait/vectorized/ClickBaitTrainTitles'
with open(filename, 'w', newline="") as file:
    csvwriter = csv.writer(file)
    for headline in tqdm (train_titles_ppd, desc="Encoding Titles..."):
      e = model.encode(headline)
      csvwriter.writerow(e)

filename = '/content/drive/MyDrive/SMAI/Data/ClickBait/vectorized/ClickBaitTrainBodies'
with open(filename, 'w', newline="") as file:
    csvwriter = csv.writer(file)
    for headline in tqdm (train_bodies_ppd, desc="Encoding Bodies..."):
      e = model.encode(headline)
      csvwriter.writerow(e)

Encoding Titles...:  69%|██████▉   | 15124/21997 [34:06<14:02,  8.16it/s]

## Learning Latent Representations

In [None]:
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers,Sequential
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Bidirectional, GRU, Layer
import tensorflow as tf

In [None]:
def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)


In [None]:
class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    

    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):


        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)


    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight((input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())


        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]


In [None]:
model = Sequential()
model.add(GRU(10, return_sequences=True))
model.add(AttentionWithContext())


In [None]:
model.summary()