<a href="https://colab.research.google.com/github/jmlDC/MediaBias-Thesis22-23/blob/Modeling/bi_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

reference
https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/bidirectional_lstm_imdb.ipynb#scrollTo=6Rf3oHV3zOzD 

*   https://towardsdatascience.com/multiclass-text-classification-using-lstm-in-pytorch-eac56baed8df
*   https://towardsdatascience.com/sentiment-analysis-using-lstm-and-glove-embeddings-99223a87fe8e



## Installs

In [1]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding
from keras.layers import LSTM, Activation, Dropout, Dense, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.models import Model

import re
import spacy
import string
from collections import Counter
import torch
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn


In [1]:
# !pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Collec

## Gdrive

In [2]:
from google.colab import drive
drive.mount('/content/gdrive')

dir  = "/content/gdrive/MyDrive/THESIS-MS/Git-Thesis22-23/"

Mounted at /content/gdrive


## Dataset

In [27]:
max_features = 200000  # Only consider the top 20k words
maxLen = 150  # Only consider the first 200 words of each movie review

In [33]:
import pandas as pd

df = pd.read_csv(f'{dir}Official/MFC_prepared.csv', usecols=["code_frames", "annotations"], header=0)
df.code_frames = df.code_frames.astype(int)

label_dict = {}
for x in range(15):
    label_dict[x+1] = x

label_dict
df['label'] = df.code_frames.replace(label_dict)
df = df.drop(["code_frames"], axis=1)
df

Unnamed: 0,annotations,label
0,Immigrants without HOPE need help entering col...,9
1,"But in the eyes of the law, he is an illegal i...",4
2,"Reaction to Tancredo, Lamm as predicted",14
3,"That, said the congressman, is what always hap...",12
4,"$50,000 per entry",0
...,...,...
46794,Smoking is becoming a social taboo,10
46795,Nor does it aid lawyers seeking novel ways to...,4
46796,'Ashes to Ashes',14
46797,SMOKE SCREEN IS SEEN BEHIND THE SMOKELESS,14


## Prep

### Tain and Test Splitting

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(df.annotations.values, 
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42,
                                                  stratify=df.label.values
                                                )

In [35]:
X_train.shape

(39779,)

### tokenizing

In [46]:
tokenizer = Tokenizer(num_words=200)
tokenizer.fit_on_texts(X_train)

In [47]:
words_to_index = tokenizer.word_index

### GLOVE

In [51]:
def read_glove_vector(glove_vec):
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)

  return word_to_vec_map

word_to_vec_map = read_glove_vector(f'{dir}glove.6B/glove.6B.100d.txt')

In [52]:
# added minus 1, due to out of bounds
vocab_len = len(words_to_index) 
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
    emb_matrix[index-1, :] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)


## Model

### Model1

In [48]:
# No glove

# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# embeddings = embedding_layer(inputs)                    #added this for glove
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(max_features, 128)(inputs)

# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier
outputs = layers.Dense(1, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.summary()


Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_1 (Embedding)     (None, None, 128)         25600000  
                                                                 
 bidirectional_2 (Bidirectio  (None, None, 128)        98816     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 25,797,761
Trainable params: 25,797,761
Non-t

In [49]:
X_train_indices = tokenizer.texts_to_sequences(X_train)
X_val_indices = tokenizer.texts_to_sequences(X_val)

x_train = keras.utils.pad_sequences(X_train_indices, maxlen=maxLen)
x_val = keras.utils.pad_sequences(X_val_indices, maxlen=maxLen)


In [50]:
model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_val, y_val))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f39984b5150>

### Model2

In [53]:
def media_frames(input_shape):
  X_indices = keras.Input(input_shape)
  embeddings = embedding_layer(X_indices)
  X = LSTM(128, return_sequences=True) (embeddings)
  X = Dropout(0.6)(X)
  X = LSTM(128, return_sequences=True)(X)
  X = Dropout(0.6)(X)
  X = LSTM(128)(X)
  X = Dense(1, activation='sigmoid')(X)
  model = Model(inputs=X_indices, outputs=X)
  return model

In [56]:
model_2 = media_frames((maxLen,))
model_2.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 150)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 150, 100)          3089000   
                                                                 
 lstm_7 (LSTM)               (None, 150, 128)          117248    
                                                                 
 dropout_2 (Dropout)         (None, 150, 128)          0         
                                                                 
 lstm_8 (LSTM)               (None, 150, 128)          131584    
                                                                 
 dropout_3 (Dropout)         (None, 150, 128)          0         
                                                                 
 lstm_9 (LSTM)               (None, 128)               1315

In [57]:
X_train_indices = tokenizer.texts_to_sequences(X_train)
X_train_indices = keras.utils.pad_sequences(X_train_indices, maxlen=maxLen, padding='post')
adam = keras.optimizers.Adam(learning_rate = 0.0001)
model_2.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [58]:
model_2.fit(X_train_indices, y_train, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f39983092d0>

In [59]:
X_test_indices = tokenizer.texts_to_sequences(X_val)
X_test_indices = keras.utils.pad_sequences(X_test_indices, maxlen=maxLen, padding='post')
     

In [60]:
model_2.evaluate(X_test_indices, y_val)



[nan, 0.07122506946325302]

# Restart

In [61]:
# from datasets import Dataset
import pandas as pd

df = pd.read_csv(f'{dir}Official/MFC_prepared.csv', usecols=["code_frames", "annotations"], header=0)
df.code_frames = df.code_frames.astype(int)

label_dict = {}
for x in range(15):
    label_dict[x+1] = x

label_dict
df['label'] = df['code_frames'].apply(lambda x: label_dict[x])
df

Unnamed: 0,code_frames,annotations,label
0,10,Immigrants without HOPE need help entering col...,9
1,5,"But in the eyes of the law, he is an illegal i...",4
2,15,"Reaction to Tancredo, Lamm as predicted",14
3,13,"That, said the congressman, is what always hap...",12
4,1,"$50,000 per entry",0
...,...,...,...
46794,11,Smoking is becoming a social taboo,10
46795,5,Nor does it aid lawyers seeking novel ways to...,4
46796,15,'Ashes to Ashes',14
46797,15,SMOKE SCREEN IS SEEN BEHIND THE SMOKELESS,14


In [62]:
df_2 = df.copy()
df_2['annotations_length'] = df_2['annotations'].apply(lambda x: len(x.split()))


In [63]:
np.mean(df_2['annotations_length'])

18.55368704459497

In [64]:
df_2= df_2.drop(['code_frames', 'annotations_length'], axis=1)
df_2

Unnamed: 0,annotations,label
0,Immigrants without HOPE need help entering col...,9
1,"But in the eyes of the law, he is an illegal i...",4
2,"Reaction to Tancredo, Lamm as predicted",14
3,"That, said the congressman, is what always hap...",12
4,"$50,000 per entry",0
...,...,...
46794,Smoking is becoming a social taboo,10
46795,Nor does it aid lawyers seeking novel ways to...,4
46796,'Ashes to Ashes',14
46797,SMOKE SCREEN IS SEEN BEHIND THE SMOKELESS,14


In [65]:
#tokenization
tok = spacy.load('en_core_web_sm')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [88]:
#count number of occurences of each word
counts = Counter()
for index, row in df_2.iterrows():
    counts.update(tokenize(row['annotations']))
    
#deleting infrequent words
print("num_words before:",len(counts.keys()))

for word in list(counts):
    if counts[word] < 2:
        del counts[word]
print("num_words after:",len(counts.keys()))


num_words before: 28683
num_words after: 17847


In [89]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [91]:
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [92]:
df_2['encoded'] = df_2['annotations'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df_2.head()

  df_2['encoded'] = df_2['annotations'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))


Unnamed: 0,annotations,label,encoded
0,Immigrants without HOPE need help entering col...,9,"[[2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,"But in the eyes of the law, he is an illegal i...",4,"[[9, 10, 11, 12, 13, 11, 14, 15, 16, 17, 18, 1..."
2,"Reaction to Tancredo, Lamm as predicted",14,"[[21, 22, 23, 15, 24, 25, 26, 0, 0, 0, 0, 0, 0..."
3,"That, said the congressman, is what always hap...",12,"[[27, 15, 28, 11, 29, 15, 17, 30, 31, 32, 15, ..."
4,"$50,000 per entry",0,"[[44, 45, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [93]:
Counter(df_2['label'])

Counter({9: 1686,
         4: 9690,
         14: 1324,
         12: 9812,
         0: 3336,
         6: 5039,
         10: 2818,
         7: 1122,
         8: 2575,
         13: 407,
         3: 1187,
         1: 254,
         5: 3413,
         2: 1486,
         11: 2650})

In [94]:
X = list(df_2['encoded'])
y = list(df_2['label'])
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=df_2['label'])

In [96]:
class customDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [97]:
train_ds = customDataset(X_train, y_train)
valid_ds = customDataset(X_valid, y_valid)

In [99]:
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            print(y)
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [100]:
batch_size = 5000
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [102]:
def load_glove_vectors(glove_file=f"{dir}glove.6B/glove.6B.50d.txt"):
    """Load the glove word vectors"""
    word_vectors = {}
    with open(glove_file, 'r', encoding='UTF-8') as f:
        for line in f:
            split = line.split()
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
    return word_vectors

In [114]:
def get_emb_matrix(pretrained, word_counts, emb_size = 50):
    """ Creates embedding matrix from word vectors"""
    vocab_size = len(word_counts) + 2
    vocab_to_idx = {}
    vocab = ["", "UNK"]
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[0] = np.zeros(emb_size, dtype='float32') # adding a vector for padding
    W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words 
    vocab_to_idx["UNK"] = 1
    i = 2
    for word in word_counts:
        if word in word_vecs:
            W[i] = word_vecs[word]
        else:
            W[i] = np.random.uniform(-0.25,0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
        i += 1   
    return W, np.array(vocab), vocab_to_idx

In [115]:
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts)

In [118]:
class LSTM_glove_vecs(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = False ## freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 15)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [119]:
model = LSTM_glove_vecs(vocab_size, 50, 50, pretrained_weights)

In [None]:
train_model(model, epochs=10, lr=0.1)

tensor([12,  4, 12,  ..., 12,  5, 12])
tensor([ 9, 12,  4,  ...,  5, 10,  6])
tensor([ 6,  6,  2,  ...,  8, 12,  4])
tensor([12, 12,  4,  ...,  2, 12,  5])
tensor([5, 5, 5,  ..., 4, 4, 6])
tensor([ 6,  0,  5,  ...,  4,  5, 12])
tensor([ 4,  0, 12,  ...,  4, 13,  9])
tensor([ 0,  4,  6,  ..., 12,  6, 11])
