<a href="https://colab.research.google.com/github/jmlDC/MediaBias-Thesis22-23/blob/Modeling/bi_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

reference
https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/bidirectional_lstm_imdb.ipynb#scrollTo=6Rf3oHV3zOzD 

*   https://towardsdatascience.com/multiclass-text-classification-using-lstm-in-pytorch-eac56baed8df
*   https://towardsdatascience.com/sentiment-analysis-using-lstm-and-glove-embeddings-99223a87fe8e



### Installs

In [199]:
import numpy as np
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding
from keras.layers import LSTM, Activation, Dropout, Dense, Input, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from keras.models import Model

import re
import spacy
import string
from collections import Counter
import torch
from sklearn.metrics import mean_squared_error
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn


## Gdrive

In [16]:
from google.colab import drive
drive.mount('/content/gdrive')

dir  = "/content/gdrive/MyDrive/THESIS-MS/Git-Thesis22-23/"

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Dataset

In [17]:
import pandas as pd

df = pd.read_csv(f'{dir}Official/MFC_prepared.csv', usecols=["code_frames", "annotations"], header=0)
df.code_frames = df.code_frames.astype(int)

label_dict = {}
for x in range(15):
    label_dict[x+1] = x

label_dict
df['label'] = df.code_frames.replace(label_dict)
df = df.drop(["code_frames"], axis=1)
df

Unnamed: 0,annotations,label
0,Immigrants without HOPE need help entering col...,9
1,"But in the eyes of the law, he is an illegal i...",4
2,"Reaction to Tancredo, Lamm as predicted",14
3,"That, said the congressman, is what always hap...",12
4,"$50,000 per entry",0
...,...,...
46794,Smoking is becoming a social taboo,10
46795,Nor does it aid lawyers seeking novel ways to...,4
46796,'Ashes to Ashes',14
46797,SMOKE SCREEN IS SEEN BEHIND THE SMOKELESS,14


## Model LSTM

### Prep

In [201]:
max_features = 200000  # Only consider the top 20k words
maxLen = 150  # Only consider the first 200 words of each movie review

#### Tain and Test Splitting

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(df.annotations.values, 
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42,
                                                  stratify=df.label.values
                                                )

In [None]:
X_train.shape

(39779,)

#### tokenizing

In [None]:
tokenizer = Tokenizer(num_words=200)
tokenizer.fit_on_texts(X_train)

In [None]:
words_to_index = tokenizer.word_index

#### GLOVE

In [21]:
def read_glove_vector(glove_vec):
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)

  return word_to_vec_map

word_to_vec_map = read_glove_vector(f'{dir}glove.6B/glove.6B.100d.txt')

In [None]:
# added minus 1, due to out of bounds
vocab_len = len(words_to_index) 
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
    emb_matrix[index-1, :] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)


### Model1 no glove

In [205]:
# No glove

# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# embeddings = embedding_layer(inputs)                    #added this for glove
# Embed each integer in a 128-dimensional vector
x = layers.Embedding(max_features, 128)(inputs)
# Add 2 bidirectional LSTMs
x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
# Add a classifier
outputs = layers.Dense(15, activation="sigmoid")(x)
model = keras.Model(inputs, outputs)
model.summary()


Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_5 (Embedding)     (None, None, 128)         25600000  
                                                                 
 bidirectional_4 (Bidirectio  (None, None, 128)        98816     
 nal)                                                            
                                                                 
 bidirectional_5 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense_5 (Dense)             (None, 15)                1935      
                                                                 
Total params: 25,799,567
Trainable params: 25,799,567
Non-t

In [None]:
X_train_indices = tokenizer.texts_to_sequences(X_train)
X_val_indices = tokenizer.texts_to_sequences(X_val)

x_train = keras.utils.pad_sequences(X_train_indices, maxlen=maxLen)
x_val = keras.utils.pad_sequences(X_val_indices, maxlen=maxLen)


In [None]:
model.compile("adam", "sparse_categorical_crossentropy", metrics=["accuracy"])
model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_val, y_val))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f39984b5150>

### Model2 with glove


** https://github.com/ketanvaidya25/IMDb-Movie-Sentiment-Analysis/blob/main/IMDb_Movie_Sentiment_Analysis.ipynb



In [None]:
def media_frames(input_shape):
  X_indices = keras.Input(input_shape)
  embeddings = embedding_layer(X_indices)
  X = LSTM(128, return_sequences=True) (embeddings)
  X = Dropout(0.6)(X)
  X = LSTM(128, return_sequences=True)(X)
  X = Dropout(0.6)(X)
  X = LSTM(128)(X)
  X = Dense(14, activation='sigmoid')(X)
  model = Model(inputs=X_indices, outputs=X)
  return model

In [None]:
model_2 = media_frames((maxLen,))
model_2.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 150)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 150, 100)          3089000   
                                                                 
 lstm_7 (LSTM)               (None, 150, 128)          117248    
                                                                 
 dropout_2 (Dropout)         (None, 150, 128)          0         
                                                                 
 lstm_8 (LSTM)               (None, 150, 128)          131584    
                                                                 
 dropout_3 (Dropout)         (None, 150, 128)          0         
                                                                 
 lstm_9 (LSTM)               (None, 128)               1315

In [None]:
X_train_indices = tokenizer.texts_to_sequences(X_train)
X_train_indices = keras.utils.pad_sequences(X_train_indices, maxlen=maxLen, padding='post')
adam = keras.optimizers.Adam(learning_rate = 0.0001)
model_2.compile(optimizer=adam, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
model_2.fit(X_train_indices, y_train, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f39983092d0>

In [None]:
X_test_indices = tokenizer.texts_to_sequences(X_val)
X_test_indices = keras.utils.pad_sequences(X_test_indices, maxlen=maxLen, padding='post')
     

In [None]:
model_2.evaluate(X_test_indices, y_val)



[nan, 0.07122506946325302]

### Model3

In [18]:
import numpy as np 
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re

In [19]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 20000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 128
# This is fixed.
EMBEDDING_DIM = 100

In [20]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['annotations'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 34612 unique tokens.


In [21]:
X = tokenizer.texts_to_sequences(df['annotations'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', X.shape)


Shape of data tensor: (46799, 128)


In [22]:
Y = pd.get_dummies(df['label']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (46799, 15)


In [23]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, 
                                                    test_size = 0.15, 
                                                    random_state = 42, 
                                                    stratify=df.label.values)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(39779, 128) (39779, 15)
(7020, 128) (7020, 15)


Glove

In [24]:
embeddings_index = {}
f = open(f'{dir}glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [25]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

Model

In [26]:
model_lstm = Sequential()
model_lstm.add(Embedding(len(word_index) + 1, 
                    EMBEDDING_DIM, 
                    weights=[embedding_matrix], 
                    input_length=X.shape[1], 
                    trainable=False))
model_lstm.add(SpatialDropout1D(0.2))
model_lstm.add(LSTM(120, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(15, activation='softmax'))
model_lstm.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_lstm.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 128, 100)          3461300   
                                                                 
 spatial_dropout1d_2 (Spatia  (None, 128, 100)         0         
 lDropout1D)                                                     
                                                                 
 lstm_2 (LSTM)               (None, 120)               106080    
                                                                 
 dense_2 (Dense)             (None, 15)                1815      
                                                                 
Total params: 3,569,195
Trainable params: 107,895
Non-trainable params: 3,461,300
_________________________________________________________________
None


Model Training

In [27]:
epochs = 10
batch_size = 32

frames = model_lstm.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [254]:
model=load_model(f"{dir}LSTM-keras-model")
accr = model.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.941
  Accuracy: 0.708


In [94]:
# model_lstm.save(f'{dir}LSTM-keras-model')



##### Prediction

In [None]:
print(df.label.value_counts())

desc_count_frames = ['12', '4', '6', '5', '0', '10', '11', '8', '9', '2', '14', '3', '7', '13', '1']
zero_index_label = ['0','1','2','3','4','5','6','7','8','9','10','11','12','13','14']
code = ['1','2','3', '4','5','6', '7', '8', '9', '10', '11', '12', '13', '14','15']

In [255]:
from keras.models import load_model
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['annotations'].values)

def lstm_predict(text, labels=code, model=load_model(f"{dir}LSTM-keras-model")):
    seq = tokenizer.texts_to_sequences(text)
    # print(text, '\n seq',seq)
    padded = pad_sequences(seq, maxlen=128)
    pred = model.predict(padded)
    # print(pred, labels[np.argmax(pred)]
    try:
        return labels[np.argmax(pred)]
    except:
        return 'error'


In [244]:
text = ['Immigrants without HOPE need help entering college'] #9, og 10
lstm_predict(text)



'10'

In [None]:
accuracy_c=0
rang = 1000
for i,row in df.iloc[:rang].iterrows():
    y = row['label']
    pred = lstm_predict([row['annotations']])
    print(row['annotations'], '\n truth:',y, "pred:", pred)
    if (y==pred ):
        accuracy_c += 1

print(accuracy_c/rang)

Immigrants without HOPE need help entering college 
 truth: 9 pred: 10
But in the eyes of the law, he is an illegal immigrant. 
 truth: 4 pred: 6
Reaction to Tancredo, Lamm as predicted 
 truth: 14 pred: 13
That, said the congressman, is what always happens  when the opposition to something can't fashion a coherent and  logical argument. 
 truth: 12 pred: 13
$50,000 per entry 
 truth: 0 pred: 1
If they are illegal and they are working, then the companies that employ them are breaking the law and should be fined to the fullest. 
 truth: 6 pred: 7
HURTADO WINS INS APPEAL IMMIGRANT FROM EL SALVADOR HOPES 19-YEAR FIGHT TO STAY IN U.S. IS ENDING 
 truth: 4 pred: 5
Increase in Latino residents brings anti-immigration backlash 
 truth: 10 pred: 12
Mexico's Open Southern Border Lures Migrants Headed to U.S. 
 truth: 7 pred: 8
Government officials report that hundreds of immigrants -- who come from as far away as Ecuador, Somalia and China -- are assaulted and raped by bandits, robbed by corrup

### Model4 created torch.nn model

** https://towardsdatascience.com/multiclass-text-classification-using-lstm-in-pytorch-eac56baed8df

In [28]:
# from datasets import Dataset
import pandas as pd

df = pd.read_csv(f'{dir}Official/MFC_prepared.csv', usecols=["code_frames", "annotations"], header=0)
df.code_frames = df.code_frames.astype(int)

label_dict = {}
for x in range(15):
    label_dict[x+1] = x

label_dict
df['label'] = df['code_frames'].apply(lambda x: label_dict[x])
df

Unnamed: 0,code_frames,annotations,label
0,10,Immigrants without HOPE need help entering col...,9
1,5,"But in the eyes of the law, he is an illegal i...",4
2,15,"Reaction to Tancredo, Lamm as predicted",14
3,13,"That, said the congressman, is what always hap...",12
4,1,"$50,000 per entry",0
...,...,...,...
46794,11,Smoking is becoming a social taboo,10
46795,5,Nor does it aid lawyers seeking novel ways to...,4
46796,15,'Ashes to Ashes',14
46797,15,SMOKE SCREEN IS SEEN BEHIND THE SMOKELESS,14


In [29]:
df_2 = df.copy()
df_2['annotations_length'] = df_2['annotations'].apply(lambda x: len(x.split()))


In [30]:
np.mean(df_2['annotations_length'])

18.55368704459497

In [31]:
df_2= df_2.drop(['code_frames', 'annotations_length'], axis=1)
df_2

Unnamed: 0,annotations,label
0,Immigrants without HOPE need help entering col...,9
1,"But in the eyes of the law, he is an illegal i...",4
2,"Reaction to Tancredo, Lamm as predicted",14
3,"That, said the congressman, is what always hap...",12
4,"$50,000 per entry",0
...,...,...
46794,Smoking is becoming a social taboo,10
46795,Nor does it aid lawyers seeking novel ways to...,4
46796,'Ashes to Ashes',14
46797,SMOKE SCREEN IS SEEN BEHIND THE SMOKELESS,14


In [32]:
#tokenization
tok = spacy.load('en_core_web_sm')
def tokenize (text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [70]:
#count number of occurences of each word
counts = Counter()
for index, row in df_2.iterrows():
    counts.update(tokenize(row['annotations']))
    
#deleting infrequent words
print("num_words before:",len(counts.keys()))

for word in list(counts):
    if counts[word] < 3:
        del counts[word]
print("num_words after:",len(counts.keys()))


num_words before: 28683
num_words after: 13550


In [71]:
#creating vocabulary
vocab2index = {"":0, "UNK":1}
words = ["", "UNK"]
for word in counts:
    vocab2index[word] = len(words)
    words.append(word)

In [72]:
def encode_sentence(text, vocab2index, N=70):
    tokenized = tokenize(text)
    encoded = np.zeros(N, dtype=int)
    enc1 = np.array([vocab2index.get(word, vocab2index["UNK"]) for word in tokenized])
    length = min(N, len(enc1))
    encoded[:length] = enc1[:length]
    return encoded, length

In [73]:
df_2['encoded'] = df_2['annotations'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))
df_2.head()

  df_2['encoded'] = df_2['annotations'].apply(lambda x: np.array(encode_sentence(x,vocab2index )))


Unnamed: 0,annotations,label,encoded
0,Immigrants without HOPE need help entering col...,9,"[[2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,"But in the eyes of the law, he is an illegal i...",4,"[[9, 10, 11, 12, 13, 11, 14, 15, 16, 17, 18, 1..."
2,"Reaction to Tancredo, Lamm as predicted",14,"[[21, 22, 23, 15, 1, 24, 25, 0, 0, 0, 0, 0, 0,..."
3,"That, said the congressman, is what always hap...",12,"[[26, 15, 27, 11, 28, 15, 17, 29, 30, 31, 15, ..."
4,"$50,000 per entry",0,"[[42, 43, 44, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [74]:
Counter(df_2['label'])

Counter({9: 1686,
         4: 9690,
         14: 1324,
         12: 9812,
         0: 3336,
         6: 5039,
         10: 2818,
         7: 1122,
         8: 2575,
         13: 407,
         3: 1187,
         1: 254,
         5: 3413,
         2: 1486,
         11: 2650})

In [75]:
X = list(df_2['encoded'])
y = list(df_2['label'])
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=df_2['label'])

In [76]:
class customDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(self.X[idx][0].astype(np.int32)), self.y[idx], self.X[idx][1]

In [77]:
train_ds = customDataset(X_train, y_train)
valid_ds = customDataset(X_valid, y_valid)

#### train_model

In [78]:
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        # if i % 5 == 1:
        print("Epoch %d/%d: train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (i+1,epochs,sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total

In [79]:
batch_size = 5000
vocab_size = len(words)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [80]:
def load_glove_vectors(glove_file=f"{dir}glove.6B/glove.6B.50d.txt"):
    """Load the glove word vectors"""
    word_vectors = {}
    with open(glove_file, 'r', encoding='UTF-8') as f:
        for line in f:
            split = line.split()
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
    return word_vectors

In [81]:
def get_emb_matrix(pretrained, word_counts, emb_size = 50):
    """ Creates embedding matrix from word vectors"""
    vocab_size = len(word_counts) + 2
    vocab_to_idx = {}
    vocab = ["", "UNK"]
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[0] = np.zeros(emb_size, dtype='float32') # adding a vector for padding
    W[1] = np.random.uniform(-0.25, 0.25, emb_size) # adding a vector for unknown words 
    vocab_to_idx["UNK"] = 1
    i = 2
    for word in word_counts:
        if word in word_vecs:
            W[i] = word_vecs[word]
        else:
            W[i] = np.random.uniform(-0.25,0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
        i += 1   
    return W, np.array(vocab), vocab_to_idx

In [82]:
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(word_vecs, counts)

In [83]:
class LSTM_glove_vecs(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = False ## freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 15)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [84]:
model = LSTM_glove_vecs(vocab_size, 50, 50, pretrained_weights)

#### training

In [89]:
train_model(model, epochs=100, lr=0.01)

Epoch 1/100: train loss 1.387, val loss 1.293, val accuracy 0.607, and val rmse 3.277
Epoch 2/100: train loss 1.281, val loss 1.252, val accuracy 0.626, and val rmse 3.327
Epoch 3/100: train loss 1.243, val loss 1.238, val accuracy 0.627, and val rmse 3.339
Epoch 4/100: train loss 1.218, val loss 1.214, val accuracy 0.631, and val rmse 3.284
Epoch 5/100: train loss 1.195, val loss 1.200, val accuracy 0.641, and val rmse 3.224
Epoch 6/100: train loss 1.185, val loss 1.193, val accuracy 0.640, and val rmse 3.274
Epoch 7/100: train loss 1.170, val loss 1.183, val accuracy 0.643, and val rmse 3.165
Epoch 8/100: train loss 1.160, val loss 1.172, val accuracy 0.643, and val rmse 3.204
Epoch 9/100: train loss 1.149, val loss 1.169, val accuracy 0.646, and val rmse 3.175
Epoch 10/100: train loss 1.145, val loss 1.159, val accuracy 0.650, and val rmse 3.162
Epoch 11/100: train loss 1.134, val loss 1.162, val accuracy 0.648, and val rmse 3.133
Epoch 12/100: train loss 1.126, val loss 1.148, val 

In [None]:
# Epoch 30/30: train loss 1.228, val loss 1.209, val accuracy 0.631, and val rmse 3.312
# Epoch 100/100: train loss 0.858, val loss 1.031, val accuracy 0.684, and val rmse 3.101

In [90]:
torch.save(model.state_dict(), f'{dir}LSTM_model')
# https://pytorch.org/tutorials/beginner/saving_loading_models.html

In [None]:
text = 'Immigrants without HOPE need help entering college'
df_2.annotations[0], '9'

# y_pred = model(x, l)
#  text -> enncode -> ds -> dl -> predict

In [125]:
encoded_text = np.array(encode_sentence(text,vocab2index ))
x,l = encoded_text
x = torch.from_numpy(x.astype(np.int32)).long()
model(x,l)

  encoded_text = np.array(encode_sentence(text,vocab2index ))


tensor([ 0.5939,  1.5913, -2.8517, -0.6758,  1.7557, -0.7119, -1.3342, -0.8125,
        -0.2294,  4.1930,  0.0908, -0.9462, -2.0648, -0.7570, -0.0934],
       grad_fn=<AddBackward0>)

# Model GRU

In [192]:
import numpy as np 
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, GRU, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re

In [193]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 20000
# Max number of words in each complaint.
MAX_SEQUENCE_LENGTH = 128
# This is fixed.
EMBEDDING_DIM = 100

In [194]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['annotations'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 34612 unique tokens.


In [195]:
X = tokenizer.texts_to_sequences(df['annotations'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', X.shape)

Y = pd.get_dummies(df['label']).values
print('Shape of label tensor:', Y.shape)

Shape of data tensor: (46799, 128)
Shape of label tensor: (46799, 15)


In [196]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, 
                                                    test_size = 0.15, 
                                                    random_state = 42, 
                                                    stratify=df.label.values)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(39779, 128) (39779, 15)
(7020, 128) (7020, 15)


In [None]:
embeddings_index = {}
f = open(f'{dir}glove.6B/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [None]:
model_gru = Sequential()
model_gru.add(Embedding(len(word_index) + 1, 
                        EMBEDDING_DIM, 
                        weights=[embedding_matrix], 
                        input_length=X.shape[1], 
                        trainable=False))
model_gru.add(SpatialDropout1D(0.2))
model_gru.add(GRU(128, return_sequences = False))
model_gru.add(Dropout(0.2))
model_gru.add(Dense(15, activation = 'softmax'))
model_gru.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_gru.summary())

In [None]:
epochs = 10
batch_size = 32

frames_gru = model_gru.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])


In [None]:
accr = model_gru.evaluate(X_test,Y_test)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
model_gru.save(f'{dir}GRU-keras-model')