## LSTM Model

- This notebook covers the text preprocessing for, and the architecture of, an LSTM model.
- The first part is the text preprocessing; the second part builds the LSTM model.
- The steps for building the LSTM model:
  1. `Split data`         : Split the data into training and testing datasets.
  2. `Convert to tensors` : Convert the data into tensors, load them into Tensor datasets, and separate the datasets into mini-batches.
  3. `Build the model`    : Initialize the input size, output size, embedding dimension, hidden dimension, and dropout layer.
  4. `Train the model`    : Compute the forward pass and the backward pass.
  5. `Test the model`     : Run the test data through the model to get the predicted output; compute the accuracy rate.
  6. `Predict`            : Tokenize the user's input sentence and feed it into the model; get the predicted result (Popular or Not Popular).
 

In [None]:
import pandas as pd
import numpy as np

# Load the Reddit data set and name its two data columns 'Titles' and 'Comments'.
# NOTE(review): header=None treats the first line of the file as data - confirm the CSV has no header row.
df_data=pd.read_csv("AMS_Data_200501.csv", header = None, names = ['Titles', 'Comments'])
df_data

Unnamed: 0,Titles,Comments
0,the justice department filed court documents i...,567.0
1,game thread oregon vs utah 8 00pm et,1300.0
2,homemade honey lemon almond pull apart cake,20.9
3,image trying is better than not trying,13.7
4,serious lawyers what's a case you regretted...,810.0
...,...,...
1161,🔥 jaguar eating underwater,31.5
1162,🔥 monitor mistakes a garden hose for a snake 🔥,25.2
1163,🔥 one in a million shot,52.6
1164,🔥 red eyed crocodile skink 🔥,24.0


In [None]:
# Remove every run of digits from the titles.
# The pattern is a raw string and regex=True is passed explicitly: pandas >= 2.0 treats the
# pattern as a literal string by default, which would leave the digits in place.
df_data["Titles"] = df_data['Titles'].str.replace(r'\d+', '', regex=True)
df_data

Unnamed: 0,Titles,Comments
0,the justice department filed court documents i...,567.0
1,game thread oregon vs utah pm et,1300.0
2,homemade honey lemon almond pull apart cake,20.9
3,image trying is better than not trying,13.7
4,serious lawyers what's a case you regretted...,810.0
...,...,...
1161,🔥 jaguar eating underwater,31.5
1162,🔥 monitor mistakes a garden hose for a snake 🔥,25.2
1163,🔥 one in a million shot,52.6
1164,🔥 red eyed crocodile skink 🔥,24.0


In [None]:
# Order the rows from the lowest to the highest "Comments" value.
df_data = df_data.sort_values(by="Comments")
df_data

Unnamed: 0,Titles,Comments
44,suspected campaign from russia on reddit,0.3
47,tom holland’s drunk crying phone call with di...,1.6
679,ohio man dies from covid after criticizing go...,4.4
853,thanks i hate meatballs,8.5
1115,when the uk government use the same font as ca...,9.1
...,...,...
379,giveaway for a nintendo switch lite and your c...,3420.0
609,megathread senator bernie sanders endorses jo...,3450.0
66,giveaway xbox gift card,3850.0
1088,what is a movie you find terrible but critics ...,4000.0


In [None]:
# Collect every Reddit title, in the current row order, into a plain Python list.
reddit = list(df_data['Titles'])

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# define a function for removing stopwords from text
def remove_stops(sentence, stops=None):
    '''Split a string on whitespace and return its lowercased non-stopword tokens.

    Parameters
    ----------
    sentence : str
        Text to split on whitespace.
    stops : collection of str, optional
        Lowercase words to discard. When omitted, the NLTK English stopword
        list is used.

    Returns
    -------
    list of str
        The lowercased words of ``sentence`` that are not in ``stops``.
    '''
    if stops is None:
        stops = set(stopwords.words('english'))
    # Lowercase BEFORE the membership test: the stopword list is lowercase, so
    # checking the raw token would let capitalized stopwords such as "The" through.
    lowered = (w.lower() for w in sentence.split())
    return [w for w in lowered if w not in stops]

In [None]:
# Tokenize the de-duplicated titles, joined into one space-separated string.
tokens = word_tokenize(' '.join(set(df_data['Titles'])))
tokens[:10]

['the', 'best', 'selfie', 'doesnt', 'exi', '“', 'there', 'is', 'a', 'new']

In [None]:
# Keep only alphabetic tokens, lowercase them, and drop English stopwords.
# The stopword test is done on the lowercased token because the NLTK list is lowercase;
# testing the raw token would let a capitalized stopword such as "The" slip through.
# (decided not to use remove_stops() function here)
stops = set(stopwords.words('english'))
words = [w.lower() for w in tokens if w.isalpha() and w.lower() not in stops]
words[:10]

['best',
 'selfie',
 'doesnt',
 'exi',
 'new',
 'restaurant',
 'called',
 'karma',
 'menu',
 'get']

In [None]:
# Count how often each cleaned word occurs across the Reddit titles.
from collections import Counter
counts=Counter(words)   
counts

Counter({'best': 17,
         'selfie': 1,
         'doesnt': 3,
         'exi': 1,
         'new': 23,
         'restaurant': 3,
         'called': 6,
         'karma': 1,
         'menu': 1,
         'get': 24,
         'deserve': 2,
         'getting': 12,
         'remember': 2,
         'monopoly': 1,
         'man': 18,
         'trolled': 1,
         'google': 1,
         'ceo': 4,
         'live': 11,
         'hearing': 1,
         'back': 20,
         'movie': 12,
         'titanic': 1,
         'shows': 8,
         'life': 22,
         'chief': 2,
         'baker': 1,
         'charles': 1,
         'joughin': 1,
         'throughout': 1,
         'little': 13,
         'snake': 2,
         'giving': 6,
         'big': 8,
         'yawn': 1,
         'retiring': 1,
         'frocket': 1,
         'sen': 2,
         'ted': 2,
         'cruz': 1,
         'set': 4,
         'introduce': 3,
         'bill': 3,
         'barring': 1,
         'pentagon': 2,
         'working': 9

In [None]:
# Unique words ordered from most to least frequent (ties keep their first-seen order).
vocab=[word for word,_ in counts.most_common()]
len(vocab)

4309

In [None]:
import os
import numpy as np
# Load the pretrained GloVe Twitter word vectors into a {word: vector} dictionary.
# The file location can be overridden with the GLOVE_PATH environment variable so the
# notebook is not tied to one machine; the original absolute path remains the default.
glove_path = os.environ.get('GLOVE_PATH', '/Users/karinahu/Desktop/glove.twitter.27B/glove.twitter.27B.25d.txt')
embeddings_dict = {}
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # skip blank lines instead of failing on values[0]
            continue
        # first field is the word, the remaining fields are its vector components
        word = values[0]
        vector = np.asarray(values[1:], 'float32')
        embeddings_dict[word] = vector
print(list(embeddings_dict)[:10])

['<user>', '.', ':', 'rt', ',', '<repeat>', '<hashtag>', '<number>', '<url>', '!']


In [None]:
# Split the vocabulary into the words GloVe does not know (missing_word) and a
# {word: vector} lookup for the words it does know (title_words_vectors).
missing_word=list(filter(lambda w: w not in embeddings_dict, vocab))
title_words_vectors=dict((w,embeddings_dict[w]) for w in vocab if w in embeddings_dict)

print(title_words_vectors.values())

dict_values([array([-0.56174 ,  0.69419 ,  0.16733 ,  0.055867, -0.26266 , -0.6303  ,
       -0.28311 , -0.88244 ,  0.57317 , -0.82376 ,  0.46728 ,  0.48607 ,
       -2.1942  , -0.41972 ,  0.31795 , -0.70063 ,  0.060693,  0.45279 ,
        0.6564  ,  0.20738 ,  0.84496 , -0.087537, -0.38856 , -0.97028 ,
       -0.40427 ], dtype=float32), array([ 0.23221 ,  0.19274 , -0.20191 ,  0.55183 ,  0.22028 , -1.733   ,
       -0.5179  , -0.48583 ,  0.99621 , -0.61992 ,  1.3618  ,  0.51214 ,
       -0.27673 , -0.26378 ,  0.9227  ,  1.0396  ,  0.44228 ,  0.83579 ,
        1.3481  ,  0.55633 , -0.64568 ,  0.25664 , -0.19045 , -1.9823  ,
       -0.043051], dtype=float32), array([ 0.39657 ,  0.15653 ,  0.50676 , -0.039995, -0.1177  , -0.011625,
        1.7677  ,  0.33504 , -0.84748 , -0.27969 ,  0.036325, -0.146   ,
       -5.2788  ,  0.053348, -0.60437 ,  0.26285 ,  0.15334 , -0.31598 ,
       -0.18437 , -0.21645 , -0.095925, -0.07569 ,  0.18185 , -0.18519 ,
       -0.33499 ], dtype=float32), array(

In [None]:
# Build two parallel lists over the vocabulary words that have a GloVe vector:
# word_vector holds each word's vector as a plain Python list, and
# nonrepeated_word holds the matching word at the same index.
word_vector=[list(embeddings_dict[w]) for w in vocab if w in embeddings_dict.keys()]
nonrepeated_word=[str(w) for w in vocab if w in embeddings_dict.keys()]
word_vector

[[-0.56174,
  0.69419,
  0.16733,
  0.055867,
  -0.26266,
  -0.6303,
  -0.28311,
  -0.88244,
  0.57317,
  -0.82376,
  0.46728,
  0.48607,
  -2.1942,
  -0.41972,
  0.31795,
  -0.70063,
  0.060693,
  0.45279,
  0.6564,
  0.20738,
  0.84496,
  -0.087537,
  -0.38856,
  -0.97028,
  -0.40427],
 [0.23221,
  0.19274,
  -0.20191,
  0.55183,
  0.22028,
  -1.733,
  -0.5179,
  -0.48583,
  0.99621,
  -0.61992,
  1.3618,
  0.51214,
  -0.27673,
  -0.26378,
  0.9227,
  1.0396,
  0.44228,
  0.83579,
  1.3481,
  0.55633,
  -0.64568,
  0.25664,
  -0.19045,
  -1.9823,
  -0.043051],
 [0.39657,
  0.15653,
  0.50676,
  -0.039995,
  -0.1177,
  -0.011625,
  1.7677,
  0.33504,
  -0.84748,
  -0.27969,
  0.036325,
  -0.146,
  -5.2788,
  0.053348,
  -0.60437,
  0.26285,
  0.15334,
  -0.31598,
  -0.18437,
  -0.21645,
  -0.095925,
  -0.07569,
  0.18185,
  -0.18519,
  -0.33499],
 [0.61531,
  0.40895,
  -0.34224,
  0.21094,
  -0.046403,
  0.29744,
  1.9485,
  -0.91247,
  -0.85772,
  0.57293,
  -0.99537,
  -0.15858,
  

In [None]:
# Use PCA to project every word vector onto its first principal component,
# so that each word is represented by a single number (n_components=1).
from sklearn.decomposition import PCA
pca=PCA(n_components=1)
dim=pca.fit_transform(word_vector)
dim

array([[ 1.05129676],
       [ 3.11819939],
       [-3.43426978],
       ...,
       [-0.11107866],
       [ 2.97117368],
       [-0.52591503]])

In [None]:
# Standardize the one-dimensional PCA values with StandardScaler.
from sklearn.preprocessing import StandardScaler
sc_d = StandardScaler()
dim_std = sc_d.fit_transform(dim)
dim_std

array([[ 0.69276211],
       [ 2.05476747],
       [-2.26304509],
       ...,
       [-0.07319635],
       [ 1.95788347],
       [-0.34655676]])

In [None]:
# Map every known word to the absolute value of its standardized PCA coordinate.
word_vec_dim={word:abs(row[0]) for word,row in zip(nonrepeated_word,dim_std)}
word_vec_dim

{'trump': 0.6927621071303286,
 'coronavirus': 2.0547674676477854,
 'one': 2.2630450909656603,
 'people': 2.0043453012597063,
 'til': 0.8862315583749114,
 'like': 2.3467942713813987,
 'us': 1.7689377587950792,
 'would': 1.9996965297761673,
 'time': 2.1814583107838996,
 'day': 2.237960296986458,
 'game': 1.767334575926647,
 'years': 1.5278592131473758,
 'world': 1.6440038938677686,
 'says': 1.0396454003868287,
 'today': 1.9959907196208806,
 'get': 2.3788757613078535,
 'first': 1.861752768544571,
 'new': 1.9047207020073638,
 'life': 2.040144998112184,
 'made': 1.6859389394942736,
 'president': 0.3667127447386324,
 'back': 2.2381017986239287,
 'found': 1.1934101655611127,
 'post': 1.2975965096230317,
 'every': 1.858118492444981,
 'workers': 0.3096374315611073,
 'make': 2.096026131154261,
 'old': 1.6977735061180037,
 'good': 2.4099133202425627,
 'man': 1.8749402623737215,
 'home': 1.838368868559868,
 'best': 2.1521678902446517,
 'quarantine': 1.5518606702867013,
 'u': 2.348031400773667,
 'm

In [None]:
# Replace each title with the list of stored values for its known words;
# words that are not in word_vec_dim are skipped.
reddit_int=[[word_vec_dim[word] for word in title.split() if word in word_vec_dim]
            for title in reddit]
print(reddit_int)

[[0.9587954655390687, 0.07689902833583434, 0.06637267526825405, 0.3981091238457368], [0.8890903202710757, 1.3842801179302242, 1.526292010907195, 1.8694841840864733, 1.8020437093702826, 0.7225799406468187, 0.2117916471359396, 0.6635237684901899, 0.0036787834752544926, 1.8749402623737215, 1.7986493313034013, 0.38828826442727965], [0.17382624218613157, 1.8749402623737215, 0.1548637262900951, 0.8144021169017238, 2.0547674676477854, 0.6501527166895035], [1.9762642134896427, 1.8846745270489358, 0.7101394954758642], [0.734528928039614, 0.06480890386051542, 1.402367711339255, 0.38042084929482645, 0.6181985321429622, 0.18834992136779347], [0.1573926717178852, 0.06480890386051542, 0.508363513101793, 0.49549834001015486, 0.504428127140697, 1.7114245890554036, 0.6459663705667872, 0.1573926717178852, 0.45259995837620415, 2.096026131154261, 0.8103880200010602, 0.737268879908655, 0.5081391471484477, 0.31769335918580166, 1.6909845551997793], [0.5569358581504588, 1.3433963146717398], [1.898774634384874

In [None]:
# Keep only the titles that have more than one known-word value, so that no empty
# sequence is fed into the LSTM model.
# Note: `len(i)>1` also discards titles with exactly one known word, not only empty ones.
# `reddit` is rebound here: it previously held the title strings and now holds the
# filtered numeric sequences.
reddit=[]
for i in reddit_int:
    if len(i)>1:
        reddit.append(i)
print(len(reddit))

1097


In [None]:
# Create the label array: start from all ones, then set the first 500 entries to 0.
# Labels are purely positional. `reddit` keeps the row order produced by sorting on
# ascending "Comments", so the first 500 kept titles get label 0 and the rest label 1.
import numpy as np
encoded_labels=np.ones(len(reddit),dtype=float)
for i in range(500):
    encoded_labels[i]=encoded_labels[i]-1
encoded_labels

array([0., 0., 0., ..., 1., 1., 1.])

In [None]:
# Fix the input length: every title is represented by exactly seq_length (25) values.
# Start from an all-zero matrix with one row per title; untouched zeros act as padding.
seq_length=25
features = np.zeros((len(reddit),seq_length),dtype=float)
features

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# Copy each title's values into the right-hand end of its row, so shorter titles are
# left-padded with zeros. A title longer than seq_length keeps only its first
# seq_length values.
for i in range(len(reddit)):
    features[i,-len(reddit[i]):]=np.array(reddit[i])[:seq_length]
features

array([[0.        , 0.        , 0.        , ..., 0.07689903, 0.06637268,
        0.39810912],
       [0.        , 0.        , 0.        , ..., 1.87494026, 1.79864933,
        0.38828826],
       [0.        , 0.        , 0.        , ..., 0.81440212, 2.05476747,
        0.65015272],
       ...,
       [0.        , 0.        , 0.        , ..., 0.72938011, 1.07375167,
        0.90447066],
       [0.        , 0.        , 0.        , ..., 0.48842446, 1.2379662 ,
        2.49503696],
       [0.        , 0.        , 0.        , ..., 1.24199342, 1.76893776,
        2.10541291]])

In [None]:
# Number of rows assigned to the training set: 60% of all rows (rounded down);
# the remaining rows form the test set.
split_frac=0.6
split_idx=int(len(features)*split_frac)
split_idx

658

In [None]:
# Shuffle the rows before splitting.
# The rows are ordered by ascending "Comments" and the labels are positional (the first
# 500 rows are 0, the rest 1), so a contiguous split would put every label-0 sample in
# the training set and leave a test set that contains only label 1.
# A fixed seed keeps the split reproducible across runs.
shuffled_idx=np.random.RandomState(0).permutation(len(features))
train_idx, test_idx=shuffled_idx[:split_idx], shuffled_idx[split_idx:]
train_x, test_x=features[train_idx], features[test_idx]
train_y, test_y=encoded_labels[train_idx], encoded_labels[test_idx]
print(train_x)
print(test_x)

[[0.         0.         0.         ... 0.07689903 0.06637268 0.39810912]
 [0.         0.         0.         ... 1.87494026 1.79864933 0.38828826]
 [0.         0.         0.         ... 0.81440212 2.05476747 0.65015272]
 ...
 [0.         0.         0.         ... 0.50795842 1.18887166 0.59441293]
 [0.         0.         0.         ... 0.8048108  2.09602613 2.14170973]
 [0.         0.         0.         ... 1.68022058 1.51292214 0.86832473]]
[[0.         0.         0.         ... 1.40016772 2.40991332 0.61300898]
 [0.         0.         0.         ... 0.         1.72475576 0.15569561]
 [0.         0.         0.         ... 1.66872927 1.83836887 0.04033255]
 ...
 [0.         0.         0.         ... 0.72938011 1.07375167 0.90447066]
 [0.         0.         0.         ... 0.48842446 1.2379662  2.49503696]
 [0.         0.         0.         ... 1.24199342 1.76893776 2.10541291]]


## DataLoader

In [None]:
import torch
from torch.utils.data import TensorDataset,DataLoader

In [None]:
# Wrap the NumPy feature and label arrays as tensors and pair them in TensorDatasets.
# The last line displays the shape of a single input sample.
train_data=TensorDataset(torch.from_numpy(train_x),torch.from_numpy(train_y))
test_data=TensorDataset(torch.from_numpy(test_x),torch.from_numpy(test_y))
train_data[0][0].shape

torch.Size([25])

In [None]:
# number of samples per mini-batch
batch_size=50

In [None]:
# Build shuffled mini-batch loaders for the training and the test data.
# drop_last=True discards a final batch that is smaller than batch_size, so every
# batch yielded by these loaders has exactly batch_size rows.
train_loader=DataLoader(train_data,shuffle=True,batch_size=batch_size,drop_last=True)
test_loader=DataLoader(test_data,shuffle=True,batch_size=batch_size,drop_last=True)

## LSTM Model

In [None]:
import torch.nn as nn

In [None]:
class PopularityRNN(nn.Module):
    """Binary classifier: embedding -> stacked LSTM -> dropout -> linear -> sigmoid.

    Parameters
    ----------
    input_size : int
        Number of rows in the embedding table; inputs must index into [0, input_size).
    output_size : int
        Number of units of the final linear layer.
    embedding_dim : int
        Size of each embedding vector.
    hidden_dim : int
        Size of the LSTM hidden state.
    n_layers : int
        Number of stacked LSTM layers.
    drop_prob : float, optional
        Dropout applied between LSTM layers (default 0.5).
    """

    def __init__(self,input_size,output_size,embedding_dim,hidden_dim,n_layers,drop_prob=0.5):
        super(PopularityRNN, self).__init__()

        self.output_size=output_size
        self.n_layers=n_layers
        self.hidden_dim=hidden_dim

        #embedding and LSTM layers (batch_first: inputs are (batch, seq, feature))
        self.embedding=nn.Embedding(input_size,embedding_dim)
        self.lstm=nn.LSTM(embedding_dim,hidden_dim,n_layers,dropout=drop_prob,batch_first=True)

        #dropout layer applied to the LSTM outputs
        self.dropout=nn.Dropout(0.3)

        #linear and sigmoid layers
        self.fc=nn.Linear(hidden_dim,output_size)
        self.sig=nn.Sigmoid()

    def forward(self, x, hidden):
        """Run one forward pass.

        Parameters
        ----------
        x : Tensor of shape (batch, seq_len)
            Cast to integer indices with ``.long()`` before the embedding lookup,
            so fractional values are truncated.
        hidden : tuple of Tensor
            (hidden state, cell state) as produced by ``init_hidden``.

        Returns
        -------
        (Tensor, tuple)
            The last column of the per-row sigmoid outputs (shape ``(batch,)``;
            for ``output_size == 1`` this is the final time step) and the updated
            hidden state.
        """
        # Only the batch dimension is needed here; x.size() would return the full
        # torch.Size, which is not a valid single dimension for view() below.
        batch_size=x.size(0)

        #embed the input data and feed it into the LSTM
        x=x.long()
        embeds=self.embedding(x)
        lstm_out,hidden=self.lstm(embeds,hidden)
        # view() requires the tensor to be contiguous in memory
        lstm_out=lstm_out.contiguous().view(-1,self.hidden_dim)

        #dropout and fully connected layer
        out=self.dropout(lstm_out)
        out=self.fc(out)
        #sigmoid function
        sig_out=self.sig(out)

        #reshape to (batch_size, everything else) and keep the last value per row
        sig_out=sig_out.view(batch_size,-1)
        sig_out=sig_out[:,-1]

        #return last sigmoid output and hidden state
        return sig_out, hidden

    def init_hidden(self,batch_size):
        """Return zero-filled (hidden state, cell state) tensors.

        Each tensor has shape (n_layers, batch_size, hidden_dim) and shares the
        dtype and device of the model parameters.
        """
        weight=next(self.parameters()).data

        hidden=(weight.new_zeros(self.n_layers,batch_size,self.hidden_dim),
                weight.new_zeros(self.n_layers,batch_size,self.hidden_dim))

        return hidden
        

## Train

In [None]:
# Model hyperparameters and model instantiation.
input_size=len(word_vec_dim)+1  # number of known words plus one
output_size=1                   # a single sigmoid output
embedding_dim=400
hidden_dim=256
n_layers=2
net=PopularityRNN(input_size,output_size,embedding_dim,hidden_dim,n_layers)

In [None]:
# loss and optimization functions
lr=0.001  # learning rate passed to Adam
criterion=nn.BCELoss() #binary cross entropy between target and output
optimizer=torch.optim.Adam(net.parameters(),lr=lr)

In [None]:
#training parameters
epochs=2  # loop over the dataset 2 times

# switch the network to training mode
net.train()
# train some number of epochs
for e in range(epochs):
    
    # start every epoch from a zero hidden state sized for one full batch
    h=net.init_hidden(batch_size)
    #batch loop
    for inputs,labels in train_loader:
        # Re-wrap the hidden/cell state via .data so the state carried over from the
        # previous batch is reused as plain values.
        h=tuple([each.data for each in h])
        net.zero_grad() #clears old gradients from the last step
        
        #Backpropagation
        # calculate the forward pass
        output,h=net(inputs, h)
        #calculate the backward pass
        loss=criterion(output.squeeze(),labels.float())
        loss.backward()

        # update the weights from the gradients just computed
        optimizer.step() 

## Test

In [None]:
accuracy_list=[]
num_correct=0
# Count the samples that are actually evaluated: test_loader uses drop_last=True, so
# dividing by len(test_loader.dataset) would count skipped samples as wrong.
num_evaluated=0

#initialize hidden state
h=net.init_hidden(batch_size)

net.eval()
# no gradients are needed for evaluation
with torch.no_grad():
    #extract the data from test_loader
    for inputs,labels in test_loader:

        #re-wrap the hidden state and cell state carried over from the previous batch
        h=tuple([each.data for each in h])

        #get predicted outputs (0 or 1) using a 0.2 decision threshold
        output,h=net(inputs,h)
        pred=(output.reshape(-1)>0.2).float()

        #calculate accuracy
        correct_tensor=pred.eq(labels.float().view_as(pred))
        num_correct+=int(correct_tensor.sum().item())
        num_evaluated+=labels.size(0)

print("accuracy rate:",num_correct/num_evaluated if num_evaluated else float("nan"))

accuracy rate: 0.7038724373576309


## Predict

In [None]:
from string import punctuation

def tokenize_titles(test_title):
    """Convert one title into a single-element list holding its known-word values.

    The title is lowercased and split on whitespace. Each word found in
    ``word_vec_dim`` is replaced by its stored value; unknown words are skipped.
    """
    known_values=[]
    for token in test_title.lower().split():
        if token in word_vec_dim:
            known_values.append(word_vec_dim[token])

    # wrap in an outer list so the result looks like a batch of one title
    return [known_values]

In [None]:
def pad_features(titles_ints,seq_length):
    """Left-pad (or truncate) each sequence to ``seq_length`` values.

    Parameters
    ----------
    titles_ints : list of list of float
        One sequence of values per title.
    seq_length : int
        Number of columns in the result.

    Returns
    -------
    numpy.ndarray of shape (len(titles_ints), seq_length)
        Each row ends with the sequence's values and starts with zero padding.
        A sequence longer than ``seq_length`` keeps its first ``seq_length``
        values; an empty sequence yields an all-zero row.
    """
    features=np.zeros((len(titles_ints),seq_length),dtype=float)

    #fill the sentence (word vectors) into the array
    for i,row in enumerate(titles_ints):
        kept=np.asarray(row,dtype=float)[:seq_length]
        # Skip empty rows: `-0:` would select the whole row and the assignment of an
        # empty array would raise a broadcast error.
        if kept.size:
            features[i,-kept.size:]=kept

    return features

In [None]:
def predict(net,test_title,sequence_length=25):
    """Print "Popular" or "Not Popular" for a single title.

    The title is converted with ``tokenize_titles``, padded to ``sequence_length``
    with ``pad_features``, and passed through ``net`` in evaluation mode. The
    output is labelled 1 when it exceeds 0.2, otherwise 0.
    """
    net.eval()

    #tokenize the title and pad it into a one-row feature array
    padded=pad_features(tokenize_titles(test_title),sequence_length)

    #convert to a tensor and size the hidden state from its batch dimension
    feature_tensor=torch.from_numpy(padded)
    hidden=net.init_hidden(feature_tensor.size(0))

    #get the output from the LSTM model
    output,hidden=net(feature_tensor,hidden)

    #threshold the output into a 0/1 prediction
    votes=torch.FloatTensor([1 if score>0.2 else 0 for score in output])

    #print the result
    verdict="Popular" if votes.item()==1 else "Not Popular"
    print(verdict)


In [None]:
# Sample title: run it through the trained model and print the predicted class.
test_title="final giveaway for twelve more nintendo switch"
seq_length=25

predict(net,test_title,seq_length)

Popular


In [None]:
# Sample title (a short fragment): run it through the trained model and print the predicted class.
test_title="man dies from"
seq_length=25

predict(net,test_title,seq_length)

Popular


In [None]:
# Sample title (a question): run it through the trained model and print the predicted class.
test_title="what fictional character do you absolutely hate"
seq_length=25

predict(net,test_title,seq_length)

Not Popular


In [None]:
# Sample title: run it through the trained model and print the predicted class.
# The string ends with a tab character, which the whitespace split in tokenize_titles ignores.
test_title="homemade honey lemon almond pull apart cake	"
seq_length=25

predict(net,test_title,seq_length)

Not Popular
