In [1]:
# !kaggle datasets download fullmetal26/glovetwitter27b100dtxt
# !unzip glovetwitter27b100dtxt.zip

In [None]:
import tensorflow.keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, SimpleRNN, Activation, Dropout, Conv1D
from tensorflow.keras.layers import Embedding, Flatten, LSTM, GRU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
import numpy as np
import spacy
from sklearn.metrics import classification_report

In [None]:
def load_glove_model(glove_file):
    """Load GloVe word vectors from a whitespace-separated text file.

    Each non-empty line is expected to hold a token followed by its vector
    components, e.g. ``hello 0.1 -0.2 ...``.

    Args:
        glove_file: Path to the GloVe text file (opened as UTF-8).

    Returns:
        dict mapping each word (str) to its embedding (list of float).

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    print("[INFO]Loading GloVe Model...")
    model = {}
    with open(glove_file, 'r', encoding="utf-8") as f:
        for line in f:
            split_line = line.split()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # token; indexing split_line[0] on them would raise IndexError.
            if not split_line:
                continue
            word = split_line[0]
            model[word] = [float(val) for val in split_line[1:]]
    print("[INFO] Done...{} words loaded!".format(len(model)))
    return model
# Load the Twitter GloVe vectors (100d, per the file name) into a {word: vector} dict.
glove_model = load_glove_model("/content/glove.twitter.27B.100d.txt")

[INFO]Loading GloVe Model...
[INFO] Done...14726 words loaded!


In [None]:
# adopted from utils.py
# Shared spaCy pipeline: remove_stopwords() and lemmatize() both call this module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

def remove_stopwords(sentence):
    '''
    Remove stop words and punctuation from a sentence.
        input: sentence - string of sentence
        output: string of the remaining tokens joined by single spaces
    '''
    kept = []
    # tokenize sentence with the shared spaCy pipeline
    for tk in nlp(sentence):
        # plain boolean logic instead of bitwise `&` on bools
        if tk.is_stop or tk.pos_ == "PUNCT":
            continue
        # Token.string no longer exists in spaCy v3; Token.text is the
        # token's verbatim text and is available in v2 and v3 alike.
        text = tk.text.strip()
        # whitespace-only tokens strip to "" and would only add extra spaces
        if text:
            kept.append(text)
    # convert back to sentence string
    return " ".join(kept)


def lemmatize(sentence):
    '''
    Replace every token of a sentence with its lemma.
        input: sentence - string of sentence
        output: the result of calling `nlp` on the lemmas, each prefixed by a space
    '''
    # build " lemma1 lemma2 ..." in one pass (leading space included)
    lemma_text = "".join(" " + token.lemma_ for token in nlp(sentence))
    # run the pipeline again on the lemmatized text
    return nlp(lemma_text)

def _embedding_dim(model, default=200):
    '''
    best-effort lookup of the vector width of an embedding model
    '''
    try:
        # dict-like models: measure the first stored vector
        return len(next(iter(model.values())))
    except (AttributeError, StopIteration, TypeError):
        # non-dict models may expose `vector_size`; otherwise keep the old default
        return getattr(model, "vector_size", default)


def sent_vectorizer(sent, model):
    '''
    sentence vectorizer using the pretrained glove model
        input: sent - string of sentence, split on whitespace
               model - mapping from word to its embedding vector
        output: numpy array holding the element-wise sum of the vectors of
                all words found in `model`; all zeros if none is found
    '''
    # Size the accumulator from the model itself. The previous fixed size of
    # 200 did not match 100-d vectors, so every np.add raised a shape error
    # that the bare `except` swallowed and the result was always all zeros.
    sent_vector = np.zeros(_embedding_dim(model))
    for w in sent.split():
        try:
            word_vector = model[w]
        except KeyError:
            # out-of-vocabulary word: skip it (the only error we expect here)
            continue
        # add up all token vectors to a sent_vector
        sent_vector = np.add(sent_vector, word_vector)
    return sent_vector

In [None]:
# !wget https://raw.githubusercontent.com/haochen23/nlp-rnn-lstm-sentiment/master/training.1600000.processed.noemoticon.csv

In [None]:
# data = pd.read_csv("training.1600000.processed.noemoticon.csv", header=None, encoding='latin-1')
data = pd.read_csv("/content/pre_processed_final.csv")
print(data.shape)
# keep only the label and text columns, then drop rows missing either value
data = data[["sentiment", "text"]].dropna()
data.head()

(28777, 3)


Unnamed: 0,sentiment,text
0,1.0,is lookin 4ward to a long weekend really dont...
1,1.0,myweakness is music and i live to meet the pe...
2,1.0,figured out the internet on my new ipod
3,1.0,cant wait to worship with you guys tonight itl...
4,1.0,congrats james im sure the book is going to b...


In [None]:
# Split the frame into model inputs (raw text) and labels.
data_X = data['text']
data_y = data['sentiment']
# One-hot encode the sentiment labels, one column per distinct value.
# The dtype is pinned because newer pandas versions default to bool columns,
# while the rest of the notebook (and its recorded output) works with uint8.
data_y = pd.get_dummies(data_y, dtype="uint8")

In [None]:
# Convert the one-hot label frame to a NumPy array; the bare name below displays it.
data_y = data_y.to_numpy()
data_y

array([[0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       ...,
       [0, 1, 0],
       [0, 1, 0],
       [0, 1, 0]], dtype=uint8)

In [None]:
# Spot-check one row: raw sentiment label next to its one-hot encoding.
# data_y is a plain array (positional), while `data` keeps its original index
# labels after dropna(), so read the label by position with .iloc to make sure
# both values come from the same row.
n = 6542
print(data['sentiment'].iloc[n], data_y[n])

-1.0 [1 0 0]


In [None]:
# Vocabulary cap handed to the Tokenizer as num_words.
max_vocab = 18000
# Sequence length used for padding and for the Embedding layer's input_length.
max_len = 15
tokenizer = Tokenizer(num_words=max_vocab)

In [None]:
# Switch to a NumPy array of raw strings for the tokenizer.
data_X = data_X.to_numpy()
print(data_X)
# Learn the vocabulary, then map each text to a list of integer word ids.
tokenizer.fit_on_texts(data_X)
sequences = tokenizer.texts_to_sequences(data_X)
# word -> id mapping learned by fit_on_texts.
word_index = tokenizer.word_index


['is lookin 4ward to a long weekend  really dont want to go to work 2day tho  x'
 'myweakness  is music and i live to meet the people who make it'
 'figured out the internet on my new ipod' ...
 'speaking of chris jericho it is wednesday today is it not time for some catching up on wrestling'
 'im so jealous of everyone at the justin bieber concert no worries tho ill be seeing maroon 5 in march  roadtrippin'
 'trending tim tebow is now dating cave woman from 10000 bc 1st date question what was it like in a time before jesus  tebowmania']


In [None]:
# Report how many distinct tokens the tokenizer recorded in word_index.
print('Found %s unique tokens.' % len(word_index))

Found 30651 unique tokens.


In [None]:
# Bring every sequence to exactly max_len ids; padding="post" appends the padding at the end.
data_keras = pad_sequences(sequences, maxlen=max_len, padding="post")
# data_keras = data_keras.reshape(data_X.shape[0], 15, 1)
data_keras.shape



(28398, 15)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for validation; random_state=42 makes the split repeatable.
train_X, valid_X, train_y, valid_y = train_test_split(data_keras, data_y, test_size = 0.3, random_state=42)

In [None]:
# adopted from sent_tran_eval.py
def build_model(nb_words, rnn_model="SimpleRNN", embedding_matrix=None):
    '''
    build_model function:
    builds and compiles a 3-class classifier: Embedding -> RNN(200) -> softmax
    inputs:
        nb_words - vocabulary size, i.e. number of rows of the embedding table
        rnn_model - which type of RNN layer to use, choose in (SimpleRNN, LSTM, GRU);
                    any other value falls back to GRU
        embedding_matrix - pretrained embedding weights with one row per word id,
                    or None to learn 100-dimensional embeddings from scratch
    returns:
        the compiled Sequential model
    note:
        the input sequence length is read from the module-level `max_len`
    '''
    model = Sequential()
    # add an embedding layer
    if embedding_matrix is not None:
        # take the vector width from the matrix itself instead of assuming 100
        embedding_dim = np.asarray(embedding_matrix).shape[1]
        # pretrained vectors stay frozen
        model.add(Embedding(nb_words,
                        embedding_dim,
                        weights=[embedding_matrix],
                        input_length= max_len,
                        trainable = False))
    else:
        # no pretrained weights: the randomly initialised table has to be
        # trainable, otherwise the model is stuck with random embeddings
        model.add(Embedding(nb_words,
                        100,
                        input_length= max_len,
                        trainable = True))

    # add an RNN layer according to rnn_model (GRU is the fallback)
    rnn_layers = {"SimpleRNN": SimpleRNN, "LSTM": LSTM}
    model.add(rnn_layers.get(rnn_model, GRU)(200))
    # one softmax unit per sentiment class
    model.add(Dense(3, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
    return model

In [None]:
# nb_words and embedding_matrix are needed by the training cells but are not
# defined in any visible cell of this notebook, so a fresh "Restart & Run All"
# would stop here with a NameError. Build them explicitly from the tokenizer
# vocabulary and the loaded GloVe vectors.
# The Tokenizer only emits ids below max_vocab; id 0 is left for padding.
nb_words = min(max_vocab, len(word_index) + 1)
embedding_dim = len(next(iter(glove_model.values())))
embedding_matrix = np.zeros((nb_words, embedding_dim))
for word, idx in word_index.items():
    vector = glove_model.get(word)
    # words without a GloVe vector keep an all-zero row
    if idx < nb_words and vector is not None:
        embedding_matrix[idx] = vector

model_rnn = build_model(nb_words, "SimpleRNN", embedding_matrix)
# Stop once val_accuracy has not improved for 3 consecutive epochs.
# `callbacks` is documented as a list of callback instances.
model_rnn.fit(train_X, train_y, epochs=20, batch_size=60,
          validation_data=(valid_X, valid_y),
          callbacks=[EarlyStopping(monitor='val_accuracy', mode='max', patience=3)])
predictions = model_rnn.predict(valid_X)
# turn class probabilities / one-hot rows into class indices for the report
predictions = predictions.argmax(axis=1)
print(classification_report(valid_y.argmax(axis=1), predictions))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
              precision    recall  f1-score   support

           0       0.62      0.69      0.65      2713
           1       0.58      0.59      0.58      2676
           2       0.68      0.61      0.64      3131

    accuracy                           0.63      8520
   macro avg       0.63      0.63      0.63      8520
weighted avg       0.63      0.63      0.63      8520



In [None]:
# Same training and evaluation steps as the SimpleRNN cell, with an LSTM recurrent layer.
model_lstm = build_model(nb_words, "LSTM", embedding_matrix)
# EarlyStopping watches val_accuracy (mode='max') with a patience of 3 epochs.
model_lstm.fit(train_X, train_y, epochs=20, batch_size=60,
          validation_data=(valid_X, valid_y), callbacks=EarlyStopping(monitor='val_accuracy', mode='max',patience=3))
predictions = model_lstm.predict(valid_X)
# argmax turns both the predicted probabilities and the one-hot labels into class indices.
predictions = predictions.argmax(axis=1)
print(classification_report(valid_y.argmax(axis=1), predictions))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
              precision    recall  f1-score   support

           0       0.71      0.65      0.68      2713
           1       0.63      0.59      0.61      2676
           2       0.65      0.74      0.69      3131

    accuracy                           0.66      8520
   macro avg       0.66      0.66      0.66      8520
weighted avg       0.66      0.66      0.66      8520



In [None]:
# Same training and evaluation steps as the other RNN cells, with a GRU recurrent layer.
model_gru = build_model(nb_words, "GRU", embedding_matrix)
# 20, 120  (presumably an earlier epochs / batch_size setting - TODO confirm)
# This run uses epochs=10 and batch_size=100, unlike the 20 / 60 used for the other two models.
model_gru.fit(train_X, train_y, epochs=10, batch_size=100,
          validation_data=(valid_X, valid_y), callbacks=EarlyStopping(monitor='val_accuracy', mode='max',patience=3))
predictions = model_gru.predict(valid_X)
# argmax turns both the predicted probabilities and the one-hot labels into class indices.
predictions = predictions.argmax(axis=1)
print(classification_report(valid_y.argmax(axis=1), predictions))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
              precision    recall  f1-score   support

           0       0.70      0.72      0.71      2713
           1       0.64      0.59      0.62      2676
           2       0.70      0.72      0.71      3131

    accuracy                           0.68      8520
   macro avg       0.68      0.68      0.68      8520
weighted avg       0.68      0.68      0.68      8520



In [None]:
# Persist the trained GRU model. Note the file is named "rnn.h5" although it holds model_gru.
model_gru.save("rnn.h5")