In [1]:
import numpy as np # math calcuations and other matrix, vector processing
import pandas as pd # dataframe organization (similar to Excel)
import seaborn as sns # for plotting
import matplotlib.pyplot as plt # for plotting
from sklearn.preprocessing import StandardScaler #replace by MinMaxScaler
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras import layers
import random
import re
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding # embedding=dense=fully connected layer
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import RNN, GRU, LSTM
from tensorflow.keras import optimizers
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Read the training and test tweet tables from the working directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('twitter_train.csv', 'twitter_test.csv')
)

In [20]:
def preprocess_sentence(s):
    """Normalise a piece of text before it is split into words.

    The text is lower-cased, then every character that is neither a word
    character (letter, digit or underscore) nor whitespace is deleted.

    Args:
        s: the raw text as a ``str``.

    Returns:
        The cleaned ``str``.
    """
    return re.sub(r"[^\w\s]", '', s.lower())
def get_vocabulary_size(tweets):
    """Count the distinct words that occur in a collection of tweets.

    Each tweet is cleaned with ``preprocess_sentence`` and split on
    whitespace before its words are counted.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        A pair ``(n_unique, ranked)``: ``ranked`` is a list of
        ``(word, count)`` tuples ordered from most to least frequent, and
        ``n_unique`` is the length of that list.
    """
    frequencies = Counter(
        token
        for tweet in tweets
        for token in preprocess_sentence(tweet).split()
    )
    ranked = frequencies.most_common()
    return len(ranked), ranked
def tokenize(tweets):
    """Convert tweets into lists of integer word ids.

    Every tweet is cleaned with ``preprocess_sentence``, split on whitespace,
    and each word is looked up in the notebook-level ``word2int`` dictionary,
    which therefore has to exist before this function is called.

    Args:
        tweets: iterable of tweet strings.

    Returns:
        A list holding one list of ids per tweet, in input order.

    Raises:
        KeyError: if a word is not a key of ``word2int``.
    """
    return [
        [word2int[token] for token in preprocess_sentence(tweet).split()]
        for tweet in tweets
    ]
def padding(int_tweets, maximum_length=200):
    """Right-pad with zeros, or truncate, every id sequence to a fixed length.

    Args:
        int_tweets: sequence of id sequences; each row may be a list, a tuple
            or a 1-D array of integers.
        maximum_length: number of columns in the result.

    Returns:
        Integer ``numpy.ndarray`` of shape ``(len(int_tweets), maximum_length)``.
        Row ``i`` holds the first ``maximum_length`` ids of ``int_tweets[i]``
        followed by zeros.
    """
    padded_tweets = np.zeros((len(int_tweets), maximum_length), dtype=int)
    for i, int_tweet in enumerate(int_tweets):
        kept = min(len(int_tweet), maximum_length)
        if kept:
            # The array is already zero-filled, so only the leading ids are
            # written. Slice assignment (instead of concatenating a list of
            # zeros with ``+``) also works for tuple and ndarray rows, where
            # ``+`` would raise or broadcast-add.
            padded_tweets[i, :kept] = int_tweet[:kept]
    return padded_tweets
def get_model(embed_dim, hidden_state_dim):
    """Build the (uncompiled) stacked-GRU sentiment classifier.

    Architecture: Embedding -> 4 x GRU -> Dense(3, softmax).

    The vocabulary size and the padded sequence length are read from the
    notebook globals ``vocab_dim`` and ``X_train_pad``, so both must exist
    before this function is called.

    Args:
        embed_dim: size of each word-embedding vector.
        hidden_state_dim: number of units in every GRU layer.

    Returns:
        A ``Sequential`` model that outputs three class probabilities per
        input sequence.
    """
    model = Sequential()
    model.add(Embedding(input_dim=vocab_dim,  # size of vocabulary
                        output_dim=embed_dim,  # size of each embedding vector
                        input_length=X_train_pad.shape[-1]  # padded sentence length
                        ))
    # The first three GRU layers hand the full sequence to the next
    # recurrent layer ...
    for _ in range(3):
        model.add(GRU(hidden_state_dim, return_sequences=True))
    # ... and the last one keeps only its final state for classification.
    model.add(GRU(hidden_state_dim, return_sequences=False))
    # Softmax rather than sigmoid: the three classes are mutually exclusive
    # and the model is compiled with sparse_categorical_crossentropy, which
    # expects one probability distribution per sample.
    model.add(Dense(3, activation='softmax'))
    return model

In [4]:
# Clean every tweet up front (lower-cased, punctuation stripped).
train_tweets = list(map(preprocess_sentence, df_train['twitter']))
test_tweets = list(map(preprocess_sentence, df_test['twitter']))

In [5]:
# The vocabulary is built over the training and test tweets together.
vocab_dim, sorted_words = get_vocabulary_size([*train_tweets, *test_tweets])

In [6]:
# Word -> integer id, following the order of sorted_words; ids start at 1.
word2int = {
    word: rank
    for rank, (word, _count) in enumerate(sorted_words, start=1)
}

In [7]:
# Turn the cleaned tweets into lists of word ids.
X_train, X_test = (
    tokenize(tweet_set) for tweet_set in (train_tweets, test_tweets)
)

In [8]:
# Map the sentiment labels to integer class ids.
le = LabelEncoder().fit(df_train['sentiment'])
y_train = le.transform(df_train['sentiment'])

In [9]:
maximum_length = 50  # tweets are shorter than reviews we looked at in class
X_train_pad = padding(X_train, maximum_length)
X_test_pad = padding(X_test, maximum_length)
# Embedding input size: one row per word id plus row 0 for padding.
# Derived from word2int rather than incremented in place, so re-running this
# cell does not keep growing vocab_dim.
vocab_dim = len(word2int) + 1

In [21]:
# Seed the Python, NumPy and TensorFlow random number generators.
seed = 2022
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

embed_dim = 20
hidden_state_dim = 30

model = get_model(embed_dim, hidden_state_dim)
model.summary()
model.compile(loss='sparse_categorical_crossentropy',
              optimizer=optimizers.Adam(learning_rate=1e-3),
              metrics=['accuracy'])

# Stop once val_loss has not improved for 2 epochs and roll back to the
# weights of the best epoch.
callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    verbose=1,
    restore_best_weights=True,
)

# Hold out 5% of the training data, stratified by label, for validation.
# The split reuses `seed` instead of repeating the literal 2022.
X_train_new, X_val_new, y_train_new, y_val_new = train_test_split(
    X_train_pad, y_train,
    stratify=y_train,
    test_size=0.05,
    random_state=seed,
)

history = model.fit(x=X_train_new, y=y_train_new,
                    batch_size=32,
                    # Keras documents validation_data as a tuple
                    # (x_val, y_val), not a list.
                    validation_data=(X_val_new, y_val_new),
                    epochs=10,
                    callbacks=[callback],
                    verbose=1)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 20)            1583980   
                                                                 
 gru_4 (GRU)                 (None, 50, 30)            4680      
                                                                 
 gru_5 (GRU)                 (None, 50, 30)            5580      
                                                                 
 gru_6 (GRU)                 (None, 50, 30)            5580      
                                                                 
 gru_7 (GRU)                 (None, 30)                5580      
                                                                 
 dense_1 (Dense)             (None, 3)                 93        
                                                                 
Total params: 1,605,493
Trainable params: 1,605,493
Non-trainable params: 0

In [16]:
# Save test predictions
y_test_pred = model.predict(X_test_pad)
# Pick the highest-scoring class for each tweet. The scores must not be
# rounded first: rounding turns rows into all zeros or into ties, and argmax
# then falls back to the first column.
y_test_labels_pred = np.argmax(y_test_pred, axis=1)
# Shift class ids 0/1/2 back to sentiment values -1/0/1.
# NOTE(review): assumes the original labels are exactly -1, 0, 1 so that the
# LabelEncoder assigned them ids 0, 1, 2 — confirm, or use le.inverse_transform.
y_test_labels_pred_new = y_test_labels_pred - 1

df_test_pred = pd.DataFrame(data=y_test_labels_pred_new, columns=['sentiment'])
df_test_pred.to_csv('sentiment_prediction.csv')

