In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import logging
import gzip
import gensim 
import re
import spacy
import math
from bs4 import BeautifulSoup
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence, pad_sequence
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
print(torch.cuda.is_available())
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device

True


'cuda:0'

# Preprocessing & Loading data

### Covid dataset

In [4]:
# Load dataset for clothing reviews
covid_train = pd.read_csv("/content/drive/MyDrive/data /Corona_NLP_test.csv", encoding="latin1")
covid_train = covid_train.dropna()
covid_test = pd.read_csv("/content/drive/MyDrive/data /Corona_NLP_test.csv")
covid_test = covid_test.dropna()

covid_trian_inds = list(range(0, covid_train.shape[0]))

# cat both datasets for preprocessing
frames = [covid_train, covid_test]
covid = pd.concat(frames)
print(covid_train.head())
print(covid_test.shape)
print(type(covid))

   UserName  ...           Sentiment
0         1  ...  Extremely Negative
1         2  ...            Positive
3         4  ...            Negative
4         5  ...             Neutral
5         6  ...             Neutral

[5 rows x 6 columns]
(2964, 6)
<class 'pandas.core.frame.DataFrame'>


In [5]:
covid.head()

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,02-03-2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",02-03-2020,When I couldn't find hand sanitizer at Fred Me...,Positive
3,4,44956,Chicagoland,02-03-2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",03-03-2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral
5,6,44958,Los Angeles,03-03-2020,Do you remember the last time you paid $2.99 a...,Neutral


In [6]:
# preprocess training and testing set

covid = covid[['OriginalTweet', 'Sentiment']]
covid.columns = ['tweet', 'sentiment']
sentiment_dict = {'negative': 0, 'extremely negative':0, 'neutral':1, 'positive':2, 'extremely positive':2}
error_line = []

def encode_sentiment(x):
    sent_encoded = sentiment_dict[x.lower()]
    return sent_encoded

covid['sentiment_encoded'] = covid['sentiment'].apply(lambda x: encode_sentiment(x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


In [7]:
covid.head()

Unnamed: 0,tweet,sentiment,sentiment_encoded
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,0
1,When I couldn't find hand sanitizer at Fred Me...,Positive,2
3,#Panic buying hits #NewYork City as anxious sh...,Negative,0
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,1
5,Do you remember the last time you paid $2.99 a...,Neutral,1


In [8]:
print(len(covid.sentiment_encoded))
print(covid.shape[0])

5928
5928


#### Tokenize each sentence

In [9]:
# read in data
def clear_data(df):
    stopword = set(stopwords.words('english'))

    # tokenize and post process, remove unnecessary words appearing in tweets(like urls and usernames)
    tweet_words = []
    str_lis = []
    for t in df['tweet']:
        t = t.lower()
        tokens = tokenize(t)
        tmp = []
        for token in tokens:
            # if token in stopword:
            #     continue
            if not wordnet.synsets(token):
                continue
            # repalce usernames with @USER
            user = re.sub(r'@[\w\W]+', '@USER', token)
            # replace with URL
            url = re.sub(r'https[\w\W]+','URL',user)
            if url in stopword:
                continue
            tmp.append(url)
        tweet_words.append(tmp)
        st = ' '.join([str(item) for item in tmp])
        str_lis.append(st)
    df['tweet_words'] = tweet_words
    df['tweets'] = str_lis

    # remove duplicates
    df.drop_duplicates(subset=['tweets'], inplace=True)

    df.drop(columns='tweet_words', inplace=True)
    print(len(df))
    df.to_csv("covid.csv", index=False)
    return df

def tokenize(text):
    words = []
    # print(text.split())
    for token in text.split():
        # find wouldn't kind word
        words.extend(re.findall(r"\w+-\w+|https.+|\.+|\d+[\.,]\d+|[@#]\w+|[+-]\d+|(?:(?!n[’'])\w)+|\w?[’']\w+|[^\s\w]", token))
        # words.extend(re.findall(r"(?:(?!.')\w)+|\w?'\w+|[^\s\w]", token))
    return words

In [10]:
covid = clear_data(covid)

3086


In [11]:
covid.head()

Unnamed: 0,tweet,sentiment,sentiment_encoded,tweets
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,0,trending new encounter empty supermarket shelv...
1,When I couldn't find hand sanitizer at Fred Me...,Positive,2,find hand turned 2 pack check concerns driving...
3,#Panic buying hits #NewYork City as anxious sh...,Negative,0,buying hits city anxious shoppers stock food a...
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,1,one week buying baby milk powder next buying t...
5,Do you remember the last time you paid $2.99 a...,Neutral,1,remember last time paid gallon regular gas pri...


#### Encode and truncate sentence

In [12]:
# tokenize the tweets into list
def encode_tweets(x):
    tweet_list = x.split()
    return tweet_list

covid['tweets_tokenized'] = covid['tweets'].apply(lambda x: encode_tweets(x))
covid.reset_index(drop=True, inplace=True)
# drop rows wth empty tokenized tweet
for i, row in covid.iterrows():
    if len(row.tweets_tokenized) < 2:
        covid.drop(i, inplace = True)

In [13]:
covid.head(30)

Unnamed: 0,tweet,sentiment,sentiment_encoded,tweets,tweets_tokenized
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,0,trending new encounter empty supermarket shelv...,"[trending, new, encounter, empty, supermarket,..."
1,When I couldn't find hand sanitizer at Fred Me...,Positive,2,find hand turned 2 pack check concerns driving...,"[find, hand, turned, 2, pack, check, concerns,..."
2,#Panic buying hits #NewYork City as anxious sh...,Negative,0,buying hits city anxious shoppers stock food a...,"[buying, hits, city, anxious, shoppers, stock,..."
3,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,1,one week buying baby milk powder next buying t...,"[one, week, buying, baby, milk, powder, next, ..."
4,Do you remember the last time you paid $2.99 a...,Neutral,1,remember last time paid gallon regular gas pri...,"[remember, last, time, paid, gallon, regular, ..."
5,"@DrTedros ""We canÂt stop #COVID19 without pro...",Neutral,1,stop protecting prices surgical masks increase...,"[stop, protecting, prices, surgical, masks, in..."
6,Anyone been in a supermarket over the last few...,Extremely Positive,2,supermarket last days went normal shop last ni...,"[supermarket, last, days, went, normal, shop, ..."
7,Best quality couches at unbelievably low price...,Positive,2,best quality couches unbelievably low prices a...,"[best, quality, couches, unbelievably, low, pr..."
8,Beware of counterfeits trying to sell fake mas...,Extremely Negative,0,beware counterfeits trying sell fake masks che...,"[beware, counterfeits, trying, sell, fake, mas..."
9,Panic food buying in Germany due to #coronavir...,Extremely Negative,0,panic food buying germany due begun left behin...,"[panic, food, buying, germany, due, begun, lef..."


In [14]:
# sanity check for length of tweet
len_list = [len(x) for x in covid.tweets_tokenized]
max(len_list)

34

In [15]:
# get unique words in the corpus
all_words = []
for x in covid['tweets_tokenized']:
    all_words.extend(x)

word_set = list(set(all_words))
word_count = Counter(all_words)

# filter out words with low frequency
for word_list in covid.tweets_tokenized:
    new_list = []
    for word in word_list:
        if word_count[word] > 2:
            new_list.append(word)
    word_list = new_list

# update set of words after removing the ones with low frequency
new_word_list = []
for x in covid['tweets_tokenized']:
    new_word_list.extend(x)
word_set = list(set(new_word_list))

# map each unique words & unknown token in covid.encoded to an index
word2index = {}
word2index['<UNK>'] = 0
word2index['<PAD>'] = 1

for i, word in enumerate(word_set, 2):
    word2index[word] = i

# encode the original sequence
def encode(sent_list):
    result = []
    for x in sent_list:
        index = word2index[x]
        result.append(index)
    return result

covid['encoded_old'] = covid.tweets_tokenized.apply(lambda x: encode(x))


# get sequence average length
total_len = 0
for x in covid.encoded_old:
    total_len += len(x)

ave_len = math.floor(total_len/covid.shape[0])

# filter out long sequences --> encode all sequence to length = ave_len
# pad short sequence

def fix_tweet(x):
    size = min(len(x), ave_len)
    new_encoded = x[:size]
    len_before_pad = len(new_encoded)
    if size < ave_len:
        for j in range(0, ave_len - len(x)):
            new_encoded.append(0)
    return new_encoded

covid['encoded'] = covid.encoded_old.apply(lambda x: fix_tweet(x))
covid['tweet_length'] = covid.encoded_old.apply(lambda x: min(len(x), ave_len))

In [16]:
max(covid.tweet_length)

print(word2index['exchange'])
covid.head()

933


Unnamed: 0,tweet,sentiment,sentiment_encoded,tweets,tweets_tokenized,encoded_old,encoded,tweet_length
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,0,trending new encounter empty supermarket shelv...,"[trending, new, encounter, empty, supermarket,...","[1895, 3548, 2871, 4578, 5241, 5577, 1097, 121...","[1895, 3548, 2871, 4578, 5241, 5577, 1097, 121...",14
1,When I couldn't find hand sanitizer at Fred Me...,Positive,2,find hand turned 2 pack check concerns driving...,"[find, hand, turned, 2, pack, check, concerns,...","[5374, 2881, 2208, 1007, 5430, 3483, 1356, 219...","[5374, 2881, 2208, 1007, 5430, 3483, 1356, 219...",9
2,#Panic buying hits #NewYork City as anxious sh...,Negative,0,buying hits city anxious shoppers stock food a...,"[buying, hits, city, anxious, shoppers, stock,...","[6384, 4391, 5073, 5959, 6314, 5583, 1505, 599...","[6384, 4391, 5073, 5959, 6314, 5583, 1505, 599...",15
3,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,1,one week buying baby milk powder next buying t...,"[one, week, buying, baby, milk, powder, next, ...","[1670, 3288, 6384, 2611, 2489, 1364, 275, 6384...","[1670, 3288, 6384, 2611, 2489, 1364, 275, 6384...",10
4,Do you remember the last time you paid $2.99 a...,Neutral,1,remember last time paid gallon regular gas pri...,"[remember, last, time, paid, gallon, regular, ...","[836, 4045, 6268, 3413, 6612, 4719, 3423, 2419...","[836, 4045, 6268, 3413, 6612, 4719, 3423, 2419...",13


In [17]:
# sanity check for encoding:
print(len(all_words))
print(ave_len)
lengths = [len(x) for x in covid.encoded]
print(max(lengths))
print(min(lengths))
print(set(covid.sentiment_encoded))
tweet_length = covid.tweet_length
covid.head()
print(len(word_set))

47853
15
15
15
{0, 1, 2}
6816


In [18]:
# Train test split from skearln
data_size = len(covid['encoded'])
assert data_size == len(covid['sentiment_encoded']) 
X, y = list(zip(list(covid['encoded']),(list(covid['tweet_length'])))), list(covid['sentiment_encoded'])
# X(data, length)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

In [19]:
# Sanity check for length match
print(X_train[2])
print(y_train[2])

([4747, 4644, 3880, 1911, 5813, 448, 6343, 2419, 0, 0, 0, 0, 0, 0, 0], 8)
2


In [20]:
X_train_new=[]
for tup in X_train:
  arr = np.asarray(tup)
  X_train_new.append(arr)
X_train_new

  return array(a, dtype, copy=False, order=order)


[array([list([1740, 4107, 1028, 6079, 4357, 1279, 3137, 3101, 1287, 2371, 5232, 3334, 4236, 4113, 4402]),
        15], dtype=object),
 array([list([3147, 1425, 1741, 4543, 3522, 26, 856, 3336, 5612, 1206, 6787, 2403, 3199, 1982, 253]),
        15], dtype=object),
 array([list([4747, 4644, 3880, 1911, 5813, 448, 6343, 2419, 0, 0, 0, 0, 0, 0, 0]),
        8], dtype=object),
 array([list([6485, 6598, 1366, 5583, 5548, 2102, 5583, 4863, 4242, 4967, 6332, 2419, 3329, 0, 0]),
        13], dtype=object),
 array([list([6551, 524, 521, 4357, 3989, 5037, 3185, 2119, 1687, 2775, 5480, 5914, 5141, 2680, 6315]),
        15], dtype=object),
 array([list([5852, 4213, 2680, 711, 850, 260, 5999, 4993, 1962, 850, 2680, 4390, 4045, 4840, 943]),
        15], dtype=object),
 array([list([6598, 1489, 836, 6698, 4125, 1505, 3081, 3948, 6675, 0, 0, 0, 0, 0, 0]),
        9], dtype=object),
 array([list([1480, 1345, 141, 253, 2119, 1687, 1036, 4401, 3443, 6755, 6769, 53, 4154, 1799, 2950]),
        15], dtype=o

In [21]:
X_train_new[0].size

2

In [22]:
X_test_new=[]
for tup in X_test:
  arr = np.asarray(tup)
  X_test_new.append(arr)
X_test_new

  return array(a, dtype, copy=False, order=order)


[array([list([2680, 1751, 3732, 5577, 1978, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        5], dtype=object),
 array([list([1077, 5622, 2267, 1237, 3546, 275, 658, 1642, 5077, 1257, 5902, 4315, 6492, 5241, 0]),
        14], dtype=object),
 array([list([2119, 4715, 6213, 1687, 1412, 149, 1990, 1082, 1940, 954, 6581, 6766, 3574, 1450, 2234]),
        15], dtype=object),
 array([list([3246, 3185, 5378, 5700, 6401, 2371, 5232, 2890, 2472, 4089, 5914, 2116, 1873, 5411, 2119]),
        15], dtype=object),
 array([list([3749, 5583, 3081, 1505, 4946, 3251, 1505, 5441, 2445, 530, 4468, 0, 0, 0, 0]),
        11], dtype=object),
 array([list([802, 6809, 2119, 1687, 6623, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        5], dtype=object),
 array([list([4493, 5132, 6380, 1789, 4267, 5241, 924, 0, 0, 0, 0, 0, 0, 0, 0]),
        7], dtype=object),
 array([list([4109, 5232, 4045, 3925, 5284, 3759, 4637, 2454, 1165, 4440, 4892, 6597, 1303, 3370, 2389]),
        15], dtype=object),
 array([list([5956, 977, 5583, 1505, 

In [23]:
class ReviewsDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.y = Y
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return torch.from_numpy(np.array(self.X[idx][0])), self.y[idx], self.X[idx][1]

In [24]:
train_ds = ReviewsDataset(X_train_new, y_train)
valid_ds = ReviewsDataset(X_test_new, y_test)

In [25]:
type(X_train_new[0][1])

int

In [26]:
def train_model(model, epochs=10, lr=0.001):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    for i in range(epochs):
        model.train()
        sum_loss = 0.0
        total = 0
        for x, y, l in train_dl:
            x = x.long()
            y = y.long()
            y_pred = model(x, l)
            optimizer.zero_grad()
            loss = F.cross_entropy(y_pred, y)
            loss.backward()
            optimizer.step()
            sum_loss += loss.item()*y.shape[0]
            total += y.shape[0]
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if i % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (sum_loss/total, val_loss, val_acc, val_rmse))

def validation_metrics (model, valid_dl):
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_rmse = 0.0
    for x, y, l in valid_dl:
        x = x.long()
        y = y.long()
        y_hat = model(x, l)
        loss = F.cross_entropy(y_hat, y)
        pred = torch.max(y_hat, 1)[1]
        correct += (pred == y).float().sum()
        total += y.shape[0]
        sum_loss += loss.item()*y.shape[0]
        sum_rmse += np.sqrt(mean_squared_error(pred, y.unsqueeze(-1)))*y.shape[0]
    return sum_loss/total, correct/total, sum_rmse/total


In [27]:
batch_size = 5000
vocab_size = len(word2index)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(valid_ds, batch_size=batch_size)

In [28]:
class LSTM_fixed_len(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 5)
        self.dropout = nn.Dropout(0.2)
        
    def forward(self, x, l):
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        return self.linear(ht[-1])

In [29]:
model_fixed =  LSTM_fixed_len(vocab_size, 50, 50)

In [30]:
train_model(model_fixed, epochs=30, lr=0.01)

train loss 1.508, val loss 1.436, val accuracy 0.425, and val rmse 1.344
train loss 1.102, val loss 1.077, val accuracy 0.412, and val rmse 1.367
train loss 1.009, val loss 1.056, val accuracy 0.432, and val rmse 1.336
train loss 0.881, val loss 1.058, val accuracy 0.471, and val rmse 1.279
train loss 0.742, val loss 1.051, val accuracy 0.498, and val rmse 1.233
train loss 0.612, val loss 1.138, val accuracy 0.537, and val rmse 1.121


In [31]:
train_model(model_fixed, epochs=30, lr=0.01)

train loss 0.670, val loss 1.215, val accuracy 0.563, and val rmse 1.111
train loss 0.442, val loss 1.114, val accuracy 0.545, and val rmse 1.077
train loss 0.370, val loss 1.264, val accuracy 0.594, and val rmse 1.059
train loss 0.271, val loss 1.361, val accuracy 0.554, and val rmse 1.051
train loss 0.205, val loss 1.525, val accuracy 0.575, and val rmse 1.043
train loss 0.157, val loss 1.667, val accuracy 0.581, and val rmse 1.028


In [32]:
train_model(model_fixed, epochs=30, lr=0.01)

train loss 0.281, val loss 1.752, val accuracy 0.571, and val rmse 1.044
train loss 0.115, val loss 1.759, val accuracy 0.583, and val rmse 1.032
train loss 0.088, val loss 1.781, val accuracy 0.588, and val rmse 1.027
train loss 0.064, val loss 1.836, val accuracy 0.583, and val rmse 1.041
train loss 0.055, val loss 1.879, val accuracy 0.568, and val rmse 1.041
train loss 0.031, val loss 2.119, val accuracy 0.606, and val rmse 1.023
