<a href="https://colab.research.google.com/github/jliu1224/PythonProject/blob/main/Twitter_Sentiment_Covid_19_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import files, drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
import torch
import os
import glob
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import TensorDataset, DataLoader
import re
import string as st
import torch.optim as optim
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

## **Data Preparation**

In [4]:
df = pd.read_csv('CoronaTwitters.csv', encoding = "ISO-8859-1", engine='python')
df.head(5)

Unnamed: 0,UserName,ScreenName,Location,TweetAt,OriginalTweet,Sentiment
0,1,44953,NYC,2/3/2020,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative
1,2,44954,"Seattle, WA",2/3/2020,When I couldn't find hand sanitizer at Fred Me...,Positive
2,3,44955,,2/3/2020,Find out how you can protect yourself and love...,Extremely Positive
3,4,44956,Chicagoland,2/3/2020,#Panic buying hits #NewYork City as anxious sh...,Negative
4,5,44957,"Melbourne, Victoria",3/3/2020,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral


In [5]:
#Read Data and create labels
texts = df[['OriginalTweet', 'Sentiment']]
sentiments = {'Extremely Positive':2, 'Positive':2, 'Neutral':1, 'Negative':0, 'Extremely Negative':0}
texts['labels'] = texts['Sentiment'].str.strip().map(sentiments)
texts.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,OriginalTweet,Sentiment,labels
0,TRENDING: New Yorkers encounter empty supermar...,Extremely Negative,0
1,When I couldn't find hand sanitizer at Fred Me...,Positive,2
2,Find out how you can protect yourself and love...,Extremely Positive,2
3,#Panic buying hits #NewYork City as anxious sh...,Negative,0
4,#toiletpaper #dunnypaper #coronavirus #coronav...,Neutral,1


In [6]:
#Clean the data in Original Tweet
tweets = texts['OriginalTweet']
lowercase = tweets.map(lambda t : t.lower())
noa = lowercase.str.replace('â', '', n=-1)
nohttp = noa.str.replace('[http://][t.co][/][/D+]', '', n=-1)
nopunc = nohttp.str.replace('[,().?!-":#]', '', n=-1)
noat = nopunc.str.replace('[@]\w+', '', n=-1)
non = noat.str.replace('\n', '', n=-1)
new_tweets = non.str.replace('/', ' ', n=-1)
new_tweets

0        trending new yorkers encounter empty supermark...
1        when i couldn't find hand sanitizer at fred me...
2        find out how you can protect yourself and love...
3        panic buying hits newyork city as anxious shop...
4        toiletpaper dunnypaper coronavirus coronavirus...
                               ...                        
44950    airline pilots offering to stock supermarket s...
44951    response to complaint not provided citing covi...
44952    you know its getting tough when   is rationin...
44953    is it wrong that the smell of hand sanitizer i...
44954     well new used rift s are going for $70000 on ...
Name: OriginalTweet, Length: 44955, dtype: object

In [7]:
texts['OriginalTweet'] = new_tweets

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [8]:
#Get the data
twitters = new_tweets.values
labels = texts.labels.values
#Using TF-IDF to represent each twitter record, denoted as X;
#Create a label vector Y
vectorizer = TfidfVectorizer(stop_words='english', max_features=500, ngram_range=(1,1)) 
instances = vectorizer.fit_transform(twitters)
X = instances.toarray()
Y = labels

print('The shape of X is:', X.shape)
print('The shape of Y is:', Y.shape)

The shape of X is: (44955, 500)
The shape of Y is: (44955,)


# **Random Forest**

In [9]:
#Building a random forest classifier as one of the baselines
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,
                                                   random_state = 1997)
rf_model = RandomForestClassifier(criterion='entropy', max_depth=9, random_state=1997, n_estimators=20)
rf_model.fit(X_train, y_train)

print('Random Forest - Accuracy on training set: {:.4f}'.format(rf_model.score(X_train, y_train)))
print('Random Forest - Accuracy on test set: {:.4f}'.format(rf_model.score(X_test, y_test)))

Random Forest - Accuracy on training set: 0.5679
Random Forest - Accuracy on test set: 0.5712


## **Fully Connected Feedforward Network**

In [10]:
#The parameters for the fully connected feedforward network
epochs = 5
lr = 2e-3
indim = X.shape[1]
outdim = 3
drate = 0.8
batch_size = 500

#Create the dataset
X_tensor = torch.from_numpy(X)
Y_tensor = torch.from_numpy(Y)

dataset = TensorDataset(X_tensor, Y_tensor)
train_size = int(0.8*len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size], generator=torch.Generator().manual_seed(150))

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, shuffle=True, batch_size=batch_size)

In [11]:
#Create the fully connected feedforward network
import torch.nn as nn
import torch.nn.functional as F

class SentimentNetwork(nn.Module):
  def __init__(self, input_dim, output_dim, dropout_rate):
    
    super(SentimentNetwork,self).__init__()
    
    self.fc1 = nn.Linear(input_dim, 375)
    self.fc2 = nn.Linear(375, 244)
    self.fc3 = nn.Linear(244, 100)
    self.fc4 = nn.Linear(100, 12)
    self.fc5 = nn.Linear(12, output_dim)
    self.dropout = nn.Dropout(p=drate)

  def forward(self,x):
    x = self.dropout(F.relu(self.fc1(x))) 
    x = self.dropout(F.relu(self.fc2(x)))
    x = self.dropout(F.relu(self.fc3(x)))
    x = self.dropout(F.relu(self.fc4(x)))
    x = F.softmax(self.fc5(x))
   
    return x

# create a model
model = SentimentNetwork(indim, outdim, drate)
print(model)

SentimentNetwork(
  (fc1): Linear(in_features=500, out_features=375, bias=True)
  (fc2): Linear(in_features=375, out_features=244, bias=True)
  (fc3): Linear(in_features=244, out_features=100, bias=True)
  (fc4): Linear(in_features=100, out_features=12, bias=True)
  (fc5): Linear(in_features=12, out_features=3, bias=True)
  (dropout): Dropout(p=0.8, inplace=False)
)


In [12]:
#Training function for one epoch
def train(model, train_loader, optimizer, criterion):
  
  epoch_loss_total, epoch_acc_total = 0.0,0.0 # the loss and accuracy for each epoch

  model.train()

  for batch_idx, (data, target) in enumerate(train_loader):  
    #Zero gradient
    optimizer.zero_grad() 
    #predictions = calculate the predicted output for the current batch 
    predictions = model(data.float()) 
    #loss = calculate the loss for the current batch using predictions and the truth
    loss = criterion(predictions, target) 
    #acc = calculate the accuracy using the predictions and the truth
    pred = predictions.data.max(1)[1] # get the index of the max log-probability
    acc = pred.eq(target.data).sum()
    
    #backpropagate
    loss.backward() 
    optimizer.step() 
     
    epoch_loss_total += loss.item()
    epoch_acc_total += acc

  #calculate the average epoch_loss and epoch_acc
  epoch_loss = epoch_loss_total/len(train_loader.dataset)
  epoch_acc = epoch_acc_total/len(train_loader.dataset)

  return epoch_loss, epoch_acc

#Validation process function for one epoch
def evaluate(model, val_loader, criterion):
  
  epoch_loss_total, epoch_acc_total = 0.0,0.0 # the loss and accuracy for each epoch

  model.eval()
    
  with torch.no_grad():
    for data, target in val_loader: 
      #predictions
      predictions = model(data.float())
      #loss
      loss = criterion(predictions, target) 
      #acc
      pred = predictions.data.max(1)[1]
      acc = pred.eq(target.data).sum()

      epoch_loss_total += loss.item()
      epoch_acc_total += acc
    #calculate the average epoch_loss and epoch_acc
    epoch_loss = epoch_loss_total/len(val_loader.dataset)
    epoch_acc = epoch_acc_total/len(val_loader.dataset)   

    return epoch_loss, epoch_acc

In [13]:
#Performance of Fully Connected Feedforward Network
#Define the optimizer and the learning rate
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

#Genearte a report on the performance
for epoch in range(epochs):
  train_loss, train_acc = train(model, train_loader, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, val_loader, criterion)
    
  print(f'Epoch: {epoch+1:02}')
  print(f'Train Loss: {train_loss:.4f}   Train Acc: {train_acc:.4f}')
  print(f' Val. Loss: {valid_loss:.4f}    Val. Acc: {valid_acc:.4f}')



Epoch: 01
Train Loss: 0.0022   Train Acc: 0.3385
 Val. Loss: 0.0021    Val. Acc: 0.4410
Epoch: 02
Train Loss: 0.0021   Train Acc: 0.4434
 Val. Loss: 0.0020    Val. Acc: 0.5916
Epoch: 03
Train Loss: 0.0020   Train Acc: 0.5675
 Val. Loss: 0.0019    Val. Acc: 0.5997
Epoch: 04
Train Loss: 0.0019   Train Acc: 0.5890
 Val. Loss: 0.0019    Val. Acc: 0.6047
Epoch: 05
Train Loss: 0.0019   Train Acc: 0.5934
 Val. Loss: 0.0019    Val. Acc: 0.6069


## **Recurrent Neural Network (RNN)**

### Data Preparation for RNN

In [14]:
#Split the dataframe based on their labels
positive_tweets = texts[texts['labels']==2]
neutral_tweets = texts[texts['labels']==1]
negative_tweets = texts[texts['labels']==0]

#Sample from each tweet dataset based on a proportion of 19:17:8 (pos:neg:neu)
#pos_samp = positive_tweets.sample(n=3800, random_state=1997)
#neg_samp = negative_tweets.sample(n=3400, random_state=1997)
#neu_samp = neutral_tweets.sample(n=1600, random_state=1997)

#Construct a corpus and labels list
corpus = []
labels = []
for doc in positive_tweets['OriginalTweet']:
  corpus.append(doc.replace('\n', ' '))
  labels.append([1, 0, 0])
for doc in neutral_tweets['OriginalTweet']:
  corpus.append(doc.replace('\n', ' '))
  labels.append([0, 1, 0])
for doc in negative_tweets['OriginalTweet']:
  corpus.append(doc.replace('\n', ' '))
  labels.append([0, 0, 1])

vectorizer_r = TfidfVectorizer(max_features=500, stop_words='english')
X1 = vectorizer_r.fit_transform(corpus)
y1 = np.array(labels)
print(X1.shape, y1.shape)

(44955, 500) (44955, 3)


In [15]:
import torch
from sklearn.model_selection import train_test_split

seq_length = -1

word_tokenizer = vectorizer_r.build_tokenizer()
vocab = vectorizer_r.vocabulary_

doc_terms_list_train = [word_tokenizer(doc_str) for doc_str in corpus]
docs = []
for i in range(len(doc_terms_list_train)):
  terms = []
  for j in range(len(doc_terms_list_train[i])):
    w = doc_terms_list_train[i][j]
    if w in vocab:
      terms.append(w)
  if len(terms) > seq_length:
    seq_length = len(terms)
  docs.append(terms)

max_features = 500
datasets = np.zeros((X1.shape[0], seq_length, max_features))

for i in range(len(docs)):
  n_padding = seq_length - len(docs[i])

  for j in range(len(docs[i])):
    w = docs[i][j]
    idx = vocab[w]
    tfidf_val = X1[i, idx]
    datasets[i, j+n_padding, idx] = tfidf_val

datasets = datasets.astype(np.float32)
y1 = y1.astype(np.float32)

X1_train, X1_val, y1_train, y1_val = train_test_split(datasets, y1, test_size = 0.2, random_state = 1997)
print(X1_train.shape, X1_val.shape, y1_train.shape, y1_val.shape)

(35964, 28, 500) (8991, 28, 500) (35964, 3) (8991, 3)


In [34]:
#Same process with creating the datasets, but for RNN
batch_size_r = 250

train_data = TensorDataset(torch.from_numpy(X1_train), torch.from_numpy(y1_train))
val_data = TensorDataset(torch.from_numpy(X1_val), torch.from_numpy(y1_val))

train_loader_r = DataLoader(train_data, shuffle=True, batch_size=batch_size_r)
val_loader_r = DataLoader(val_data, shuffle=True, batch_size=batch_size_r)

Building the RNN

In [33]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim

#Parameters
input_size = 500
hidden_size = 400
n_layers = 3
output_size = 25

class Model(nn.Module):

  def __init__(self, input_size, output_size, hidden_size, n_layers):
    super().__init__()
    self.hidden_size = hidden_size
    self.n_layers = n_layers
    self.rnn = nn.RNN(input_size,hidden_size,n_layers,batch_first=True) # rnn layer
    self.fc1 = nn.Linear(hidden_size,output_size) # rnn output (y_t) --> output (y'_t)
    self.fc2 = nn.Linear(output_size,3) #the output from the last time period ->sentiment prediction

  def forward(self,x, hidden):
    batch_size = x.size()[0]
    hidden = self.init_hidden(batch_size)
    
    rnn_out,hidden = self.rnn(x,hidden)
    rnn_out = self.fc1(rnn_out)
    last_out = rnn_out[:,-1,:].view(batch_size,-1)
    out = F.softmax(self.fc2(last_out))
    return out,hidden 

  def init_hidden(self,batch_size):
    hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size).cuda()
    return hidden

RNNmodel = Model(input_size, output_size, hidden_size, n_layers)

print(RNNmodel)

Model(
  (rnn): RNN(500, 400, num_layers=3, batch_first=True)
  (fc1): Linear(in_features=400, out_features=25, bias=True)
  (fc2): Linear(in_features=25, out_features=3, bias=True)
)


Training and Validating

In [35]:
#Train on GPU/CPU
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
  RNNmodel.to(device)

#Define hyperparameters
n_epochs = 6
lr = 1e-4
counter = 0
clip = 5

#Define loss and optimizer
criterion_r = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(RNNmodel.parameters(), lr=lr)

RNNmodel.train()

for epochs in range(n_epochs):
  #initiate hidden state
  h = RNNmodel.init_hidden(batch_size_r)

  #batch_loop
  for inputs, labels in train_loader_r:
    inputs, labels = inputs.to(device), labels.to(device)
    counter += 1
    
    RNNmodel.zero_grad()

    outputs, h = RNNmodel(inputs, h)

    loss = criterion_r(outputs, torch.max(labels, 1)[1])
    loss.backward()
    pred = torch.max(outputs, 1)[1]
    acc = pred.eq(torch.max(labels, 1)[1]).sum()

    #Clip_grad_norm to help prevent the exploding gradient problem in RNNs
    nn.utils.clip_grad_norm(RNNmodel.parameters(), clip)
    optimizer.step()

    ##Validation Loss
    if counter % 10 == 0:
      val_h = RNNmodel.init_hidden(batch_size_r)
      val_losses = []


      RNNmodel.eval()

      for inputs, labels in val_loader_r:
        inputs, labels = inputs.to(device), labels.to(device)
        val_outputs, val_h = RNNmodel(inputs, val_h)
        val_loss = criterion_r(val_outputs, torch.max(labels, 1)[1])
        val_losses.append(val_loss.item())
        pred = val_outputs.data.max(1)[1]
        val_acc = pred.eq(labels.data.max(1)[1]).sum()

      RNNmodel.train()

      print('Epoch:{}/{}'.format(epochs+1, n_epochs),
            'Batch:{}'.format(counter),
            'Train Accuracy:{:.5f}'.format(acc/batch_size_r),
            'Train Loss:{:.5f}'.format(loss.item()),
            'Val Accuracy:{:.5f}'.format(val_acc/batch_size_r),
            'Val Loss:{:.5f}'.format(np.mean(val_losses)))
   



Epoch:1/6 Batch:10 Train Accuracy:0.45200 Train Loss:1.06112 Val Accuracy:0.42400 Val Loss:1.06455
Epoch:1/6 Batch:20 Train Accuracy:0.49200 Train Loss:1.04621 Val Accuracy:0.44400 Val Loss:1.05393
Epoch:1/6 Batch:30 Train Accuracy:0.46000 Train Loss:1.06052 Val Accuracy:0.44000 Val Loss:1.04995
Epoch:1/6 Batch:40 Train Accuracy:0.47200 Train Loss:1.04488 Val Accuracy:0.39200 Val Loss:1.04979
Epoch:1/6 Batch:50 Train Accuracy:0.44000 Train Loss:1.04446 Val Accuracy:0.43200 Val Loss:1.04959
Epoch:1/6 Batch:60 Train Accuracy:0.41600 Train Loss:1.06291 Val Accuracy:0.41600 Val Loss:1.04961
Epoch:1/6 Batch:70 Train Accuracy:0.38800 Train Loss:1.05370 Val Accuracy:0.44000 Val Loss:1.04954
Epoch:1/6 Batch:80 Train Accuracy:0.42000 Train Loss:1.05903 Val Accuracy:0.42400 Val Loss:1.04971
Epoch:1/6 Batch:90 Train Accuracy:0.43200 Train Loss:1.04375 Val Accuracy:0.40800 Val Loss:1.05086
Epoch:1/6 Batch:100 Train Accuracy:0.38400 Train Loss:1.04519 Val Accuracy:0.41200 Val Loss:1.05107
Epoch:1/6