In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn

In [None]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')
device

device(type='cuda')

In [None]:
# Fetch the NLTK resources used in this notebook: the WordNet data for the
# lemmatizer, the English stop-word list, and the Open Multilingual WordNet package.
download_results = [nltk.download(package) for package in ('wordnet', 'stopwords', 'omw-1.4')]
# Show the status of the final download, as the cell's output value.
download_results[-1]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Raw tweets with their sentiment annotation.
# NOTE(review): the saved output of this cell shows a truncated warning; if it is
# a mixed-dtype warning, consider passing an explicit `dtype=` here - confirm.
raw_tweets_csv = '/content/drive/MyDrive/Final Year Project/DelhiRiotstweets.csv'
df = pd.read_csv(raw_tweets_csv)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Integer class id assigned to each sentiment label.
senNum = {'negative': 0, 'positive': 1, 'neutral': 2}

# Remaining quota per class id: every class may contribute at most as many
# tweets as there are positive tweets in the raw data.
positive_count = int((df['sentiment'] == 'positive').sum())
takeThis = [positive_count] * 3
X = []
y = []

# Walk the rows in file order, keeping a tweet only while its class still has
# quota left. Iterating the two columns directly avoids a label lookup on the
# Series for every row.
for tweet, sentiment in zip(df['tweet'], df['sentiment']):
  if not sum(takeThis):
    # Every class quota is exhausted; nothing further can be selected.
    break
  idx = senNum[sentiment]  # raises KeyError for a label missing from senNum
  if takeThis[idx]:
    X.append(tweet)
    y.append(idx)
    takeThis[idx] -= 1

Saving the balanced data set

In [None]:
# Two-column frame: the selected tweet text and its integer class id.
balanced_df = pd.DataFrame({'Tweet': X, 'Sentiment': y})

In [None]:
# Persist the balanced data set. index=False keeps the row index out of the
# file; otherwise it is written as an unnamed first column and comes back as a
# spurious 'Unnamed: 0' column when the CSV is read again.
balanced_df.to_csv('/content/drive/MyDrive/Final Year Project/BalancedRiotsTweets.csv', index=False)

Loading the balanced data set 

In [None]:
# Reload the balanced data set from disk so the notebook can start from here.
balanced_csv_path = '/content/drive/MyDrive/Final Year Project/BalancedRiotsTweets.csv'
balanced_df = pd.read_csv(balanced_csv_path)

Vectorizing the tweets

In [None]:
# Tweet text column of the balanced data set (input to the cleaning step).
X = balanced_df['Tweet']

In [None]:
# Class-label column of the balanced data set; the balancing step stores the
# integer ids from senNum here (0 = negative, 1 = positive, 2 = neutral).
y = balanced_df['Sentiment']

In [None]:
def _clean_tweet(text, lemmatizer):
    """Normalise one tweet into a space-separated string of lemmatized tokens.

    Args:
        text: the raw tweet; it is passed through ``str()`` first, so non-string
            values are cleaned as their string form.
        lemmatizer: object exposing ``lemmatize(word)``.

    Returns:
        The cleaned, lower-cased tweet with each token lemmatized.
    """
    # Replace every non-word character with a space.
    text = re.sub(r'\W', ' ', str(text))

    # Drop single letters that stand alone between whitespace.
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Drop a single letter at the very start of the text. The caret must stay
    # unescaped: an escaped caret matches a literal '^', which the first
    # substitution has already removed, so this step would never fire.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

    # Collapse runs of whitespace into one space.
    text = re.sub(r'\s+', ' ', text)

    # Lower-case, split on whitespace (which also trims the ends) and lemmatize.
    tokens = text.lower().split()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)


# Kept under its original name although it is a lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()

# One cleaned string per tweet, in the same order as X.
documents = [_clean_tweet(tweet, stemmer) for tweet in X]

In [None]:
# Learn the TF-IDF vocabulary and IDF weights from the training documents only,
# then apply that fitted transform to every document. Fitting on the full
# corpus would let statistics of the held-out tweets leak into the features.
n_train_docs = 35000  # must equal the index at which X and y are split into train/test
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
tfidfconverter.fit(documents[:n_train_docs])
# Dense (n_documents, n_features) array; n_features is at most max_features.
X = tfidfconverter.transform(documents).toarray()

In [None]:
class TweetDataset(Dataset):
  """Dataset pairing each tweet representation with its label.

  Both containers are stored as given and are read with integer indexing;
  the dataset length is the length of `labels`.
  """

  def __init__(self, tweets, labels):
    self.tweets, self.labels = tweets, labels

  def __len__(self):
    """Return the number of samples, taken from the label container."""
    return len(self.labels)

  def __getitem__(self, idx):
    """Return sample `idx` as a dict with the keys 'Tweet' and 'Label'."""
    label, tweet = self.labels[idx], self.tweets[idx]
    return {'Tweet': tweet, 'Label': label}

Performing the train/test split: the first 35,000 samples are used for training and the remaining samples for testing (intended as an 80-20 split)

In [None]:
# Training portion: the first 35,000 feature rows and their labels.
train_rows = slice(None, 35000)
X_train = np.array(X[train_rows])
y_train = np.array(y[train_rows])

In [None]:
# Wrap the training arrays so a DataLoader can batch them.
train_ds = TweetDataset(tweets = X_train, labels = y_train)

In [None]:
# Held-out portion: everything from row 35,000 onwards.
test_rows = slice(35000, None)
X_test = np.array(X[test_rows])
y_test = np.array(y[test_rows])
test_ds = TweetDataset(tweets = X_test, labels = y_test)

Trying out the custom dataset with a DataLoader

In [None]:
# Sanity check: print the first three shuffled mini-batches (two samples each).
dataloader = DataLoader(train_ds, batch_size = 2, shuffle = True)
for idx, batch in enumerate(dataloader):
  if idx == 3:
    break
  print(
    f"Batch #: {idx}\n Tweets: {batch['Tweet']}",
    f"Batch #: {idx}\n Labels: {batch['Label']}",
    sep = '\n',
  )

Batch #: 0
 Tweets: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)
Batch #: 0
 Labels: tensor([0, 0])
Batch #: 1
 Tweets: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)
Batch #: 1
 Labels: tensor([0, 0])
Batch #: 2
 Tweets: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)
Batch #: 2
 Labels: tensor([2, 2])


In [None]:
# Training configuration.
input_dim = 1500  # The value we set as 'max_features' in tfidfvectorizer
num_epochs = 5    # full passes over the training loader
batch_size = 50   # samples per training mini-batch

In [None]:
class FeedforwardNeuralNetModel(nn.Module):
  """Feed-forward network with one hidden layer: Linear -> Sigmoid -> Linear.

  `forward` returns the output of the second linear layer unchanged; no
  softmax is applied inside the model.
  """

  def __init__(self, input_dim, hidden_dim, output_dim):
    super().__init__()
    # Input features -> hidden units.
    self.fc1 = nn.Linear(input_dim, hidden_dim)
    # Non-linearity applied to the hidden units.
    self.sigmoid = nn.Sigmoid()
    # Hidden units -> one score per output.
    self.fc2 = nn.Linear(hidden_dim, output_dim)

  def forward(self, x):
    """Map a batch `x` of input vectors to a batch of output scores."""
    hidden = self.sigmoid(self.fc1(x))
    return self.fc2(hidden)

In [None]:
# output_dim is the number of classes
model = FeedforwardNeuralNetModel(input_dim, hidden_dim = 750, output_dim = 3)
# Move the parameters to the selected device.
model = model.to(device)

In [None]:
# Cross-entropy loss, applied in the training loop to the model outputs and
# the integer class labels of each batch.
criterion = nn.CrossEntropyLoss()

In [None]:
# Adam over all model parameters with a fixed step size.
learning_rate = 0.007
optimizer = torch.optim.Adam(params = model.parameters(), lr = learning_rate)

In [None]:
# Reshuffle the training samples every epoch so the optimizer does not see the
# same batches in the same order on every pass. Evaluation only counts correct
# predictions, so the order of the test loader does not matter.
train_loader = DataLoader(train_ds, batch_size = batch_size, shuffle = True)
test_loader = DataLoader(test_ds, batch_size = 4, shuffle = False)

In [None]:
# Accuracy threshold in percent. The checkpoint-saving code in the training
# loop (currently disabled there) compares the per-epoch training accuracy
# against this value and raises it whenever a checkpoint is written.
expected_accuracy = 93  # Change this to max. attained accuracy (IMPORTANT)

In [None]:
# File that the checkpoint-saving code in the training loop (currently
# disabled there) writes the model's state_dict to.
path = '/content/drive/MyDrive/Final Year Project/Saved Models/model2'

In [None]:
def _batch_to_device(batch):
  """Unpack a TweetDataset batch into (features, labels) tensors on `device`.

  Features are reshaped to (-1, input_dim) and cast to float32; labels are
  cast to int64, the target type used with the cross-entropy criterion.
  """
  tweets = batch['Tweet'].view(-1, input_dim).float().to(device)
  labels = batch['Label'].long().to(device)
  return tweets, labels


def _accuracy(loader):
  """Return the percentage of samples in `loader` that `model` classifies correctly.

  Runs in evaluation mode and without gradient tracking, so no autograd
  graph is built for these forward passes.
  """
  correct = 0
  total = 0
  model.eval()
  with torch.no_grad():
    for batch in loader:
      tweets, labels = _batch_to_device(batch)
      outputs = model(tweets)
      _, predicted = torch.max(outputs, 1)
      total += labels.size(0)
      # .item() keeps the running count a plain Python int.
      correct += (predicted == labels).sum().item()
  return 100 * correct / total


# Checkpointing is off by default, matching the previously disabled code.
# Set to True to save the weights whenever the training accuracy exceeds
# `expected_accuracy`.
save_best_checkpoint = False

for epoch in range(num_epochs):
  model.train()
  for batch in train_loader:
    # The inputs are not optimised, so they do not need requires_grad.
    tweets, labels = _batch_to_device(batch)
    optimizer.zero_grad()
    outputs = model(tweets)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()

  # Training-set accuracy after every epoch.
  accuracy = _accuracy(train_loader)
  print(f'Epoch: {epoch}, Accuracy: {accuracy: .2f}')
  if save_best_checkpoint and accuracy > expected_accuracy:
    expected_accuracy = accuracy
    torch.save(model.state_dict(), path)
    

Epoch: 0, Accuracy:  94.23
Epoch: 1, Accuracy:  94.32
Epoch: 2, Accuracy:  95.07
Epoch: 3, Accuracy:  96.73
Epoch: 4, Accuracy:  97.51


Test Accuracy

In [None]:
correct = total = 0
# Checking accuracy on test set. Evaluation mode plus no_grad: these forward
# passes are for inference only, so no autograd graph is needed.
model.eval()
with torch.no_grad():
  for batch in test_loader:
    tweets = batch['Tweet'].view(-1, input_dim).float().to(device)
    labels = batch['Label'].long().to(device)
    outputs = model(tweets)
    _, predicted = torch.max(outputs, 1)
    total += labels.size(0)
    # .item() keeps the count a Python int, so the percentage below is an
    # ordinary float rather than a float32 tensor.
    correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f'Test Accuracy: {accuracy}')

Test Accuracy: 86.09343719482422


model1 train accuracy: 85.99% &nbsp; test accuracy: 87.57%
<br>
model2 train accuracy: 97.51% &nbsp; test accuracy: 86.09%