
# Load Data

In [1]:
import pandas as pd
import numpy as np

from torch.autograd import Variable

In [2]:
fname = 'facebook_comments.csv'
df_train = pd.read_csv(fname, header = None, names = ['text', 'sentiment'], encoding='iso-8859-1',lineterminator='\n')
sent = {'positive':2, 'neutral':1,'negative':0}
df_train['labels'] = df_train['sentiment'].str.strip().map(sent)

In [3]:
training_texts = df_train.text.values #convert to numpy array/vector
labels = df_train.labels.values #convert to numpy array/vector

print(type(training_texts),type(labels))



<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [4]:
df_train.head()

Unnamed: 0,text,sentiment,labels
0,Heres a single to add to Kindle. Just read t...,neutral,1
1,If you tire of Non-Fiction.. Check out http://...,neutral,1
2,Ghost of Round Island is supposedly nonfiction.,neutral,1
3,Why is Barnes and Nobles version of the Kindle...,negative,0
4,@Maria: Do you mean the Nook? Be careful bo...,positive,2


# Pre process data


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer 


In [6]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=500, ngram_range=(1,2))
instances = vectorizer.fit_transform(training_texts) #using TFidf object to tranform the sparse matrix shape
X = instances.toarray() # convert the sparse matrix to numpy array
Y = labels
print(X.shape,',',Y.shape) 

(1999, 500) , (1999,)


#Traditional Machine Learning Models: Random Forest

In [7]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

kfold = KFold(n_splits = 10, shuffle = True, random_state = 2020)
rf_model = RandomForestClassifier(criterion = 'entropy', max_depth = 2, random_state = 2020)
rf_cvscores = []

for train_idx, val_idx in kfold.split(X):
  rf_model.fit(X[train_idx],Y[train_idx]) #train model
  acc= rf_model.score(X[val_idx],Y[val_idx]) #get acc from validation set
  rf_cvscores.append(acc)

print("Random Forest - mean: %.4f%% (std: +/- %.4f%%))" %(np.mean(rf_cvscores)*100, np.std(rf_cvscores)*100))


Random Forest - mean: 64.1332% (std: +/- 2.0919%))


#Fully Connected Feedforward Network

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

import torch.optim as optim

In [9]:
epochs = 20
lr = 1e-3
indim = X.shape[1] #500
outdim = 3 #3 categories - positive,negative and neutral
drate = 0.5
batch_size = 20

X_tensor = torch.from_numpy(X) #convert tensor
Y_tensor = torch.from_numpy(Y) #convert tensor

dataset = TensorDataset(X_tensor,Y_tensor) #convert to dataset of text and labels 
train_size = int(0.8*len(dataset))
val_size = len(dataset)-train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset,[train_size,val_size])



train_loader = DataLoader(train_dataset, batch_size = batch_size ,shuffle = True)
val_loader = DataLoader(val_dataset, batch_size = batch_size, shuffle = True)

# print(len(train_loader), len(train_loader.dataset), (train_loader.batch_size))
# print(len(val_loader), len(val_loader.dataset), (val_loader.batch_size))


Build Network

In [10]:
#create model network
#input dim - 500
class SentimentNetwork(nn.Module):
  def __init__(self, input_dim, output_dim, dropout_rate):
    super(SentimentNetwork, self).__init__()
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.dropout_rate = dropout_rate
    
    self.fc1 = nn.Linear(input_dim, 336) #https://www.heatonresearch.com/2017/06/01/hidden-layers.html#:~:text=The%20number%20of%20hidden%20neurons,size%20of%20the%20input%20layer.
    self.do1 = nn.Dropout(dropout_rate)
    self.fc2 = nn.Linear(336, 336)
    self.do2 = nn.Dropout(dropout_rate)
    self.fc3 = nn.Linear(336, output_dim)


  def forward(self,x):

     x = F.relu(self.fc1(x))
     x = F.dropout(self.do1(x))
     x = F.relu(self.fc2(x))
     x = F.dropout(self.do2(x))
     x = F.log_softmax(self.fc3(x))
     return x

#create model
model = SentimentNetwork(indim, outdim, drate)
print(model)
  


SentimentNetwork(
  (fc1): Linear(in_features=500, out_features=336, bias=True)
  (do1): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=336, out_features=336, bias=True)
  (do2): Dropout(p=0.5, inplace=False)
  (fc3): Linear(in_features=336, out_features=3, bias=True)
)


Create a training and evaluation function

In [11]:
def train(model, train_loader, optimizer, criterion):
  epoch_loss, epoch_acc = 0.0,0.0 #loss and accuracy for each fn
  model.train()

  for batch_x, batch_y in train_loader:
    batch_x, batch_y = Variable(batch_x),Variable(batch_y)

    #print(batch_x.ndimension())
    #zero gradient
    optimizer.zero_grad()
    #prediction = calculate the predicted output for current batch batch_x
    prediction = model(batch_x.float())
    #loss = calculate the loss for the current batch using predictions and batch_y
    loss = criterion(prediction, batch_y)

    #convert torch var to numpy: predictions.detach().numpy( )
    prediction_numpy = prediction.detach().numpy()

    #acc = calculate he accuracy using predictions(batch_size X Output_dim) and batch_y (batch_size X 1)
    prediction = prediction.data.max(1)[1]
    correct = prediction.eq(batch_y.data).sum().item()
    accuracy = correct / (train_loader.batch_size)

    #backpropogate
    loss.backward()
    optimizer.step()

    epoch_loss += loss.item()
    epoch_acc += accuracy
    #calculate avg epoch loss and epoch accuracy
    
  epoch_loss /= len(train_loader)
  epoch_acc /= len(train_loader)
  return epoch_loss, epoch_acc



def evaluate(model, val_loader, criterion):
  epoch_loss, epoch_acc = 0.0,0.0 #loss and accuracy for each fn
  model.eval()
  with torch.no_grad():
    for batch_x, batch_y in val_loader:

      batch_x, batch_y = Variable(batch_x),Variable(batch_y)

      #prediction = calculate the predicted output for current batch batch_x
      prediction = model(batch_x.float())

      #loss = calculate the loss for the current batch using predictions and batch_y
      loss = criterion(prediction, batch_y)

      #convert torch var to numpy: predictions.detach().numpy()
      prediction_numpy = prediction.detach().numpy()

      #acc = calculate he accuracy using predictions(batch_size X Output_dim) and batch_y (batch_size X 1)
      prediction = prediction.data.max(1)[1]
      correct = prediction.eq(batch_y.data).sum().item()
      accuracy = correct / (val_loader.batch_size)

      # print(correct, accuracy, len(val_loader.dataset), len(val_loader), correct / len(val_loader.dataset))
      
      epoch_loss += loss.item()
      epoch_acc += accuracy


    #calculate avg epoch loss and epoch accuracy
    epoch_loss /= len(val_loader)
    epoch_acc /= len(val_loader)
    
  return epoch_loss, epoch_acc



Train and Evaluate Model

In [12]:

#loss function and optimizer

 
optimizer = torch.optim.Adam(model.parameters(), lr)
criterion = nn.CrossEntropyLoss()

#training and evaluation
for epoch in range(epochs):
  train_loss, train_acc = train(model, train_loader, optimizer, criterion)
  valid_loss, valid_acc = evaluate(model, val_loader, criterion)

  print(f'Epoch:{epoch + 1:02}')
  print(f'\tTrain Loss:{train_loss:.4f}| Train Acc: {train_acc:.4f}')
  print(f'\t Val. Loss:{valid_loss:.4f}| Val. Acc: {valid_acc:.4f}')




Epoch:01
	Train Loss:0.8830| Train Acc: 0.6325
	 Val. Loss:0.7448| Val. Acc: 0.6575
Epoch:02
	Train Loss:0.6790| Train Acc: 0.6519
	 Val. Loss:0.5994| Val. Acc: 0.7550
Epoch:03
	Train Loss:0.5003| Train Acc: 0.8162
	 Val. Loss:0.4912| Val. Acc: 0.8175
Epoch:04
	Train Loss:0.3966| Train Acc: 0.8456
	 Val. Loss:0.4571| Val. Acc: 0.8300
Epoch:05
	Train Loss:0.3446| Train Acc: 0.8656
	 Val. Loss:0.4060| Val. Acc: 0.8350
Epoch:06
	Train Loss:0.2763| Train Acc: 0.8881
	 Val. Loss:0.3781| Val. Acc: 0.8575
Epoch:07
	Train Loss:0.2359| Train Acc: 0.9044
	 Val. Loss:0.3470| Val. Acc: 0.8800
Epoch:08
	Train Loss:0.2003| Train Acc: 0.9288
	 Val. Loss:0.3104| Val. Acc: 0.9050
Epoch:09
	Train Loss:0.1661| Train Acc: 0.9494
	 Val. Loss:0.2622| Val. Acc: 0.9300
Epoch:10
	Train Loss:0.1474| Train Acc: 0.9575
	 Val. Loss:0.2538| Val. Acc: 0.9375
Epoch:11
	Train Loss:0.1339| Train Acc: 0.9656
	 Val. Loss:0.2476| Val. Acc: 0.9350
Epoch:12
	Train Loss:0.1054| Train Acc: 0.9706
	 Val. Loss:0.2418| Val. Acc: