<a href="https://colab.research.google.com/github/hotbread213/createClass/blob/master/sentiment_analysis_rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introductions to embedding with sentiment analysis

## Getting Started

### Dataset and task

- The [Twitter sentiment analysis](https://www.kaggle.com/c/twitter-sentiment-analysis2/overview) is an open source dataset available on Kaggle. It contains 100000 twits labeled as either negative (0) or positive (1). 

- The task consist in writing a model that takes a twits as input and output 1 if the sentiment is positive or 0 if the sentiment is negative.

### Import required libraries

In [0]:
import time
import random
import os
import pandas
import numpy
import zipfile

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torchtext.data import Field, TabularDataset, Iterator

from google_drive_downloader import GoogleDriveDownloader

import spacy

spacy_en = spacy.load('en')

### Define some constants

In [0]:
class Constants:
    
    DATA_FILE_ID = '1wrfQmCShiTmbIsr7LpZhEiYw7dhuaOhk'                     # Google drive id to be able to download from drive
    
    SEED = 1                                                               # random seed for reproductability
    
    DATA_DIR = 'data/twitter/'                                                     # path to the csv data
    DATA_ZIP_FILE = f'{DATA_DIR}data.zip'                                        # path where to dowload the zipped data
    DATA_PATH = '{}data.csv'.format(DATA_DIR)                # path to the news data
    TRAIN_PATH = '{}train.csv'.format(DATA_DIR)
    VALID_PATH = '{}valid.csv'.format(DATA_DIR)
    TEST_PATH = '{}test.csv'.format(DATA_DIR)
    
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # set device to GPU if availale

constants = Constants

### Fix random seed for reproductability

In [0]:
numpy.random.seed(constants.SEED)
random.seed(constants.SEED)
torch.manual_seed(constants.SEED)
torch.backends.cudnn.deterministic = True

### Download the data on your local server

In [0]:
GoogleDriveDownloader.download_file_from_google_drive(file_id=constants.DATA_FILE_ID, dest_path=constants.DATA_ZIP_FILE, unzip=False)

zip_ref = zipfile.ZipFile(constants.DATA_ZIP_FILE, 'r')
zip_ref.extractall(constants.DATA_DIR)
zip_ref.close()

os.rename(f'{constants.DATA_DIR}train.csv', f'{constants.DATA_DIR}data.csv')
!rm data/twitter/test.csv

!ls data/twitter

data.csv  data.zip  valid.csv


### Visualize the data with `pandas.DataFrame`

In [0]:
data = pandas.read_csv(constants.DATA_PATH, encoding="ISO-8859-1") # weird encoding: https://stackoverflow.com/questions/18171739/unicodedecodeerror-when-reading-csv-file-in-pandas-with-python
data.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


## Methodology

- Validate the data (number of examples, number of features, label distribution, number of `nan`, etc)
- Choose a good metric that you will use for deciding the best model
- Split the data into train/valid/test
- Implement the simplest classifier and evaluate the performance on the train and the validation set
- Data exploration + model exploration (e.g. small litterature review)
- Base on data exploration and litterature, decide on a set of model to test with range of architecture (this includes preprocessing)
- Select hyperparameters based on the performance on the validation set
- Test your model on the test set and decide if it's good enough for production; else you need a new test set

### Dataset validation

In [0]:
N_OBS = len(data)

assert N_OBS == 99989

N_POSITIVE_LABEL = len(data[data.Sentiment == 1])
N_NEGATIVE_LABEL = len(data[data.Sentiment == 0])

assert N_POSITIVE_LABEL == 56457
assert N_NEGATIVE_LABEL == 43532
assert N_POSITIVE_LABEL + N_NEGATIVE_LABEL == N_OBS

assert len(data.dropna()) == N_OBS  # Make sure there is no nan

### Split the data into a train and a validation set and print some informations (split percentage, class distribution)

In [0]:
TRAIN_SIZE = round(0.7 * N_OBS)
VALID_SIZE = round(0.15 * N_OBS) + 1
TEST_SIZE = round(0.15 * N_OBS)
assert TRAIN_SIZE + VALID_SIZE + TEST_SIZE == N_OBS, f'{TRAIN_SIZE + VALID_SIZE + TEST_SIZE} != {N_OBS}'

In [0]:
# shuffle the indices
examples = set(range(N_OBS))
train_indices = set(random.sample(examples, TRAIN_SIZE))
examples = set(i for i in examples if i not in train_indices)
valid_indices = set(random.sample(examples, VALID_SIZE))
test_indices = [i for i in examples if i not in valid_indices]

In [0]:
# Split the data
train_df = data.iloc[list(train_indices)]
valid_df = data.iloc[list(valid_indices)]
test_df = data.iloc[list(test_indices)]

In [0]:
n_train = len(train_df)
n_train_positive = len(train_df[train_df.Sentiment == 1])

n_valid = len(valid_df)
n_valid_positive = len(valid_df[valid_df.Sentiment == 1])

n_test = len(test_df)
n_test_positive = len(test_df[test_df.Sentiment == 1])

print('# train example: {} ({:.2f} %) | positive: {:.2f} % | negative: {:.2f} %'.format(n_train, n_train / N_OBS * 100, n_train_positive / n_train * 100, 100 - n_train_positive / n_train * 100))
print('# valid example: {} ({:.2f} %) | positive: {:.2f} % | negative: {:.2f} %'.format(n_valid, n_valid / N_OBS * 100, n_valid_positive / n_valid * 100, 100 - n_valid_positive / n_valid * 100))
print('# test example: {} ({:.2f} %) | positive: {:.2f} % | negative: {:.2f} %'.format(n_test, n_test / N_OBS * 100, n_test_positive / n_test * 100, 100 - n_test_positive / n_test * 100))

# train example: 69992 (70.00 %) | positive: 56.50 % | negative: 43.50 %
# valid example: 14999 (15.00 %) | positive: 56.06 % | negative: 43.94 %
# test example: 14998 (15.00 %) | positive: 56.69 % | negative: 43.31 %


In [0]:
train_df.to_csv(constants.TRAIN_PATH, encoding='utf-8')
valid_df.to_csv(constants.VALID_PATH, encoding='utf-8')
test_df.to_csv(constants.TEST_PATH, encoding='utf-8')

!ls data/twitter

data.csv  data.zip  test.csv  train.csv  valid.csv


## Representing words with one-hot encoding

In [0]:
train_inputs = train_df.SentimentText
train_labels = train_df.Sentiment

valid_inputs = valid_df.SentimentText
valid_labels = valid_df.Sentiment

#### Vectorizing the features with `CountVecoctorizer` [[docs]](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [0]:
vectorizer = CountVectorizer()
vectorizer.fit(train_inputs)
train_bow = vectorizer.transform(train_inputs)
valid_bow = vectorizer.transform(valid_inputs)

#### Classifying twits with logistic regression [[docs]](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [0]:
# Initialize the classifier. `lbfgs` is the default optimizer. 
# Set `max_iter` to 1000 to avoid annoying convergence warning
lr = LogisticRegression(solver='lbfgs', max_iter=1000)

In [0]:
# optimize the parameters of the classifier
lr = lr.fit(train_bow, train_labels)

In [0]:
# Evaluate the accuracy of our baseline model
train_predictions = lr.predict(train_bow)
valid_predictions = lr.predict(valid_bow)

print('Train accuracy: {:.2f} %'.format(accuracy_score(train_predictions, train_labels) * 100))
print('Valid accuracy: {:.2f} %'.format(accuracy_score(valid_predictions, valid_labels) * 100))

Train accuracy: 90.63 %
Valid accuracy: 76.64 %


### Data exploration + model exploration (e.g. small litterature review)

- Things to consider in your research
 - The task: "sentiment classification" < "text classification" < "classification"
 - Preprocessing and feature representation
 - ...

- Where to look:
 - [Google scholar](https://scholar.google.ca/schhp?hl=en&as_sdt=0,5)
 - Forums and blogs (e.g. Reddit, Medium)
 - [NLP progress](http://nlpprogress.com/)
 - ...

## Representing words with *embeddings*

### Tokenization

In [0]:
STOPWORDS = ['a', 'an', 'the', 'and', 'or', 'to', 'it', 'for', 'is']

def tokenizer(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    tokens = [tok.text for tok in spacy_en.tokenizer(text) if tok.text not in STOPWORDS]
    
    return tokens

#### Loading and preprocessing `csv` file with `torchtext.data.TabularDataset`

In [0]:
input_field = Field(sequential=True, tokenize=tokenizer, pad_token='<pad>', unk_token='<unk>', lower=True, batch_first=True)
label_field = Field(sequential=False, use_vocab=False, is_target=True, unk_token=None, batch_first=True, dtype=torch.float32)

fields = {
    'SentimentText': ('input', input_field),
    'Sentiment': ('label', label_field)
}

train_data = TabularDataset(path=constants.TRAIN_PATH, format='csv', fields=fields)
valid_data = TabularDataset(path=constants.VALID_PATH, format='csv', fields=fields)

In [0]:
print(train_df.SentimentText.iloc[4])
print(vars(train_data.examples[4]))

       Sunny Again        Work Tomorrow  :-|       TV Tonight
{'input': ['       ', 'sunny', 'again', '       ', 'work', 'tomorrow', ' ', ':-|', '      ', 'tv', 'tonight'], 'label': '0'}


#### `Field.build_vocab`

- `min_freq`: The minimum frequency needed to include a token in the vocabulary. Values less than 1 will be set to 1. Default: 1.

In [0]:
input_field.build_vocab(train_data, min_freq=5)
print(dict(input_field.vocab.stoi))



#### Testing the `Iterator`

In [0]:
train_iterator = Iterator(train_data, batch_size=32)
train_iterator = iter(train_iterator)

batch = next(train_iterator)

batch_input = batch.input

print(batch_input.shape)

print(batch_input)

torch.Size([32, 34])
tensor([[  28,  184,    0,  ...,    1,    1,    1],
        [8671,    2,   79,  ...,    1,    1,    1],
        [2237,  204,    6,  ...,    1,    1,    1],
        ...,
        [   0,    2,    0,  ...,    1,    1,    1],
        [   0,   97,   22,  ...,    1,    1,    1],
        [   0,   85,    2,  ...,    1,    1,    1]])


### Building the RNN classifier

In [0]:
class SequenceClassifier(nn.Module):
    def __init__(self, input_dim, emb_dim, pretrained_emb, hidden_dim, num_layers, bidirectional, dropout, device):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_direction = 2 if bidirectional else 1
        self.device = device
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        if pretrained_emb is not None:
            self.embedding.weight.data.copy_(pretrained_emb)
            self.embedding.weight.requires_grad = False # make embedding non trainable
        
        self.rnn = nn.LSTM(emb_dim, hidden_dim, num_layers, bidirectional=bidirectional, dropout=dropout, batch_first=True)
        
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim * 2, 1),
            nn.Sigmoid()
        )
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, batch_input):
        """
        :param batch_input: batch of shape (`batch_size`, `sentence_lenght`)
        """
        batch_size = batch_input.shape[0]
        sentence_lenght = batch_input.shape[1]
        
        embedded = self.dropout(self.embedding(batch_input))  # `embedded `shape = (sentence_lenght, batch_size, embedding_dim)
        
        assert embedded.size() == torch.Size([batch_size, sentence_lenght, self.emb_dim]), \
            f'{embedded.size()} != {torch.Size([batch_size, sentence_lenght, self.emb_dim])}'

        outputs, (hidden, cell) = self.rnn(embedded)  # `outputs` shape = (sentence_lenght, batch_size, hidden_dim)
                                                      # `hidden` shape = (n_layers, batch_size, hidden_dim)
                                                      # `cell` shape = (n_layers, batch_size, hidden_dim)
                
        assert hidden.size() == torch.Size([self.num_layers * self.num_direction, batch_size, self.hidden_dim]), \
            f'{hidden.size()} != {torch.Size([self.num_layers * self.num_direction, batch_size, self.hidden_dim])}'
                
        code = torch.cat([hidden[-1], hidden[-2]], 1)
        
        assert code.size() == torch.Size([batch_size, 2 * self.hidden_dim]), f'{code.size()} != {torch.Size([batch_size, 2 * self.hidden_dim])}'
        
        code = self.dropout(code)
        
        outputs = self.classifier(code).squeeze()
        
        assert outputs.size() == torch.Size([batch_size]), f"{outputs.size()} != {torch.Size([batch_size])}"
        
        return outputs

In [0]:
def evaluate(model, iterator, criterion, device):
    
    model.eval()
    
    targets, predictions = [], []
    epoch_loss = 0
    
    with torch.no_grad():
    
        for batch in iterator:

            batch_input = batch.input.to(device)
            batch_label = batch.label.to(device)

            batch_proba = model(batch_input)
            assert batch_label.shape == batch_proba.shape, f'{batch_label.shape} != {output.shape}'
            
            # compute and store batch predictions
            batch_prediction = batch_proba.cpu().numpy()
            batch_prediction[batch_prediction < 0.5] = 0
            batch_prediction[batch_prediction > 0.5] = 1
            batch_prediction[batch_prediction == 0.5] = random.randint(0, 1)
            
            predictions.extend([y for y in batch_prediction])
            targets.extend([y for y in batch_label.cpu().numpy()])
            
            loss = criterion(batch_proba, batch_label.float())
            epoch_loss += loss.item()
    
    epoch_loss = epoch_loss / len(iterator)
    epoch_acc = accuracy_score(targets, predictions)
    
    return epoch_loss, epoch_acc

In [0]:
def train_iteration(model, iterator, optimizer, criterion, device):
    
    model.train()
    
    for i, batch in enumerate(iterator):
        
        optimizer.zero_grad()
        
        batch_input = batch.input.to(device)
        batch_label = batch.label.to(device)
        
        output = model(batch_input)
        
        assert batch_label.shape == output.shape, f'{batch_label.shape} != {output.shape}'
        
        loss = criterion(output, batch_label)
        
        loss.backward()
        
        optimizer.step()

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)

In [0]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [0]:
input_field = Field(sequential=True, tokenize=tokenizer, pad_token='<pad>', unk_token='<unk>', lower=True, batch_first=True)
label_field = Field(sequential=False, use_vocab=False, is_target=True, unk_token=None, batch_first=True, dtype=torch.float32)

fields = {'SentimentText': ('input', input_field), 'Sentiment': ('label', label_field)}

train_data = TabularDataset(path=constants.TRAIN_PATH, format='csv', fields=fields)
valid_data = TabularDataset(path=constants.VALID_PATH, format='csv', fields=fields)
test_data = TabularDataset(path=constants.TEST_PATH, format='csv', fields=fields)

input_field.build_vocab(train_data, min_freq=5, vectors="glove.6B.100d")

In [0]:
INPUT_DIM = len(input_field.vocab)
EMB_DIM = 100
HID_DIM = 256
NUM_LAYERS = 2
ENC_DROPOUT = 0.5
N_EPOCHS = 10
BATCH_SIZE = 32
BIDIRECTIONAL = True
pretrained_embeddings = input_field.vocab.vectors

model = SequenceClassifier(input_dim=INPUT_DIM, 
                           emb_dim=EMB_DIM, 
                           pretrained_emb=pretrained_embeddings, 
                           hidden_dim=HID_DIM, 
                           num_layers=NUM_LAYERS, 
                           bidirectional=BIDIRECTIONAL,
                           dropout=ENC_DROPOUT, device=constants.DEVICE)
model.to(constants.DEVICE)
model.apply(init_weights)


optimizer = optim.Adam(model.parameters(), lr=0.0005)
criterion = nn.BCELoss()

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,310,657 trainable parameters


In [0]:
train_iterator = Iterator(train_data, batch_size=BATCH_SIZE, device=constants.DEVICE)
valid_iterator = Iterator(valid_data, batch_size=512, device=constants.DEVICE)
test_iterator = Iterator(test_data, batch_size=512)

In [0]:
best_valid_loss = float('Inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_iteration(model, train_iterator, optimizer, criterion, constants.DEVICE)
    
    if (epoch + 1) % 1 == 0:  
        train_loss, train_acc = evaluate(model, train_iterator, criterion, constants.DEVICE)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, constants.DEVICE)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'tut1-model.pt')

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s | Train Loss: {train_loss:.3f} | Train Acc.: {train_acc:.2f} | Val. Loss: {valid_loss:.3f} |  Val. Acc.: {valid_acc:.2f}')

Epoch: 01 | Time: 0m 35s | Train Loss: 0.627 | Train Acc.: 0.64 | Val. Loss: 0.630 |  Val. Acc.: 0.64
Epoch: 02 | Time: 0m 34s | Train Loss: 0.612 | Train Acc.: 0.66 | Val. Loss: 0.615 |  Val. Acc.: 0.66
Epoch: 03 | Time: 0m 35s | Train Loss: 0.602 | Train Acc.: 0.68 | Val. Loss: 0.604 |  Val. Acc.: 0.67
Epoch: 04 | Time: 0m 35s | Train Loss: 0.595 | Train Acc.: 0.67 | Val. Loss: 0.598 |  Val. Acc.: 0.67
Epoch: 05 | Time: 0m 35s | Train Loss: 0.578 | Train Acc.: 0.69 | Val. Loss: 0.584 |  Val. Acc.: 0.69
Epoch: 06 | Time: 0m 35s | Train Loss: 0.554 | Train Acc.: 0.71 | Val. Loss: 0.559 |  Val. Acc.: 0.70
Epoch: 07 | Time: 0m 35s | Train Loss: 0.550 | Train Acc.: 0.72 | Val. Loss: 0.556 |  Val. Acc.: 0.71
Epoch: 08 | Time: 0m 35s | Train Loss: 0.550 | Train Acc.: 0.71 | Val. Loss: 0.555 |  Val. Acc.: 0.71
Epoch: 09 | Time: 0m 35s | Train Loss: 0.542 | Train Acc.: 0.71 | Val. Loss: 0.555 |  Val. Acc.: 0.70
Epoch: 10 | Time: 0m 35s | Train Loss: 0.531 | Train Acc.: 0.73 | Val. Loss: 0.544

In [0]:
# DO NOT RUN THIS UNTIL YOU ARE SURE ABOUT YOUR HYPERPARAMETERS; THERE IS NO GOING BACK ;)
# model.load_state_dict(torch.load('tut1-model.pt'))
# evaluate(model, test_iterator, criterion, constants.DEVICE)