<a href="https://colab.research.google.com/github/hotbread213/createClass/blob/master/embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introductions to embedding

## What does embedding means?

Projecting an input into a better representation space. For example, consider a linear regression:

Without feature mapping:

\begin{equation}
    f(\mathbf{x}) = \mathbf{w}^\top \mathbf{x}+ b
\end{equation}

With feature mapping:

\begin{equation}
    f(\mathbf{x}) = \tilde{\mathbf{w}}^\top \phi(\mathbf{x})+ b
\end{equation}


## A simple example: One-hot Encoding

- One-hot encoding is an example of deterministic embedding

<img src="https://drive.google.com/uc?export=view&id=1Sj1FSqc6JsVcg5zZGMj1bpE1C02f4uy8">

## Dataset and task for the tutorial

- The [Twitter sentiment analysis](https://www.kaggle.com/c/twitter-sentiment-analysis2/overview) is an open source dataset available on Kaggle. It contains 100000 twits labeled as either negative (0) or positive (1). 

- The task consist in writing a model that takes a twits as input and output 1 if the sentiment is positive or 0 if the sentiment is negative.

### Import required libraries

In [0]:
import time
import random
import os
import pandas
import numpy
import zipfile

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torchtext.data import Field, TabularDataset, Iterator

from google_drive_downloader import GoogleDriveDownloader

import spacy

spacy_en = spacy.load('en')

### Define some constants

In [0]:
class Constants:
    
    DATA_FILE_ID = '1wrfQmCShiTmbIsr7LpZhEiYw7dhuaOhk'                     # Google drive id to be able to download from drive
    
    SEED = 1                                                               # random seed for reproductability
    
    DATA_DIR = 'data/twitter/'                                             # path to the csv data
    DATA_ZIP_FILE = f'{DATA_DIR}data.zip'                                  # path where to dowload the zipped data
    DATA_PATH = '{}data.csv'.format(DATA_DIR)                              # path to the news data
    TRAIN_PATH = '{}train.csv'.format(DATA_DIR)
    VALID_PATH = '{}valid.csv'.format(DATA_DIR)
    TEST_PATH = '{}test.csv'.format(DATA_DIR)
    
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # set device to GPU if availale

constants = Constants

### Fix random seed for reproductability

In [0]:
numpy.random.seed(constants.SEED)
random.seed(constants.SEED)
torch.manual_seed(constants.SEED)
torch.backends.cudnn.deterministic = True

### Download the data on your local server

In [0]:
GoogleDriveDownloader.download_file_from_google_drive(file_id=constants.DATA_FILE_ID, dest_path=constants.DATA_ZIP_FILE, unzip=False)

zip_ref = zipfile.ZipFile(constants.DATA_ZIP_FILE, 'r')
zip_ref.extractall(constants.DATA_DIR)
zip_ref.close()

os.rename(f'{constants.DATA_DIR}train.csv', f'{constants.DATA_DIR}data.csv')
!rm data/twitter/test.csv

!ls data/twitter

Downloading 1wrfQmCShiTmbIsr7LpZhEiYw7dhuaOhk into data/twitter/data.zip... Done.
data.csv  data.zip


### Visualize the data with `pandas.DataFrame`

In [0]:
data = pandas.read_csv(constants.DATA_PATH, encoding="ISO-8859-1") # weird encoding: https://stackoverflow.com/questions/18171739/unicodedecodeerror-when-reading-csv-file-in-pandas-with-python
data.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


### Dataset validation

In [0]:
N_OBS = len(data)

assert N_OBS == 99989

N_POSITIVE_LABEL = len(data[data.Sentiment == 1])
N_NEGATIVE_LABEL = len(data[data.Sentiment == 0])

assert N_POSITIVE_LABEL == 56457
assert N_NEGATIVE_LABEL == 43532
assert N_POSITIVE_LABEL + N_NEGATIVE_LABEL == N_OBS

assert len(data.dropna()) == N_OBS  # Make sure there is no nan

### Split the data into a train and a validation set and print some informations (split percentage, class distribution)

In [0]:
TRAIN_SIZE = round(0.6 * N_OBS)
VALID_SIZE = round(0.2 * N_OBS)
TEST_SIZE = round(0.2 * N_OBS)
assert TRAIN_SIZE + VALID_SIZE + TEST_SIZE == N_OBS

In [0]:
# shuffle the indices
examples = set(range(N_OBS))
train_indices = set(random.sample(examples, TRAIN_SIZE))
examples = set(i for i in examples if i not in train_indices)
valid_indices = set(random.sample(examples, VALID_SIZE))
test_indices = [i for i in examples if i not in valid_indices]

In [0]:
# Split the data
train_df = data.iloc[list(train_indices)]
valid_df = data.iloc[list(valid_indices)]
test_df = data.iloc[list(test_indices)]

In [0]:
n_train = len(train_df)
n_train_positive = len(train_df[train_df.Sentiment == 1])

n_valid = len(valid_df)
n_valid_positive = len(valid_df[valid_df.Sentiment == 1])

n_test = len(test_df)
n_test_positive = len(test_df[test_df.Sentiment == 1])

print('# train example: {} ({:.2f} %) | positive: {:.2f} % | negative: {:.2f} %'.format(n_train, n_train / N_OBS * 100, n_train_positive / n_train * 100, 100 - n_train_positive / n_train * 100))
print('# valid example: {} ({:.2f} %) | positive: {:.2f} % | negative: {:.2f} %'.format(n_valid, n_valid / N_OBS * 100, n_valid_positive / n_valid * 100, 100 - n_valid_positive / n_valid * 100))
print('# test example: {} ({:.2f} %) | positive: {:.2f} % | negative: {:.2f} %'.format(n_test, n_test / N_OBS * 100, n_test_positive / n_test * 100, 100 - n_test_positive / n_test * 100))

# train example: 59993 (60.00 %) | positive: 56.53 % | negative: 43.47 %
# valid example: 19998 (20.00 %) | positive: 56.35 % | negative: 43.65 %
# test example: 19998 (20.00 %) | positive: 56.37 % | negative: 43.63 %


In [0]:
train_df.to_csv(constants.TRAIN_PATH, encoding='utf-8')
valid_df.to_csv(constants.VALID_PATH, encoding='utf-8')
test_df.to_csv(constants.TEST_PATH, encoding='utf-8')

!ls data/twitter

data.csv  data.zip  test.csv  train.csv  valid.csv


## Representing words with one-hot encoding

In [0]:
train_inputs = train_df.SentimentText
train_labels = train_df.Sentiment

valid_inputs = valid_df.SentimentText
valid_labels = valid_df.Sentiment

#### Vecorizing the features with `CountVecoctorizer` [[docs]](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

In [0]:
fake_twit = "GOT is kind of weird this year..."

fake_vectorizer = CountVectorizer()
fake_vectorizer = fake_vectorizer.fit([fake_twit])

print(fake_vectorizer.vocabulary_)

onehot_GOT = fake_vectorizer.transform(['GOT']).toarray()[0]
print(onehot_GOT)

onehot_year = fake_vectorizer.transform(['year']).toarray()[0]
print(onehot_year)

{'got': 0, 'is': 1, 'kind': 2, 'of': 3, 'weird': 5, 'this': 4, 'year': 6}
[1 0 0 0 0 0 0]
[0 0 0 0 0 0 1]


In [0]:
vectorizer = CountVectorizer()
vectorizer = vectorizer.fit(train_inputs)

In [0]:
x = train_inputs[0]
print(f'Twit: {x}')
bow = vectorizer.transform([x]).toarray()[0]
print(f'Sum of one-hot: {bow}')

Twit:                      is so sad for my APL friend.............
Sum of one-hot: [0 0 0 ... 0 0 0]


In [0]:
train_bow = vectorizer.transform(train_inputs)
valid_bow = vectorizer.transform(valid_inputs)

#### Classifying twits with logistic regression [[docs]](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

\begin{equation}
    y = (1+e^{-\mathbf{w}^\top - b})^{-1}
\end{equation}

In [0]:
# Initialize the classifier. `lbfgs` is the default optimizer. 
# Set `max_iter` to 1000 to avoid annoying convergence warning
lr = LogisticRegression(solver='lbfgs', max_iter=1000)

In [0]:
# optimize the parameters of the classifier
lr = lr.fit(train_bow, train_labels)

In [0]:
# Evaluate the accuracy of our baseline model
train_predictions = lr.predict(train_bow)
valid_predictions = lr.predict(valid_bow)

print('Train accuracy: {:.2f} %'.format(accuracy_score(train_predictions, train_labels) * 100))
print('Valid accuracy: {:.2f} %'.format(accuracy_score(valid_predictions, valid_labels) * 100))

Train accuracy: 90.91 %
Valid accuracy: 76.42 %


## Representing words with *embeddings*

### Tokenization

In [0]:
def tokenizer(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

# example
tokenizer(fake_twit)

['GOT', 'is', 'kind', 'of', 'weird', 'this', 'year', '...']

#### Loading and preprocessing `csv` file with `torchtext.data.TabularDataset`

In [0]:
# Defining `Field`s with `torchtext.data.Field`
input_field = Field(sequential=True, tokenize=tokenizer, lower=True, batch_first=True, fix_length=42)
label_field = Field(sequential=False, use_vocab=False, is_target=True, unk_token=None, batch_first=True, dtype=torch.float32)

fields = {
    'SentimentText': ('input', input_field),
    'Sentiment': ('label', label_field)
}

train_data = TabularDataset(path=constants.TRAIN_PATH, format='csv', fields=fields)
valid_data = TabularDataset(path=constants.VALID_PATH, format='csv', fields=fields)

In [0]:
print(train_df.SentimentText.iloc[4])
print(vars(train_data.examples[4]))

       Sunny Again        Work Tomorrow  :-|       TV Tonight
{'input': ['       ', 'sunny', 'again', '       ', 'work', 'tomorrow', ' ', ':-|', '      ', 'tv', 'tonight'], 'label': '0'}


#### `Field.build_vocab`

- `min_freq`: The minimum frequency needed to include a token in the vocabulary. Values less than 1 will be set to 1. Default: 1.

In [0]:
input_field.build_vocab(train_data, min_freq=5)

In [0]:
print(dict(input_field.vocab.stoi))



#### Testing the `Iterator` and vizualizing batch

In [0]:
train_iterator = Iterator(train_data, batch_size=32)
train_iterator = iter(train_iterator)

batch1 = next(train_iterator)
batch2 = next(train_iterator)

batch_input1 = batch1.input
batch_input2 = batch2.input

batch_label1 = batch1.label

print(batch_input1.shape)
print(batch_input2.shape)
print(batch_label1.shape)

print(batch_input1[0])
print(batch_input2[0])

torch.Size([32, 42])
torch.Size([32, 42])
torch.Size([32])
tensor([  0, 138,   7,   5,  95,   7,  98,  12,  41, 336,  18,  56, 185,  86,
         77,   3,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
          1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1])
tensor([   0,    2,   84, 2224,    9,  198,   14,    5,   75,   17,   81, 8291,
          54,   10,   10,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1])


## Dense embedding

In [0]:
tokens_twit = tokenizer(fake_twit)
fake_vocab = set(tokens_twit)

word2index = {w: i for i, w in enumerate(fake_vocab)}
print(word2index)

fake_twit_index = [word2index[w] for w in tokens_twit]

fake_twit_index = torch.LongTensor(fake_twit_index)

print(fake_twit_index)

embed = nn.Embedding(len(fake_vocab), 5)

embedding_matrix = embed.weight.data
print(embedding_matrix)

def get_embedding(word):
    index = word2index[word]
    tensor = torch.LongTensor([index])
    return embed(tensor)

get_embedding('GOT')

{'GOT': 0, '...': 1, 'kind': 2, 'this': 3, 'is': 4, 'of': 5, 'weird': 6, 'year': 7}
tensor([0, 4, 2, 5, 6, 3, 7, 1])
tensor([[-1.5256, -0.7502, -0.6540, -1.6095, -0.1002],
        [-0.6092, -0.9798, -1.6091, -0.7121,  0.3037],
        [-0.7773, -0.2515, -0.2223,  1.6871,  0.2284],
        [ 0.4676, -0.6970, -1.1608,  0.6995,  0.1991],
        [ 0.8657,  0.2444, -0.6629,  0.8073, -1.8821],
        [-0.7765,  2.0242, -0.0865,  0.0981, -1.2150],
        [ 0.7312,  1.1718,  2.4070,  0.2786,  0.2468],
        [ 1.1843, -0.7282,  1.1633, -0.0091, -0.8425]])


tensor([[-1.5256, -0.7502, -0.6540, -1.6095, -0.1002]],
       grad_fn=<EmbeddingBackward>)

### Building the logistic regression classifier

In [0]:
class LR(nn.Module):

    def __init__(self, num_embeddings, seq_len, embedding_dim):
        super(LR, self).__init__()
        self.embedding_dim = embedding_dim
        self.embed = nn.Embedding(num_embeddings, embedding_dim)
        
        self.classifier = nn.Sequential(
            nn.Linear(seq_len * embedding_dim, 1),
            nn.Sigmoid()
        )

    def forward(self, inputs):
        batch_size, seq_len = inputs.shape
        embedded = self.embed(inputs).view(batch_size, seq_len * self.embedding_dim)
        proba = self.classifier(embedded).squeeze()
        return proba

In [0]:
def evaluate(model, iterator, criterion, device):
    
    model.eval()
    
    targets, predictions = [], []
    epoch_loss = 0
    
    with torch.no_grad():
    
        for batch in iterator:

            batch_input = batch.input.to(device)
            batch_label = batch.label.to(device)

            batch_proba = model(batch_input)
            assert batch_label.shape == batch_proba.shape, f'{batch_label.shape} != {output.shape}'
            
            # compute and store batch predictions
            batch_prediction = batch_proba.cpu().numpy()
            batch_prediction[batch_prediction < 0.5] = 0
            batch_prediction[batch_prediction > 0.5] = 1
            batch_prediction[batch_prediction == 0.5] = random.randint(0, 1)
            
            predictions.extend([y for y in batch_prediction])
            targets.extend([y for y in batch_label.cpu().numpy()])
            
            loss = criterion(batch_proba, batch_label.float())
            epoch_loss += loss.item()
    
    epoch_loss = epoch_loss / len(iterator)
    epoch_acc = accuracy_score(targets, predictions)
    
    return epoch_loss, epoch_acc

In [0]:
def train_iteration(model, iterator, optimizer, criterion, device):
    
    model.train()
    
    for i, batch in enumerate(iterator):
        
        optimizer.zero_grad()
        
        batch_input = batch.input.to(device)
        batch_label = batch.label.to(device)
        
        output = model(batch_input).squeeze()
        
        assert batch_label.shape == output.shape, f'{batch_label.shape} != {output.shape}'
        
        loss = criterion(output, batch_label)
        
        loss.backward()
        
        optimizer.step()

In [0]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
EMB_DIM = 8
SEQ_LEN = 32
N_EPOCHS = 100

In [0]:
input_field = Field(sequential=True, tokenize=tokenizer, lower=True, batch_first=True, fix_length=SEQ_LEN)
label_field = Field(sequential=False, use_vocab=False, is_target=True, unk_token=None, batch_first=True, dtype=torch.float32)

fields = {'SentimentText': ('input', input_field), 'Sentiment': ('label', label_field)}

train_data = TabularDataset(path=constants.TRAIN_PATH, format='csv', fields=fields)
valid_data = TabularDataset(path=constants.VALID_PATH, format='csv', fields=fields)

input_field.build_vocab(train_data, min_freq=5)

train_iterator = Iterator(train_data, batch_size=256)
valid_iterator = Iterator(valid_data, batch_size=1024)

SRC_DIM = len(input_field.vocab)

In [0]:
model = LR(SRC_DIM, SEQ_LEN, EMB_DIM)
model = model.to(constants.DEVICE)

optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
criterion = nn.BCELoss()

In [0]:
best_valid_loss = float('Inf')

start_time = time.time()

for epoch in range(N_EPOCHS):
    
    train_iteration(model, train_iterator, optimizer, criterion, constants.DEVICE)
    
    if (epoch + 1) % 1 == 0:  
        train_loss, train_acc = evaluate(model, train_iterator, criterion, constants.DEVICE)
        valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, constants.DEVICE)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'tut1-model.pt')

        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s | Train Loss: {train_loss:.3f} | Train Acc.: {train_acc:.2f} | Val. Loss: {valid_loss:.3f} |  Val. Acc.: {valid_acc:.2f}')