In [1]:
%load_ext autoreload
%autoreload 1
%aimport sentiment_utils
%aimport mytorch

[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import os
import pickle
import sys

import datasets
import evaluate
import gensim
import numpy as np
import pandas as pd
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer
from tqdm import tqdm
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch_lr_finder import LRFinder
import transformers
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, TrainingArguments, Trainer)


print('python:'.ljust(16), sys.version.split('\n')[0])
print('scikit-learn:'.ljust(16), sklearn.__version__)
print('Gensim:'.ljust(16), gensim.__version__)
print('PyTorch:'.ljust(16), torch.__version__)
print('Transformers:'.ljust(16), transformers.__version__)

python:          3.11.3 (main, Apr  7 2023, 20:13:31) [Clang 14.0.0 (clang-1400.0.29.202)]
scikit-learn:    1.2.2
Gensim:          4.3.1
PyTorch:         2.0.1
Transformers:    4.29.2


# Device

In [3]:
# Get cpu or gpu device for training
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using {DEVICE} device')

Using cpu device


# Hyperparameters & Constants

In [4]:
# Hyperparameters
VOCAB_SIZE = 50000
SVD_SIZE = 100
BATCH_SIZE = 64
EPOCHS = 15  # select from: 2**n - 1 = [1, 3, 7, 15, ...]
SCHEDULER_GAMMA = 0.7

# Constants
WORKING_PATH = './sentiment-data/'
MODEL_PATH = '../app/models/'
DATASET_NAME = 'tweet_eval'
DATASET_CONF = 'sentiment'
CLASSES = 3
LABEL_MAP = {
    0: 'negative',
    1: 'neutral',
    2: 'positive',
}
HUGGINGFACE_MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Actions
DO_LR_RANGE_TEST=True

# Random state

In [5]:
RANDOM_STATE = 2147483647
# random.seed(RANDOM_STATE)
# np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed_all(RANDOM_STATE)

# Load & show data

In [6]:
dataset = datasets.load_dataset(DATASET_NAME, DATASET_CONF)
dataset

Found cached dataset tweet_eval (/Users/admin/.cache/huggingface/datasets/tweet_eval/sentiment/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [7]:
dataset['train'].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['negative', 'neutral', 'positive'], id=None)}

In [8]:
dataset['train'][0:5]

{'text': ['"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"',
  '"Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ"',
  'Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.',
  "Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays",
  '@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017"'],
 'label': [2, 1, 1, 1, 2]}

# Tokenization
## TokTokTokenizer

Try tokenization

In [9]:
tokenizer = sentiment_utils.Tokenizer()

for text in dataset['train']['text'][:5]:
    print(text)
    print(tokenizer(text, return_str=True))
    print()

"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"
qt @user origin draft 7th book , remu lupin surviv battl hogwarts. #happybirthdayremuslupin

"Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ"
ben smith / smith ( concuss ) remain lineup thursday , curti #nhl #sj

Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.
sorri bout stream last night crash tonight sure. back minecraft pc tomorrow night .

Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays
chase headley ' rbi doubl 8th inning david price snap yanke streak 33 consecut scoreless inning blue jay

@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017"
@user alciato : bee invest 150 million januari , anoth 200 s

## RobertaTokenizerFast

Try tokenization

In [10]:
tokenizer = AutoTokenizer.from_pretrained(HUGGINGFACE_MODEL_NAME)

for text in dataset['train']['text'][:2]:
    print(text)
    preprocessed_text = sentiment_utils.preprocess_text(text)
    tokens = tokenizer.tokenize(preprocessed_text)
    model_input = tokenizer(preprocessed_text, return_tensors='pt')
    print(tokenizer.decode(model_input['input_ids'][0]))
    print(tokens)
    print(model_input['input_ids'])
    print()

"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"
<s>"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin" "QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"</s>
['"', 'Q', 'T', 'Ġ@', 'user', 'ĠIn', 'Ġthe', 'Ġoriginal', 'Ġdraft', 'Ġof', 'Ġthe', 'Ġ7', 'th', 'Ġbook', ',', 'ĠRem', 'us', 'ĠLup', 'in', 'Ġsurvived', 'Ġthe', 'ĠBattle', 'Ġof', 'ĠHogwarts', '.', 'Ġ#', 'Happy', 'Birth', 'day', 'Rem', 'us', 'L', 'up', 'in', '"', 'Ġ"', 'Q', 'T', 'Ġ@', 'user', 'ĠIn', 'Ġthe', 'Ġoriginal', 'Ġdraft', 'Ġof', 'Ġthe', 'Ġ7', 'th', 'Ġbook', ',', 'ĠRem', 'us', 'ĠLup', 'in', 'Ġsurvived', 'Ġthe', 'ĠBattle', 'Ġof', 'ĠHogwarts', '.', 'Ġ#', 'Happy', 'Birth', 'day', 'Rem', 'us', 'L', 'up', 'in', '"']
tensor([[    0,   113,  1864,   565,   787, 12105,    96,     5,  1461,  2479,
             9,     5,   262,   21

Tokenize the whole dataset to prepare it for fine-tuning

In [11]:
%%time
def tokenize_function(examples):
    preprocessed_text = sentiment_utils.preprocess_text(examples['text'])
    return tokenizer(preprocessed_text)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
# tokenized_dataset = tokenized_dataset.remove_columns('text')
# tokenized_dataset = tokenized_dataset.rename_column('label', 'labels')
# tokenized_dataset = tokenized_dataset.with_format('torch')
tokenized_dataset

Loading cached processed dataset at /Users/admin/.cache/huggingface/datasets/tweet_eval/sentiment/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343/cache-e69e5419e9b6a115.arrow
Loading cached processed dataset at /Users/admin/.cache/huggingface/datasets/tweet_eval/sentiment/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343/cache-460ef6407e274887.arrow
Loading cached processed dataset at /Users/admin/.cache/huggingface/datasets/tweet_eval/sentiment/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343/cache-a4d147f31b1fa3d9.arrow


CPU times: user 41.3 ms, sys: 5.36 ms, total: 46.7 ms
Wall time: 46.4 ms


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [12]:
class ComputeMetrics:
    def __init__(self):
        self.metric = evaluate.load(DATASET_NAME, DATASET_CONF)
        
    def __call__(self, eval_preds):
        logits, labels = eval_preds
        predictions = np.argmax(logits, axis=-1)
        return self.metric.compute(predictions=predictions, references=labels)

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(HUGGINGFACE_MODEL_NAME)

training_args = TrainingArguments(WORKING_PATH + 'training_data',
                                  evaluation_strategy='epoch')

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(model,
                  training_args,
                  data_collator,
                  tokenized_dataset['train'],
                  tokenized_dataset['validation'],
                  tokenizer,
                  compute_metrics=ComputeMetrics)

trainer.train()

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss


[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:109] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 

# Vectorization
## Document vectorizers
### BOW, TF-IDF, Hashing BOW and their SVD variants
Fit vectorizers

In [11]:
%%time
save_vectorizers = True
file_vectorizers = WORKING_PATH + 'document_vectorizers_' + str(VOCAB_SIZE) + '_' + str(SVD_SIZE) + '.pickle'

# Load vectorizer if it already exists
if os.path.isfile(file_vectorizers):
    with open(file_vectorizers, 'rb') as f:
        vectorizers = pickle.load(f)

else:
    # Initialize tokenizer
    tokenizer = sentiment_utils.Tokenizer()

    # Initialize vectorizers
    vectorizers = {'toktok': {
        'bow': CountVectorizer(lowercase=False,
                               tokenizer=tokenizer,
                               max_features=VOCAB_SIZE),
        'hbow': HashingVectorizer(lowercase=False,
                                  tokenizer=tokenizer,
                                  n_features=VOCAB_SIZE),
        'tfidf': TfidfTransformer(),
        'bow_svd': TruncatedSVD(n_components=SVD_SIZE),
        'hbow_svd': TruncatedSVD(n_components=SVD_SIZE),
        'tfidf_svd': TruncatedSVD(n_components=SVD_SIZE),
    }}
    # Fit vectorizers and transform train data
    bow_train_texts = vectorizers['toktok']['bow'].fit_transform(dataset['train']['text'])
    hbow_train_texts = vectorizers['toktok']['hbow'].fit_transform(dataset['train']['text'])
    tfidf_train_texts = vectorizers['toktok']['tfidf'].fit_transform(bow_train_texts)

    # Fit SVD-truncated vectorizers
    vectorizers['toktok']['bow_svd'].fit(bow_train_texts)
    vectorizers['toktok']['hbow_svd'].fit(hbow_train_texts)
    vectorizers['toktok']['tfidf_svd'].fit(tfidf_train_texts)
    
    # Save vectorizers
    if save_vectorizers:
        with open(file_vectorizers, 'wb') as f:
            pickle.dump(vectorizers, f, pickle.HIGHEST_PROTOCOL)

# Print SVD explained variance
print('Explained variance for SVD:')
print('BOW:        ', round(vectorizers['toktok']['bow_svd'].explained_variance_ratio_.sum() * 100, 2), '%')
print('Hashing BOW:', round(vectorizers['toktok']['hbow_svd'].explained_variance_ratio_.sum() * 100, 2), '%')
print('TF-IDF:     ', round(vectorizers['toktok']['tfidf_svd'].explained_variance_ratio_.sum() * 100, 2), '%')
print()



Explained variance for SVD:
BOW:         51.53 %
Hashing BOW: 41.75 %
TF-IDF:      16.51 %

CPU times: user 1min 10s, sys: 7.23 s, total: 1min 17s
Wall time: 20.4 s


Transform data

In [12]:
%%time
transform_data = True
save_data = True
file_data = WORKING_PATH + 'document_data_' + str(VOCAB_SIZE) + '_' + str(SVD_SIZE) + '.pickle'

if transform_data:
    # Load transformed data if it already exists
    if os.path.isfile(file_data):
        with open(file_data, 'rb') as f:
            data = pickle.load(f)

    else:
        data = {'toktok': {
            'bow': {
                'train': vectorizers['toktok']['bow'].transform(dataset['train']['text']),
                'valid': vectorizers['toktok']['bow'].transform(dataset['validation']['text']),
                'test': vectorizers['toktok']['bow'].transform(dataset['test']['text']),
            },
            'hbow': {
                'train': vectorizers['toktok']['hbow'].transform(dataset['train']['text']),
                'valid': vectorizers['toktok']['hbow'].transform(dataset['validation']['text']),
                'test': vectorizers['toktok']['hbow'].transform(dataset['test']['text']),
            },
        }}
        
        data['toktok']['tfidf'] = {
            'train': vectorizers['toktok']['tfidf'].transform(data['toktok']['bow']['train']),
            'valid': vectorizers['toktok']['tfidf'].transform(data['toktok']['bow']['valid']),
            'test': vectorizers['toktok']['tfidf'].transform(data['toktok']['bow']['test']),
        }

        data['toktok']['bow_svd'] = {
            'train': vectorizers['toktok']['bow_svd'].transform(data['toktok']['bow']['train']),
            'valid': vectorizers['toktok']['bow_svd'].transform(data['toktok']['bow']['valid']),
            'test': vectorizers['toktok']['bow_svd'].transform(data['toktok']['bow']['test']),
        }

        data['toktok']['hbow_svd'] = {
            'train': vectorizers['toktok']['hbow_svd'].transform(data['toktok']['hbow']['train']),
            'valid': vectorizers['toktok']['hbow_svd'].transform(data['toktok']['hbow']['valid']),
            'test': vectorizers['toktok']['hbow_svd'].transform(data['toktok']['hbow']['test']),
        }

        data['toktok']['tfidf_svd'] = {
            'train': vectorizers['toktok']['tfidf_svd'].transform(data['toktok']['tfidf']['train']),
            'valid': vectorizers['toktok']['tfidf_svd'].transform(data['toktok']['tfidf']['valid']),
            'test': vectorizers['toktok']['tfidf_svd'].transform(data['toktok']['tfidf']['test']),
        }

        # Save transformed data
        if save_data:
            with open(file_data, 'wb') as f:
                pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

    # Print shapes
    print('Full size data shapes:')
    print('BOW:        ',
          data['toktok']['bow']['train'].shape,
          data['toktok']['bow']['valid'].shape,
          data['toktok']['bow']['test'].shape)
    print('Hashing BOW:',
          data['toktok']['hbow']['train'].shape,
          data['toktok']['hbow']['valid'].shape,
          data['toktok']['hbow']['test'].shape)
    print('TF-IDF:     ',
          data['toktok']['tfidf']['train'].shape,
          data['toktok']['tfidf']['valid'].shape,
          data['toktok']['tfidf']['test'].shape)
    print()
    print('SVD-truncated data shapes:')
    print('BOW:        ',
          data['toktok']['bow_svd']['train'].shape,
          data['toktok']['bow_svd']['valid'].shape,
          data['toktok']['bow_svd']['test'].shape)
    print('Hashing BOW:',
          data['toktok']['hbow_svd']['train'].shape,
          data['toktok']['hbow_svd']['valid'].shape,
          data['toktok']['hbow_svd']['test'].shape)
    print('TF-IDF:     ',
          data['toktok']['tfidf_svd']['train'].shape,
          data['toktok']['tfidf_svd']['valid'].shape,
          data['toktok']['tfidf_svd']['test'].shape)
    print()

Full size data shapes:
BOW:         (45615, 50000) (2000, 50000) (12284, 50000)
Hashing BOW: (45615, 50000) (2000, 50000) (12284, 50000)
TF-IDF:      (45615, 50000) (2000, 50000) (12284, 50000)

SVD-truncated data shapes:
BOW:         (45615, 100) (2000, 100) (12284, 100)
Hashing BOW: (45615, 100) (2000, 100) (12284, 100)
TF-IDF:      (45615, 100) (2000, 100) (12284, 100)

CPU times: user 14 s, sys: 88.2 ms, total: 14.1 s
Wall time: 14.2 s


Form datasets

In [35]:
class Doc2VecDataset(torch.utils.data.Dataset):
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        if isinstance(self.x[i], np.ndarray):
            x = self.x[i]
        else:
            x = self.x[i].todense()
        return x, self.y[i]


datasets = {'toktok': {
    'bow': {
        'train': Doc2VecDataset(data['toktok']['bow']['train'], dataset['train']['label']),
        'valid': Doc2VecDataset(data['toktok']['bow']['valid'], dataset['validation']['label']),
        'test': Doc2VecDataset(data['toktok']['bow']['test'], dataset['test']['label']),
    },
    'hbow': {
        'train': Doc2VecDataset(data['toktok']['hbow']['train'], dataset['train']['label']),
        'valid': Doc2VecDataset(data['toktok']['hbow']['valid'], dataset['validation']['label']),
        'test': Doc2VecDataset(data['toktok']['hbow']['test'], dataset['test']['label']),
    },
    'tfidf': {
        'train': Doc2VecDataset(data['toktok']['tfidf']['train'], dataset['train']['label']),
        'valid': Doc2VecDataset(data['toktok']['tfidf']['valid'], dataset['validation']['label']),
        'test': Doc2VecDataset(data['toktok']['tfidf']['test'], dataset['test']['label']),
    },
    'bow_svd': {
        'train': Doc2VecDataset(data['toktok']['bow_svd']['train'], dataset['train']['label']),
        'valid': Doc2VecDataset(data['toktok']['bow_svd']['valid'], dataset['validation']['label']),
        'test': Doc2VecDataset(data['toktok']['bow_svd']['test'], dataset['test']['label']),
    },
    'hbow_svd': {
        'train': Doc2VecDataset(data['toktok']['hbow_svd']['train'], dataset['train']['label']),
        'valid': Doc2VecDataset(data['toktok']['hbow_svd']['valid'], dataset['validation']['label']),
        'test': Doc2VecDataset(data['toktok']['hbow_svd']['test'], dataset['test']['label']),
    },
    'tfidf_svd': {
        'train': Doc2VecDataset(data['toktok']['tfidf_svd']['train'], dataset['train']['label']),
        'valid': Doc2VecDataset(data['toktok']['tfidf_svd']['valid'], dataset['validation']['label']),
        'test': Doc2VecDataset(data['toktok']['tfidf_svd']['test'], dataset['test']['label']),
    },
}}

Create dataloaders

In [45]:
dataloaders = {'toktok': {
    'bow': {
        'train': DataLoader(datasets['toktok']['bow']['train'], batch_size=BATCH_SIZE, shuffle=True),
        'valid': DataLoader(datasets['toktok']['bow']['valid'], batch_size=BATCH_SIZE),
        'test': DataLoader(datasets['toktok']['bow']['test'], batch_size=BATCH_SIZE),
    },
    'hbow': {
        'train': DataLoader(datasets['toktok']['hbow']['train'], batch_size=BATCH_SIZE, shuffle=True),
        'valid': DataLoader(datasets['toktok']['hbow']['valid'], batch_size=BATCH_SIZE),
        'test': DataLoader(datasets['toktok']['hbow']['test'], batch_size=BATCH_SIZE),
    },
    'tfidf': {
        'train': DataLoader(datasets['toktok']['tfidf']['train'], batch_size=BATCH_SIZE, shuffle=True),
        'valid': DataLoader(datasets['toktok']['tfidf']['valid'], batch_size=BATCH_SIZE),
        'test': DataLoader(datasets['toktok']['tfidf']['test'], batch_size=BATCH_SIZE),
    },
    'bow_svd': {
        'train': DataLoader(datasets['toktok']['bow_svd']['train'], batch_size=BATCH_SIZE, shuffle=True),
        'valid': DataLoader(datasets['toktok']['bow_svd']['valid'], batch_size=BATCH_SIZE),
        'test': DataLoader(datasets['toktok']['bow_svd']['test'], batch_size=BATCH_SIZE),
    },
    'hbow_svd': {
        'train': DataLoader(datasets['toktok']['hbow_svd']['train'], batch_size=BATCH_SIZE, shuffle=True),
        'valid': DataLoader(datasets['toktok']['hbow_svd']['valid'], batch_size=BATCH_SIZE),
        'test': DataLoader(datasets['toktok']['hbow_svd']['test'], batch_size=BATCH_SIZE),
    },
    'tfidf_svd': {
        'train': DataLoader(datasets['toktok']['tfidf_svd']['train'], batch_size=BATCH_SIZE, shuffle=True),
        'valid': DataLoader(datasets['toktok']['tfidf_svd']['valid'], batch_size=BATCH_SIZE),
        'test': DataLoader(datasets['toktok']['tfidf_svd']['test'], batch_size=BATCH_SIZE),
    },
}}

# Show shapes and types
print('Training examples:  ', len(dataloaders['toktok']['bow']['train'].dataset))
print('Validation examples:', len(dataloaders['toktok']['bow']['valid'].dataset))
print('Test examples:      ', len(dataloaders['toktok']['bow']['test'].dataset))
print()
print('Training batches:  ', len(dataloaders['toktok']['bow']['train']))
print('Validation batches:', len(dataloaders['toktok']['bow']['valid']))
print('Test batches:      ', len(dataloaders['toktok']['bow']['test']))
print()
X, y = next(iter(dataloaders['toktok']['bow']['train']))
print(f'Shape of X: {X.shape} {X.dtype}')
print(f'Shape of y: {y.shape} {y.dtype}')
print()
X, y = next(iter(dataloaders['toktok']['bow_svd']['train']))
print(f'Shape of X: {X.shape} {X.dtype}')
print(f'Shape of y: {y.shape} {y.dtype}')
print()
display(X[0])
display(y[0])

Training examples:   45615
Validation examples: 2000
Test examples:       12284

Training batches:   713
Validation batches: 32
Test batches:       192

Shape of X: torch.Size([64, 1, 50000]) torch.int64
Shape of y: torch.Size([64]) torch.int64

Shape of X: torch.Size([64, 100]) torch.float64
Shape of y: torch.Size([64]) torch.int64



tensor([ 1.1555e-01,  1.6407e-01, -2.7625e-01, -2.4199e-02,  1.1401e-01,
         3.1504e-02,  1.1226e+00, -2.4902e-01, -5.6359e-01,  3.2294e-01,
         6.2872e-02,  2.8139e-01, -3.3960e-01,  3.8802e-02, -5.7210e-02,
        -4.7027e-03,  3.7865e-02, -1.0003e-01,  4.3402e-02,  2.9222e-03,
        -6.7827e-02,  1.5661e-03,  2.0842e-02,  1.6896e-02, -8.8386e-02,
        -3.0166e-02, -5.1601e-02, -2.9432e-02, -1.3618e-03,  5.0311e-03,
         1.8233e-03, -5.8659e-02, -3.2248e-02,  9.3290e-03,  1.9913e-02,
        -2.9928e-02,  1.3025e-02, -3.0176e-02,  9.9205e-03,  7.6976e-03,
        -4.1902e-02, -5.3152e-02,  1.2157e-02, -2.9063e-02,  4.9892e-02,
         4.1866e-02, -4.7178e-02,  3.7598e-02, -4.4794e-02, -6.9791e-03,
        -1.1249e-02,  5.5423e-02,  4.6279e-02,  1.0261e-02, -6.5526e-02,
        -1.5512e-02, -2.3795e-02, -2.9470e-02, -1.9551e-02, -5.3367e-02,
         4.8417e-02, -1.7730e-02,  1.0392e-01,  8.4523e-05,  7.8961e-02,
         1.1607e-01, -8.6686e-02, -6.0226e-02, -7.4

tensor(0)

## Token vectorizers
### Word2Vec
#### Train from the ground up

In [14]:
%%time
vector_size = 100
file = WORKING_PATH + 'word2vec_' + str(vector_size) + '.gensim'
saving = True

# Load model if it already exists
if os.path.isfile(file):
    word2vec = gensim.models.KeyedVectors.load(file, mmap='r')

else:
    # Initialize tokenizer wiht corpus in it
    tokenizer = sentiment_utils.Tokenizer(dataset['train']['text']
                                          + dataset['test']['text']
                                          + dataset['validation']['text'])

    # Train the model
    word2vec = gensim.models.Word2Vec(
        sentences=tokenizer, vector_size=vector_size, window=5, min_count=5, sg=1, hs=0, negative=5,
        workers=7, epochs=5, seed=RANDOM_STATE,
    )
    
    # Use the word vectors only
    word2vec = word2vec.wv

    # Save the model word vectors
    if saving:
        word2vec.save(file)

# Print vocabulary shape
print('Vocabulary shape:')
print((len(word2vec.index_to_key), vector_size))
print()

# Print most frequent words
print('Most frequent words:')
for word in word2vec.index_to_key[:20]:
    print(word)
print()

Vocabulary shape:
(10611, 100)

Most frequent words:
"
@user
'
,
!
.
:
...
?
may
tomorrow
go
)
day
-
get
see
like
(
;

CPU times: user 1.8 ms, sys: 1.89 ms, total: 3.69 ms
Wall time: 4.27 ms


Show examples

In [15]:
# Print the vector example
print('A vector example:')
print(word2vec['@user'])
print(word2vec['@user'].shape)

A vector example:
[-6.7988336e-03 -7.7007711e-03 -6.7419447e-03  7.7721477e-03
 -9.1446610e-03 -6.6873073e-03 -6.6153635e-03 -2.2669220e-03
  5.0509833e-03  5.8403742e-03  6.4396439e-03  8.6656129e-03
 -8.7526087e-03 -9.2006801e-04 -1.6529012e-03 -6.5322830e-03
 -3.4659612e-03 -1.9954813e-03  8.2546510e-03  1.9973540e-03
 -9.0243109e-03  4.0886807e-03 -5.3359149e-04 -2.5054060e-03
 -6.9734524e-03 -4.2239283e-03 -1.2363232e-03  1.5906275e-03
  1.5835894e-03  6.6484306e-03 -1.8646896e-03  9.8702870e-03
  9.3534179e-03 -8.1601581e-03 -3.8998926e-03 -6.2233713e-03
 -3.3651828e-04  2.3092914e-03 -2.8936565e-03 -3.0549956e-03
  3.3477665e-04 -2.8081452e-03 -7.9259863e-03 -8.3585903e-03
  6.7217945e-04  9.0850675e-03 -8.8485815e-03 -3.2784594e-03
 -1.6568815e-03  7.9573207e-03  2.2853673e-03 -1.6162921e-03
 -7.9821423e-03  3.6615168e-03 -2.7477740e-06  2.6824963e-03
 -9.2297187e-03 -8.0831572e-03  2.4737692e-03  4.3313741e-03
 -6.3958620e-03 -1.2299264e-03  1.1683321e-03  9.0518082e-03
  3.15

### fastText
#### Train from the ground up

In [16]:
%%time
vector_size = 100
file = WORKING_PATH + 'fasttext_' + str(vector_size) + '.gensim'
saving = True

# Load model if it already exists
if os.path.isfile(file):
    fasttext = gensim.models.fasttext.FastTextKeyedVectors.load(file, mmap='r')

else:
    # Initialize tokenizer wiht corpus in it
    tokenizer = sentiment_utils.Tokenizer(dataset['train']['text']
                                          + dataset['test']['text']
                                          + dataset['validation']['text'])

    # Train the model
    fasttext = gensim.models.FastText(
        sentences=tokenizer, vector_size=vector_size, window=5, min_count=5, sg=1, hs=0, negative=5,
        workers=7, epochs=5, seed=RANDOM_STATE,
    )
    
    # Use the word vectors only
    fasttext = fasttext.wv

    # Save the model word vectors
    if saving:
        fasttext.save(file)

# Print vocabulary shape
print('Vocabulary shape:')
print((len(fasttext.index_to_key), vector_size))
print()

# Print most frequent words
print('Most frequent words:')
for word in fasttext.index_to_key[:20]:
    print(word)
print()

Vocabulary shape:
(10611, 100)

Most frequent words:
"
@user
'
,
!
.
:
...
?
may
tomorrow
go
)
day
-
get
see
like
(
;

CPU times: user 386 ms, sys: 311 ms, total: 697 ms
Wall time: 2.76 s


Show examples

In [17]:
# Print the vector example
print('A vector example:')
print(fasttext['@user'])
print(fasttext['@user'].shape)

A vector example:
[-4.87358047e-04  4.51807398e-04  8.77588638e-04 -3.32686584e-04
 -1.02142920e-03 -5.65615657e-04 -1.85830169e-03 -1.64301752e-03
 -3.25506413e-03  4.57338197e-03  1.27422193e-03  3.48202884e-03
  1.94513152e-04  1.38777623e-05  2.34347157e-04  1.86236299e-04
  1.52681104e-03  1.45538780e-03 -2.91749515e-04 -8.49718112e-04
 -1.08485331e-03  7.60952767e-04  2.18979130e-03  1.52153475e-03
 -8.63333640e-04 -1.12725690e-03 -4.13653994e-04 -9.69837129e-04
 -3.50951846e-03  2.00337311e-03  2.72217090e-03  9.62128979e-04
 -9.16386663e-04 -2.23301514e-03 -9.92241781e-04 -4.96376480e-04
 -2.19261716e-03  4.06168081e-04 -2.76850234e-03 -2.22463836e-03
  1.32712605e-03 -9.71838774e-04 -3.71666916e-04 -3.45141103e-04
  2.19832268e-03 -4.77933296e-04 -1.50938821e-03 -8.66060960e-04
  1.31140207e-03 -1.85101863e-03  1.01888634e-03  2.18549496e-04
 -1.96930929e-03 -6.17635378e-04  2.47252802e-03 -8.30107136e-04
  1.03148588e-04  3.78433871e-03 -1.67751324e-03  8.75202590e-04
 -7.455

# Computational graphs

In [41]:
# Dictionary of models
models = {}

In [42]:
class LogRegModel(nn.Module):
    """Logistic Regression model"""
    def __init__(self, n_neurons):
        super().__init__()
        self.linear = nn.Linear(n_neurons, CLASSES)

    def forward(self, x):
        logits = self.linear(x)
        return logits

models['toktok_bow_logreg'] = LogRegModel(VOCAB_SIZE)
models['toktok_hbow_logreg'] = LogRegModel(VOCAB_SIZE)
models['toktok_tfidf_logreg'] = LogRegModel(VOCAB_SIZE)
models['toktok_bow_svd_logreg'] = LogRegModel(SVD_SIZE)
models['toktok_hbow_svd_logreg'] = LogRegModel(SVD_SIZE)
models['toktok_tfidf_svd_logreg'] = LogRegModel(SVD_SIZE)

# Print model info
model = 'toktok_bow_logreg'
params = [p.numel() for p in models[model].parameters()]
print(f'For "{model}" model:')
print('- all params:      ', params, '  total:', sum(params))
params = [p.numel() for p in models[model].parameters() if p.requires_grad]
print('- trainable params:', params, '  total:', sum(params))
print('-', models[model])
print()
model = 'toktok_bow_svd_logreg'
params = [p.numel() for p in models[model].parameters()]
print(f'For "{model}" model:')
print('- all params:      ', params, '  total:', sum(params))
params = [p.numel() for p in models[model].parameters() if p.requires_grad]
print('- trainable params:', params, '  total:', sum(params))
print('-', models[model])

For "toktok_bow_logreg" model:
- all params:       [150000, 3]   total: 150003
- trainable params: [150000, 3]   total: 150003
- LogRegModel(
  (linear): Linear(in_features=50000, out_features=3, bias=True)
)

For "toktok_bow_svd_logreg" model:
- all params:       [300, 3]   total: 303
- trainable params: [300, 3]   total: 303
- LogRegModel(
  (linear): Linear(in_features=100, out_features=3, bias=True)
)


In [43]:
class Dense2Model(nn.Module):
    """Dense model with 2 fully connected layers"""
    def __init__(self, n_neurons):
        super().__init__()
        self.linear_stack = nn.Sequential(
            nn.Linear(n_neurons[0], n_neurons[1]),
            nn.ReLU(),
            nn.Linear(n_neurons[1], CLASSES),
        )

    def forward(self, x):
        logits = self.linear_stack(x)
        return logits

models['toktok_bow_dense2'] = Dense2Model((VOCAB_SIZE, SVD_SIZE))
models['toktok_hbow_dense2'] = Dense2Model((VOCAB_SIZE, SVD_SIZE))
models['toktok_tfidf_dense2'] = Dense2Model((VOCAB_SIZE, SVD_SIZE))
models['toktok_bow_svd_dense2'] = Dense2Model((SVD_SIZE, SVD_SIZE))
models['toktok_hbow_svd_dense2'] = Dense2Model((SVD_SIZE, SVD_SIZE))
models['toktok_tfidf_svd_dense2'] = Dense2Model((SVD_SIZE, SVD_SIZE))

# Print model info
model = 'toktok_bow_dense2'
params = [p.numel() for p in models[model].parameters()]
print(f'For "{model}" model:')
print('- all params:      ', params, '  total:', sum(params))
params = [p.numel() for p in models[model].parameters() if p.requires_grad]
print('- trainable params:', params, '  total:', sum(params))
print('-', models[model])
print()
model = 'toktok_bow_svd_dense2'
params = [p.numel() for p in models[model].parameters()]
print(f'For "{model}" model:')
print('- all params:      ', params, '  total:', sum(params))
params = [p.numel() for p in models[model].parameters() if p.requires_grad]
print('- trainable params:', params, '  total:', sum(params))
print('-', models[model])

For "toktok_bow_dense2" model:
- all params:       [5000000, 100, 300, 3]   total: 5000403
- trainable params: [5000000, 100, 300, 3]   total: 5000403
- Dense2Model(
  (linear_stack): Sequential(
    (0): Linear(in_features=50000, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=3, bias=True)
  )
)

For "toktok_bow_svd_dense2" model:
- all params:       [10000, 100, 300, 3]   total: 10403
- trainable params: [10000, 100, 300, 3]   total: 10403
- Dense2Model(
  (linear_stack): Sequential(
    (0): Linear(in_features=100, out_features=100, bias=True)
    (1): ReLU()
    (2): Linear(in_features=100, out_features=3, bias=True)
  )
)


## Range test for learning rate

In [44]:
if DO_LR_RANGE_TEST:
    start_lr = 1e-4
    end_lr = 1e1
    num_iter = 50

    # Loss function
    loss_fn = nn.CrossEntropyLoss()

    for name, model in models.items():
        print(name)
        
        # Optimizer
        optimizer = torch.optim.SGD(model.parameters(), lr=start_lr)

        # Range test for LR
        lr_finder = LRFinder(model, optimizer, loss_fn)
        lr_finder.range_test(train_loader=dataloaders['toktok']['bow']['train'],
                             val_loader=dataloaders['toktok']['bow']['valid'],
                             end_lr=end_lr,
                             num_iter=num_iter)
        
        # Inspect the loss-LR graph
        lr_finder.plot()
        
        # Reset the model and optimizer to their initial state
        lr_finder.reset()
else:
    lrs = {
        'logreg': 1e-1,
        'dense3': 6e-1,
        'conv3': 1e-1,
        'conv5': 4e-1,
    }

toktok_bow_logreg


  0%|          | 0/50 [00:00<?, ?it/s]

RuntimeError: mat1 and mat2 must have the same dtype