In [1]:
# Enable IPython's autoreload extension; mode 1 reloads only the modules
# registered through %aimport before each cell is executed.
%load_ext autoreload
%autoreload 1
# Register sentiment_utils for reloading so edits to it are picked up
# without restarting the kernel.
%aimport sentiment_utils

[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import os
import pickle
import random
import sys

import datasets
import gensim
import numpy as np
import pandas as pd
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer
from tqdm import tqdm
import torch
import transformers
from transformers import AutoTokenizer

# Record the interpreter and library versions this notebook was run with.
# Each row is (label, version string); labels are padded to a common width
# so the versions line up in one column.
versions = (
    ('python:', sys.version.split('\n')[0]),
    ('scikit-learn:', sklearn.__version__),
    ('Gensim:', gensim.__version__),
    ('PyTorch:', torch.__version__),
    ('Transformers:', transformers.__version__),
)
for label, version in versions:
    print(label.ljust(16), version)

python:          3.10.11 (main, Apr  7 2023, 07:24:53) [Clang 14.0.0 (clang-1400.0.29.202)]
scikit-learn:    1.2.2
Gensim:          4.3.1
PyTorch:         1.13.1
Transformers:    4.28.1


# Device

In [3]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f'Using {DEVICE} device')

Using cpu device


# Hyperparameters & Constants

In [4]:
# Hyperparameters (all disabled; no cell visible in this notebook reads them)
# BATCH_SIZE = 64
# EPOCHS = 15  # select from: 2**n - 1 = [1, 3, 7, 15, ...]
# SCHEDULER_GAMMA = 0.7

# Constants
# Prefix for every cache file written below (vectorizers, transformed data,
# embeddings). It is joined with '+', so the trailing slash is required.
WORKING_PATH = './sentiment-data/'
# Not referenced by the cells visible here.
MODEL_PATH = '../app/models/'
# Dataset path and configuration name passed to datasets.load_dataset.
DATASET_NAME = 'tweet_eval'
DATASET_CONF = 'sentiment'
# Presumably the number of sentiment labels in the dataset — TODO confirm.
CLASSES = 3
# Checkpoint whose tokenizer is loaded with AutoTokenizer.from_pretrained.
HUGGINGFACE_MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Actions (disabled)
# DO_LR_RANGE_TEST=False

# Random state

In [5]:
# Single seed shared by every source of randomness in the notebook.
RANDOM_STATE = 2147483647

# Seed Python's and NumPy's global generators as well as PyTorch's:
# scikit-learn estimators created without an explicit random_state draw
# from NumPy's global generator, so leaving it unseeded makes their
# results differ between runs.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed_all(RANDOM_STATE)

# Load & show data

In [6]:
# Load every split of the configured dataset and display the split summary
dataset = datasets.load_dataset(path=DATASET_NAME, name=DATASET_CONF)
dataset

Found cached dataset tweet_eval (/Users/admin/.cache/huggingface/datasets/tweet_eval/sentiment/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [7]:
# Peek at the first five training rows (texts with their labels)
dataset['train'][:5]

{'text': ['"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"',
  '"Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ"',
  'Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.',
  "Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays",
  '@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017"'],
 'label': [2, 1, 1, 1, 2]}

# Tokenization
## TokTokTokenizer

In [8]:
# Build the project tokenizer with its default settings
tokenizer = sentiment_utils.Tokenizer()

# Show the first five training tweets, each followed by its tokenized form
# and a blank separator line
for tweet in dataset['train'][:5]['text']:
    print(tweet, tokenizer(tweet, return_str=True), '', sep='\n')

"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"
" qt @user origin draft 7th book , remu lupin surviv battl hogwarts. #happybirthdayremuslupin "

"Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ"
" ben smith / smith ( concuss ) remain lineup thursday , curti #nhl #sj "

Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.
sorri bout stream last night crash tonight sure. back minecraft pc tomorrow night .

Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays
chase headley ' rbi doubl 8th inning david price snap yanke streak 33 consecut scoreless inning blue jay

@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017"
@user alciato : bee invest 150 million januari , ano

## RobertaTokenizerFast

In [21]:
# Load the pretrained tokenizer under the name that this cell and the
# following cells actually call; previously `alv_pretrained` was never
# defined, so a fresh kernel failed here with a NameError.
alv_pretrained = AutoTokenizer.from_pretrained(HUGGINGFACE_MODEL_NAME)
# Keep the original `tokenizer` binding so that anything reading it after
# this cell sees the same object as before.
tokenizer = alv_pretrained

# Print two tweets next to the token ids produced for their preprocessed form.
# NOTE(review): the recorded output contains each tweet's ids twice in a row;
# presumably preprocess_text returns the text duplicated — verify.
for text in dataset['train']['text'][:2]:
    print(text)
    preprocessed_text = sentiment_utils.preprocess_text(text)
    print(alv_pretrained(preprocessed_text, return_tensors='pt')['input_ids'])
    print()

"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"
tensor([[    0,   113,  1864,   565,   787, 12105,    96,     5,  1461,  2479,
             9,     5,   262,   212,  1040,     6,  8022,   687, 26110,   179,
          5601,     5,  9846,     9, 42210,     4,   849, 21136, 44728,  1208,
         31157,   687,   574,   658,   179,   113,    22,  1864,   565,   787,
         12105,    96,     5,  1461,  2479,     9,     5,   262,   212,  1040,
             6,  8022,   687, 26110,   179,  5601,     5,  9846,     9, 42210,
             4,   849, 21136, 44728,  1208, 31157,   687,   574,   658,   179,
           113,     2]])

"Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ"
tensor([[    0,   113, 17521,  1259,  1589,  1259,    36,  3865, 33825,    43,
          1189,    66,     9,     5,  4451,   296,     6, 11292,   849,   487,
          8064,   849,   104,   863,   113,    22, 17521, 

In [22]:
# Inspect the concrete tokenizer class behind `alv_pretrained`
type(alv_pretrained)

transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast

In [23]:
%%time
# To process the whole corpus
def preprocess_function(examples):
    preprocessed_text = sentiment_utils.preprocess_text(examples['text'])
    return alv_pretrained(preprocessed_text)

dataset_alv = dataset.map(preprocess_function, batched=True)

dataset_alv

Map:   0%|          | 0/45615 [00:00<?, ? examples/s]

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

CPU times: user 8.98 s, sys: 125 ms, total: 9.1 s
Wall time: 2.13 s


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

# Vectorization
## Document vectorizers
### BOW, TF-IDF, Hashing BOW and their SVD variants
Fit vectorizers

In [10]:
%%time
n_features = 50000
svd_components = 100
save_vectorizers = True
file_vectorizers = WORKING_PATH + 'bow_tfidf_vectorizers_' + str(n_features) + '_' + str(svd_components) + '.pickle'

# Load vectorizer if it already exists
if os.path.isfile(file_vectorizers):
    with open(file_vectorizers, 'rb') as f:
        vectorizers = pickle.load(f)
        
    (
        bow_vectorizer, tfidf_vectorizer, hashing_vectorizer,
        svd_bow_vectorizer, svd_tfidf_vectorizer, svd_hashing_vectorizer,
    ) = vectorizers

else:
    # Initialize tokenizer
    tokenizer = sentiment_utils.Tokenizer()

    # Initialize vectorizers
    bow_vectorizer = CountVectorizer(lowercase=False,
                                     tokenizer=tokenizer,
                                     max_features=n_features)
    tfidf_vectorizer = TfidfTransformer()
    hashing_vectorizer = HashingVectorizer(lowercase=False,
                                           tokenizer=tokenizer,
                                           n_features=n_features)

    # Initialize SVD-truncated vectorizers
    svd_bow_vectorizer = TruncatedSVD(n_components=svd_components)
    svd_tfidf_vectorizer = TruncatedSVD(n_components=svd_components)
    svd_hashing_vectorizer = TruncatedSVD(n_components=svd_components)

    # Fit vectorizers and transform train data
    x_train_bow = bow_vectorizer.fit_transform(dataset['train']['text'])
    x_train_tfidf = tfidf_vectorizer.fit_transform(x_train_bow)
    x_train_hashing = hashing_vectorizer.fit_transform(dataset['train']['text'])

    # Fit SVD-truncated vectorizers
    svd_bow_vectorizer.fit(x_train_bow)
    svd_tfidf_vectorizer.fit(x_train_tfidf)
    svd_hashing_vectorizer.fit(x_train_hashing)
    
    # Save vectorizers
    if save_vectorizers:
        vectorizers = (
            bow_vectorizer, tfidf_vectorizer, hashing_vectorizer,
            svd_bow_vectorizer, svd_tfidf_vectorizer, svd_hashing_vectorizer,
        )

        with open(file_vectorizers, 'wb') as f:
            pickle.dump(vectorizers, f, pickle.HIGHEST_PROTOCOL)

# Print SVD explained variance
print('Explained variance for SVD:')
print(f'BOW:         {(svd_bow_vectorizer.explained_variance_ratio_.sum() * 100):.2f} %')
print(f'TF-IDF:      {(svd_tfidf_vectorizer.explained_variance_ratio_.sum() * 100):.2f} %')
print(f'Hashing BOW: {(svd_hashing_vectorizer.explained_variance_ratio_.sum() * 100):.2f} %')
print()



Explained variance for SVD:
BOW:         51.53 %
TF-IDF:      16.51 %
Hashing BOW: 41.75 %

CPU times: user 56.8 s, sys: 17.6 s, total: 1min 14s
Wall time: 22.6 s


Transform data

In [12]:
%%time
transform_data = True
save_data = True
file_data = WORKING_PATH + 'bow_tfidf_data_' + str(n_features) + '_' + str(svd_components) + '.pickle'

if transform_data:
    # Load transformed data if it already exists
    if os.path.isfile(file_data):
        with open(file_data, 'rb') as f:
            data = pickle.load(f)

        (
            x_train_bow, x_valid_bow, x_test_bow,
            x_train_tfidf, x_valid_tfidf, x_test_tfidf,
            x_train_hashing, x_valid_hashing, x_test_hashing,
            x_train_svd_bow, x_valid_svd_bow, x_test_svd_bow,
            x_train_svd_tfidf, x_valid_svd_tfidf, x_test_svd_tfidf,
            x_train_svd_hashing, x_valid_svd_hashing, x_test_svd_hashing,
        ) = data

    else:
        # Fit vectorizers and transform train data
        x_train_bow = bow_vectorizer.transform(dataset['train']['text'])
        x_train_tfidf = tfidf_vectorizer.transform(x_train_bow)
        x_train_hashing = hashing_vectorizer.transform(dataset['train']['text'])

        # Transform train data for SVD-truncated vectorizers
        x_train_svd_bow = svd_bow_vectorizer.transform(x_train_bow)
        x_train_svd_tfidf = svd_tfidf_vectorizer.transform(x_train_tfidf)
        x_train_svd_hashing = svd_hashing_vectorizer.transform(x_train_hashing)

        # Transform validation and test data
        x_valid_bow = bow_vectorizer.transform(dataset['validation']['text'])
        x_valid_tfidf = tfidf_vectorizer.transform(x_valid_bow)
        x_valid_hashing = hashing_vectorizer.transform(dataset['validation']['text'])
        x_test_bow = bow_vectorizer.transform(dataset['test']['text'])
        x_test_tfidf = tfidf_vectorizer.transform(x_test_bow)
        x_test_hashing = hashing_vectorizer.transform(dataset['test']['text'])

        # Transform validation and test data for SVD-truncated vectorizers
        x_valid_svd_bow = svd_bow_vectorizer.transform(x_valid_bow)
        x_valid_svd_tfidf = svd_tfidf_vectorizer.transform(x_valid_tfidf)
        x_valid_svd_hashing = svd_hashing_vectorizer.transform(x_valid_hashing)
        x_test_svd_bow = svd_bow_vectorizer.transform(x_test_bow)
        x_test_svd_tfidf = svd_tfidf_vectorizer.transform(x_test_tfidf)
        x_test_svd_hashing = svd_hashing_vectorizer.transform(x_test_hashing)

        # Save transformed data
        if save_data:
            data = (
                x_train_bow, x_valid_bow, x_test_bow,
                x_train_tfidf, x_valid_tfidf, x_test_tfidf,
                x_train_hashing, x_valid_hashing, x_test_hashing,
                x_train_svd_bow, x_valid_svd_bow, x_test_svd_bow,
                x_train_svd_tfidf, x_valid_svd_tfidf, x_test_svd_tfidf,
                x_train_svd_hashing, x_valid_svd_hashing, x_test_svd_hashing,
            )

            with open(file_data, 'wb') as f:
                pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)

    # Print shapes
    print('Full size data shapes:')
    print('BOW:        ', x_train_bow.shape, x_valid_bow.shape, x_test_bow.shape)
    print('TF-IDF:     ', x_train_tfidf.shape, x_valid_tfidf.shape, x_test_tfidf.shape)
    print('Hashing BOW:', x_train_hashing.shape, x_valid_hashing.shape, x_test_hashing.shape)
    print()
    print('SVD-truncated data shapes:')
    print('BOW:        ', x_train_svd_bow.shape, x_valid_svd_bow.shape, x_test_svd_bow.shape)
    print('TF-IDF:     ', x_train_svd_tfidf.shape, x_valid_svd_tfidf.shape, x_test_svd_tfidf.shape)
    print('Hashing BOW:', x_train_svd_hashing.shape, x_valid_svd_hashing.shape, x_test_svd_hashing.shape)
    print()

Full size data shapes:
BOW:         (45615, 50000) (2000, 50000) (12284, 50000)
TF-IDF:      (45615, 50000) (2000, 50000) (12284, 50000)
Hashing BOW: (45615, 50000) (2000, 50000) (12284, 50000)

SVD-truncated data shapes:
BOW:         (45615, 100) (2000, 100) (12284, 100)
TF-IDF:      (45615, 100) (2000, 100) (12284, 100)
Hashing BOW: (45615, 100) (2000, 100) (12284, 100)

CPU times: user 18.7 s, sys: 106 ms, total: 18.8 s
Wall time: 18.9 s


Show examples

In [17]:
example = dataset['train']['text'][:2]

# Vectorize the example documents with every fitted vectorizer
bow_output = bow_vectorizer.transform(example)
hashing_output = hashing_vectorizer.transform(example)
tfidf_output = tfidf_vectorizer.transform(bow_output)
svd_bow_output, svd_tfidf_output, svd_hashing_output = (
    svd_bow_vectorizer.transform(bow_output),
    svd_tfidf_vectorizer.transform(tfidf_output),
    svd_hashing_vectorizer.transform(hashing_output),
)

# One row per representation: (title, preview to print, full shape).
# Sparse outputs are shown densified; SVD outputs show the first six
# components of the first document.
previews = [
    ('BOW:', bow_output.todense(), bow_output.shape),
    ('TF-IDF:', tfidf_output.todense(), tfidf_output.shape),
    ('Hashing BOW:', hashing_output.todense(), hashing_output.shape),
    ('SVD BOW:', svd_bow_output[0][:6], svd_bow_output.shape),
    ('SVD TF-IDF:', svd_tfidf_output[0][:6], svd_tfidf_output.shape),
    ('SVD Hashing BOW:', svd_hashing_output[0][:6], svd_hashing_output.shape),
]

# Print document vectors and their shapes, each section preceded by a blank line
print('Document vector examples and their shapes:')
for title, preview, shape in previews:
    print()
    print(title)
    print(preview)
    print(shape)

Document vector examples and their shapes:

BOW:
[[0 2 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]]
(2, 50000)

TF-IDF:
[[0.         0.1592305  0.         ... 0.         0.         0.        ]
 [0.         0.16949097 0.         ... 0.         0.         0.        ]]
(2, 50000)

Hashing BOW:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(2, 50000)

SVD BOW:
[ 2.19219037  0.02814938 -0.37149003  0.46869895  0.77413124 -0.27684479]
(2, 100)

SVD TF-IDF:
[ 0.16403713 -0.06192932  0.00118506  0.05926234 -0.01728777 -0.0282153 ]
(2, 100)

SVD Hashing BOW:
[ 0.51587004 -0.1754948   0.05378128  0.20697629 -0.04919744 -0.03881921]
(2, 100)


## Token vectorizers
### Word2Vec

Train Word2Vec from the ground up

In [24]:
%%time
vector_size = 100
file = WORKING_PATH + 'word2vec_' + str(vector_size) + '.gensim'
saving = True

# Load model if it already exists
if os.path.isfile(file):
    word2vec = gensim.models.KeyedVectors.load(file, mmap='r')

else:
    # Initialize tokenizer wiht corpus in it
    tokenizer = sentiment_utils.Tokenizer(dataset['train']['text']
                                          + dataset['test']['text']
                                          + dataset['validation']['text'])

    # Train the model
    word2vec = gensim.models.Word2Vec(
        sentences=tokenizer, vector_size=vector_size, window=5, min_count=5, sg=1, hs=0, negative=5,
        workers=7, epochs=5, seed=RANDOM_STATE,
    )
    
    # Use the word vectors only
    word2vec = word2vec.wv

    # Save the model word vectors
    if saving:
        word2vec.save(file)

# Print vocabulary shape
print('Vocabulary shape:')
print((len(word2vec.index_to_key), vector_size))
print()

# Print most frequent words
print('Most frequent words:')
for word in word2vec.index_to_key[:20]:
    print(word)
print()

Vocabulary shape:
(10611, 100)

Most frequent words:
"
@user
'
,
!
.
:
...
?
may
tomorrow
go
)
day
-
get
see
like
(
;

CPU times: user 6.08 ms, sys: 3.92 ms, total: 10 ms
Wall time: 8.04 ms


Show examples

In [32]:
# Look up the embedding of one token and show it together with its shape
w2v_user_vector = word2vec['@user']
print('A vector example:')
print(w2v_user_vector)
print(w2v_user_vector.shape)

A vector example:
[-6.7988336e-03 -7.7007711e-03 -6.7419447e-03  7.7721477e-03
 -9.1446610e-03 -6.6873073e-03 -6.6153635e-03 -2.2669220e-03
  5.0509833e-03  5.8403742e-03  6.4396439e-03  8.6656129e-03
 -8.7526087e-03 -9.2006801e-04 -1.6529012e-03 -6.5322830e-03
 -3.4659612e-03 -1.9954813e-03  8.2546510e-03  1.9973540e-03
 -9.0243109e-03  4.0886807e-03 -5.3359149e-04 -2.5054060e-03
 -6.9734524e-03 -4.2239283e-03 -1.2363232e-03  1.5906275e-03
  1.5835894e-03  6.6484306e-03 -1.8646896e-03  9.8702870e-03
  9.3534179e-03 -8.1601581e-03 -3.8998926e-03 -6.2233713e-03
 -3.3651828e-04  2.3092914e-03 -2.8936565e-03 -3.0549956e-03
  3.3477665e-04 -2.8081452e-03 -7.9259863e-03 -8.3585903e-03
  6.7217945e-04  9.0850675e-03 -8.8485815e-03 -3.2784594e-03
 -1.6568815e-03  7.9573207e-03  2.2853673e-03 -1.6162921e-03
 -7.9821423e-03  3.6615168e-03 -2.7477740e-06  2.6824963e-03
 -9.2297187e-03 -8.0831572e-03  2.4737692e-03  4.3313741e-03
 -6.3958620e-03 -1.2299264e-03  1.1683321e-03  9.0518082e-03
  3.15

### fastText

Train fastText from the ground up

In [43]:
%%time
vector_size = 100
file = WORKING_PATH + 'fasttext_' + str(vector_size) + '.gensim'
saving = True

# Load model if it already exists
if os.path.isfile(file):
    fasttext = gensim.models.fasttext.FastTextKeyedVectors.load(file, mmap='r')

else:
    # Initialize tokenizer wiht corpus in it
    tokenizer = sentiment_utils.Tokenizer(dataset['train']['text']
                                          + dataset['test']['text']
                                          + dataset['validation']['text'])

    # Train the model
    fasttext = gensim.models.fasttext.FastText(
        sentences=tokenizer, vector_size=vector_size, window=5, min_count=5, sg=1, hs=0, negative=5,
        workers=7, epochs=5, seed=RANDOM_STATE,
    )
    
    # Use the word vectors only
    fasttext = fasttext.wv

    # Save the model word vectors
    if saving:
        fasttext.save(file)

# Print vocabulary shape
print('Vocabulary shape:')
print((len(fasttext.index_to_key), vector_size))
print()

# Print most frequent words
print('Most frequent words:')
for word in fasttext.index_to_key[:20]:
    print(word)
print()

Vocabulary shape:
(10611, 100)

Most frequent words:
"
@user
'
,
!
.
:
...
?
may
tomorrow
go
)
day
-
get
see
like
(
;

CPU times: user 389 ms, sys: 295 ms, total: 684 ms
Wall time: 3.3 s


Show examples

In [46]:
# Look up the embedding of one token and show it together with its shape
ft_user_vector = fasttext['@user']
print('A vector example:')
print(ft_user_vector)
print(ft_user_vector.shape)

A vector example:
[-4.87358047e-04  4.51807398e-04  8.77588638e-04 -3.32686584e-04
 -1.02142920e-03 -5.65615657e-04 -1.85830169e-03 -1.64301752e-03
 -3.25506413e-03  4.57338197e-03  1.27422193e-03  3.48202884e-03
  1.94513152e-04  1.38777623e-05  2.34347157e-04  1.86236299e-04
  1.52681104e-03  1.45538780e-03 -2.91749515e-04 -8.49718112e-04
 -1.08485331e-03  7.60952767e-04  2.18979130e-03  1.52153475e-03
 -8.63333640e-04 -1.12725690e-03 -4.13653994e-04 -9.69837129e-04
 -3.50951846e-03  2.00337311e-03  2.72217090e-03  9.62128979e-04
 -9.16386663e-04 -2.23301514e-03 -9.92241781e-04 -4.96376480e-04
 -2.19261716e-03  4.06168081e-04 -2.76850234e-03 -2.22463836e-03
  1.32712605e-03 -9.71838774e-04 -3.71666916e-04 -3.45141103e-04
  2.19832268e-03 -4.77933296e-04 -1.50938821e-03 -8.66060960e-04
  1.31140207e-03 -1.85101863e-03  1.01888634e-03  2.18549496e-04
 -1.96930929e-03 -6.17635378e-04  2.47252802e-03 -8.30107136e-04
  1.03148588e-04  3.78433871e-03 -1.67751324e-03  8.75202590e-04
 -7.455