In [1]:
# Enable IPython's autoreload extension so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 1: only modules registered through %aimport are reloaded.
%autoreload 1
# Import sentiment_utils and register it for automatic reloading.
%aimport sentiment_utils

[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import os
import pickle
import sys

import datasets
import gensim
import pandas as pd
import sklearn
import torch
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer
from tqdm import tqdm
from transformers import AutoTokenizer

# Report the interpreter and library versions this notebook runs with.
# NOTE: this relies on `sys` being imported in the import block of this cell;
# without that import the first line raises NameError on a fresh kernel.
# Only the first line of sys.version is shown, so the report stays one row per entry.
for lib_name, lib_version in (
    ('python:', sys.version.split('\n')[0]),
    ('scikit-learn:', sklearn.__version__),
    ('Gensim:', gensim.__version__),
    ('PyTorch:', torch.__version__),
):
    # Pad every label to the same width so the versions line up in one column
    print(lib_name.ljust(16), lib_version)

python:          3.10.11 (main, Apr  7 2023, 07:24:53) [Clang 14.0.0 (clang-1400.0.29.202)]
scikit-learn:    1.2.2
Gensim:          4.3.1
PyTorch:         1.13.1


# Device

In [3]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f'Using {DEVICE} device')

Using cpu device


# Hyperparameters & Constants

In [4]:
# Hyperparameters (currently disabled)
# BATCH_SIZE = 64
# EPOCHS = 15  # select from: 2**n - 1 = [1, 3, 7, 15, ...]
# SCHEDULER_GAMMA = 0.7

# Constants
# Dataset name and configuration handed to datasets.load_dataset
DATASET_NAME, DATASET_CONF = 'tweet_eval', 'sentiment'
CLASSES = 3

# Directory where cached vectorizers and the Word2Vec model are stored
WORKING_PATH = './sentiment-data/'
MODEL_PATH = '../app/models/'

# Checkpoint name given to AutoTokenizer.from_pretrained
HUGGINGFACE_MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Actions (currently disabled)
# DO_LR_RANGE_TEST=False

# Random state

In [5]:
# Seed value (2**31 - 1) applied to the PyTorch generators below
RANDOM_STATE = 2 ** 31 - 1

# Seeding of Python's `random` and of NumPy is currently disabled
# random.seed(RANDOM_STATE)
# np.random.seed(RANDOM_STATE)

# Seed PyTorch: the default generator, the current CUDA device, then every CUDA device
torch.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed(RANDOM_STATE)
torch.cuda.manual_seed_all(RANDOM_STATE)

# Load & show data

In [6]:
# Load every split of the configured dataset, then display the resulting DatasetDict
dataset = datasets.load_dataset(path=DATASET_NAME, name=DATASET_CONF)
dataset

Found cached dataset tweet_eval (/Users/admin/.cache/huggingface/datasets/tweet_eval/sentiment/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [7]:
# Peek at the first five training rows (returned as a dict of columns)
dataset['train'][:5]

{'text': ['"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"',
  '"Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ"',
  'Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.',
  "Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays",
  '@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017"'],
 'label': [2, 1, 1, 1, 2]}

# Tokenization

In [9]:
tokenizer = sentiment_utils.Tokenizer()

# Show the first five training texts, each followed by its tokenized form and a blank line
for text in dataset['train']['text'][:5]:
    print(text)
    print(tokenizer(text, return_str=True), end='\n\n')

"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"
" qt @user origin draft 7th book , remu lupin surviv battl hogwarts. #happybirthdayremuslupin "

"Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ"
" ben smith / smith ( concuss ) remain lineup thursday , curti #nhl #sj "

Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.
sorri bout stream last night crash tonight sure. back minecraft pc tomorrow night .

Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays
chase headley ' rbi doubl 8th inning david price snap yanke streak 33 consecut scoreless inning blue jay

@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017"
@user alciato : bee invest 150 million januari , ano

# Vectorization
## Document vectorizers
### BOW, TF-IDF, Hashing BOW and their SVD variants

In [11]:
%%time
n_features = 50000
svd_components = 100
file = WORKING_PATH + 'bow_tfidf_' + str(n_features) + '_' + str(svd_components) + '.pickle'
saving = True

# Load vectorizer if it already exists
if os.path.isfile(file):
    with open(file, 'rb') as f:
        output = pickle.load(f)
        
    (
        bow_vectorizer, x_train_bow, x_valid_bow, x_test_bow,
        tfidf_vectorizer, x_train_tfidf, x_valid_tfidf, x_test_tfidf,
        hashing_vectorizer, x_train_hashing, x_valid_hashing, x_test_hashing,
        svd_bow_vectorizer, x_train_svd_bow, x_valid_svd_bow, x_test_svd_bow,
        svd_tfidf_vectorizer, x_train_svd_tfidf, x_valid_svd_tfidf, x_test_svd_tfidf,
        svd_hashing_vectorizer, x_train_svd_hashing, x_valid_svd_hashing, x_test_svd_hashing,
    ) = output

else:
    # Initialize tokenizer
    tokenizer = Tokenizer()

    # Initialize vectorizers
    bow_vectorizer = CountVectorizer(lowercase=False,
                                     tokenizer=tokenizer,
                                     max_features=n_features)
    tfidf_vectorizer = TfidfTransformer()
    hashing_vectorizer = HashingVectorizer(lowercase=False,
                                           tokenizer=tokenizer,
                                           n_features=n_features)

    # Initialize SVD-truncated vectorizers
    svd_bow_vectorizer = TruncatedSVD(n_components=svd_components)
    svd_tfidf_vectorizer = TruncatedSVD(n_components=svd_components)
    svd_hashing_vectorizer = TruncatedSVD(n_components=svd_components)

    # Fit vectorizers and transform train data
    x_train_bow = bow_vectorizer.fit_transform(dataset['train']['text'])
    x_train_tfidf = tfidf_vectorizer.fit_transform(x_train_bow)
    x_train_hashing = hashing_vectorizer.fit_transform(dataset['train']['text'])

    # Fit SVD-truncated vectorizers and transform train data
    x_train_svd_bow = svd_bow_vectorizer.fit_transform(x_train_bow)
    x_train_svd_tfidf = svd_tfidf_vectorizer.fit_transform(x_train_tfidf)
    x_train_svd_hashing = svd_hashing_vectorizer.fit_transform(x_train_hashing)

    # Transform validation and test data
    x_valid_bow = bow_vectorizer.transform(dataset['validation']['text'])
    x_valid_tfidf = tfidf_vectorizer.transform(x_valid_bow)
    x_valid_hashing = hashing_vectorizer.transform(dataset['validation']['text'])
    x_test_bow = bow_vectorizer.transform(dataset['test']['text'])
    x_test_tfidf = tfidf_vectorizer.transform(x_test_bow)
    x_test_hashing = hashing_vectorizer.transform(dataset['test']['text'])

    # Transform validation and test data for SVD-truncated vectorizers
    x_valid_svd_bow = svd_bow_vectorizer.transform(x_valid_bow)
    x_valid_svd_tfidf = svd_tfidf_vectorizer.transform(x_valid_tfidf)
    x_valid_svd_hashing = svd_hashing_vectorizer.transform(x_valid_hashing)
    x_test_svd_bow = svd_bow_vectorizer.transform(x_test_bow)
    x_test_svd_tfidf = svd_tfidf_vectorizer.transform(x_test_tfidf)
    x_test_svd_hashing = svd_hashing_vectorizer.transform(x_test_hashing)
    
    # Form output
    output = (
        bow_vectorizer, x_train_bow, x_valid_bow, x_test_bow,
        tfidf_vectorizer, x_train_tfidf, x_valid_tfidf, x_test_tfidf,
        hashing_vectorizer, x_train_hashing, x_valid_hashing, x_test_hashing,
        svd_bow_vectorizer, x_train_svd_bow, x_valid_svd_bow, x_test_svd_bow,
        svd_tfidf_vectorizer, x_train_svd_tfidf, x_valid_svd_tfidf, x_test_svd_tfidf,
        svd_hashing_vectorizer, x_train_svd_hashing, x_valid_svd_hashing, x_test_svd_hashing,
    )

    # Save vectorizers and transformed data
    if saving:
        with open(file, 'wb') as f:
            pickle.dump(output, f, pickle.HIGHEST_PROTOCOL)

# Print shapes
print('Full size data shapes:')
print('BOW:        ', x_train_bow.shape, x_valid_bow.shape, x_test_bow.shape)
print('TF-IDF:     ', x_train_tfidf.shape, x_valid_tfidf.shape, x_test_tfidf.shape)
print('Hashing BOW:', x_train_hashing.shape, x_valid_hashing.shape, x_test_hashing.shape)
print()
print('SVD-truncated data shapes:')
print('BOW:        ', x_train_svd_bow.shape, x_valid_svd_bow.shape, x_test_svd_bow.shape)
print('TF-IDF:     ', x_train_svd_tfidf.shape, x_valid_svd_tfidf.shape, x_test_svd_tfidf.shape)
print('Hashing BOW:', x_train_svd_hashing.shape, x_valid_svd_hashing.shape, x_test_svd_hashing.shape)
print()

# Print SVD explained variance
print('Explained variance for SVD:')
print(f'BOW:         {(svd_bow_vectorizer.explained_variance_ratio_.sum() * 100):.2f} %')
print(f'TF-IDF:      {(svd_tfidf_vectorizer.explained_variance_ratio_.sum() * 100):.2f} %')
print(f'Hashing BOW: {(svd_hashing_vectorizer.explained_variance_ratio_.sum() * 100):.2f} %')
print()

Full size data shapes:
BOW:         (45615, 50000) (2000, 50000) (12284, 50000)
TF-IDF:      (45615, 50000) (2000, 50000) (12284, 50000)
Hashing BOW: (45615, 50000) (2000, 50000) (12284, 50000)

SVD-truncated data shapes:
BOW:         (45615, 100) (2000, 100) (12284, 100)
TF-IDF:      (45615, 100) (2000, 100) (12284, 100)
Hashing BOW: (45615, 100) (2000, 100) (12284, 100)

Explained variance for SVD:
BOW:         51.53 %
TF-IDF:      16.50 %
Hashing BOW: 41.75 %

CPU times: user 24.7 ms, sys: 97.8 ms, total: 122 ms
Wall time: 187 ms


In [17]:
example = dataset['train']['text'][:2]

# Vectorize the two example documents with every fitted vectorizer
bow_output = bow_vectorizer.transform(example)
hashing_output = hashing_vectorizer.transform(example)
tfidf_output = tfidf_vectorizer.transform(bow_output)
svd_bow_output = svd_bow_vectorizer.transform(bow_output)
svd_tfidf_output = svd_tfidf_vectorizer.transform(tfidf_output)
svd_hashing_output = svd_hashing_vectorizer.transform(hashing_output)

# (title, preview, shape) per representation: sparse outputs are shown densified,
# SVD outputs by the first six components of the first document
previews = [
    ('BOW:', bow_output.todense(), bow_output.shape),
    ('TF-IDF:', tfidf_output.todense(), tfidf_output.shape),
    ('Hashing BOW:', hashing_output.todense(), hashing_output.shape),
    ('SVD BOW:', svd_bow_output[0][:6], svd_bow_output.shape),
    ('SVD TF-IDF:', svd_tfidf_output[0][:6], svd_tfidf_output.shape),
    ('SVD Hashing BOW:', svd_hashing_output[0][:6], svd_hashing_output.shape),
]

# Print document vectors and their shapes, each section preceded by a blank line
print('Document vector examples and their shapes:')
for title, preview, dims in previews:
    print()
    print(title)
    print(preview)
    print(dims)

Document vector examples and their shapes:

BOW:
[[0 2 0 ... 0 0 0]
 [0 2 0 ... 0 0 0]]
(2, 50000)

TF-IDF:
[[0.         0.1592305  0.         ... 0.         0.         0.        ]
 [0.         0.16949097 0.         ... 0.         0.         0.        ]]
(2, 50000)

Hashing BOW:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(2, 50000)

SVD BOW:
[ 2.19219037  0.02814938 -0.37149003  0.46869895  0.77413124 -0.27684479]
(2, 100)

SVD TF-IDF:
[ 0.16403713 -0.06192932  0.00118506  0.05926234 -0.01728777 -0.0282153 ]
(2, 100)

SVD Hashing BOW:
[ 0.51587004 -0.1754948   0.05378128  0.20697629 -0.04919744 -0.03881921]
(2, 100)


## Token vectorizers
### Word2Vec

In [19]:
%%time
vector_size = 100
file = WORKING_PATH + 'word2vec_' + str(vector_size) + '.gensim'
saving = True

# Load model if it already exists
if os.path.isfile(file):
    word2vec = gensim.models.Word2Vec.load(file)

else:
    # Initialize tokenizer wiht corpus in it
    tokenizer = sentiment_utils.Tokenizer()
    tokenizer.corpus = dataset['train']['text']

    # Train the model
    word2vec = gensim.models.Word2Vec(
        sentences=tokenizer, min_count=5, vector_size=vector_size, sg=1, hs=0, negative=5,
        workers=7, epochs=5, seed=RANDOM_STATE,
    )

    # Save the model
    if saving:
        word2vec.save(file)

# Print vocabulary shape
print('Vocabulary shape:')
print((len(word2vec.wv.index_to_key), vector_size))
print()

# Print most frequent words
print('Most frequent words:')
for word in word2vec.wv.index_to_key[:20]:
    print(word)
print()

Vocabulary shape:
(8865, 100)

Most frequent words:
"
'
,
@user
!
.
:
...
may
tomorrow
?
go
day
)
see
get
;
night
-
(

CPU times: user 49.6 ms, sys: 5.1 ms, total: 54.7 ms
Wall time: 54.9 ms


In [20]:
# Print the vector example
print('A vector example:')
print(word2vec.wv['@user'])

A vector example:
[-1.7856718e-04 -7.1627498e-03  5.6681838e-03  3.3616947e-03
 -4.3037655e-03 -9.9052377e-03  7.8344531e-03 -3.0744076e-03
 -4.9395845e-03  2.8602481e-03  3.4456314e-03  7.0434297e-03
  4.8328913e-03 -2.2578656e-03  9.6133817e-03  6.0931980e-03
 -6.9454909e-05 -7.1364711e-03 -4.3063499e-03 -4.0867887e-03
  7.2207870e-03  1.9087791e-04  8.7932190e-03  3.0965197e-03
  8.6710928e-03 -1.7151153e-03 -3.0472302e-03  2.3653186e-03
 -1.0201788e-03 -1.1647308e-03  6.5529346e-04  8.5652145e-03
  6.0522771e-03  7.2767651e-03 -1.3941026e-03 -3.9289714e-04
  8.7667480e-03  3.2828213e-04  7.9442039e-03 -5.4157972e-03
 -1.1028457e-03  8.5191736e-03  5.3559565e-03 -2.3469485e-03
  1.3705313e-03 -8.5036922e-03 -1.4750564e-03 -1.0823035e-03
  9.5225284e-03  7.1474756e-03 -9.3324063e-04  3.3289576e-03
  4.4522537e-03  1.0433340e-03  9.0771206e-03  3.6466957e-04
 -3.4984981e-03  9.3069943e-03  9.0731289e-03  2.9107772e-03
 -2.8863156e-03 -8.8340221e-03  4.6982062e-03  2.8851735e-03
  5.58

### RobertaTokenizerFast

In [21]:
# Arbitrary length vectorizers (ALV): the tokenizer that ships with the pretrained checkpoint
alv_pretrained = AutoTokenizer.from_pretrained(HUGGINGFACE_MODEL_NAME)

# For the first two training texts, print the raw text and the token ids obtained
# after preprocessing.
# NOTE(review): in the recorded output each tweet's ids appear twice within a single
# sequence — check what sentiment_utils.preprocess_text returns for a single string.
for text in dataset['train']['text'][:2]:
    print(text)
    preprocessed_text = sentiment_utils.preprocess_text(text)
    print(alv_pretrained(preprocessed_text, return_tensors='pt')['input_ids'], end='\n\n')

"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"
tensor([[    0,   113,  1864,   565,   787, 12105,    96,     5,  1461,  2479,
             9,     5,   262,   212,  1040,     6,  8022,   687, 26110,   179,
          5601,     5,  9846,     9, 42210,     4,   849, 21136, 44728,  1208,
         31157,   687,   574,   658,   179,   113,    22,  1864,   565,   787,
         12105,    96,     5,  1461,  2479,     9,     5,   262,   212,  1040,
             6,  8022,   687, 26110,   179,  5601,     5,  9846,     9, 42210,
             4,   849, 21136, 44728,  1208, 31157,   687,   574,   658,   179,
           113,     2]])

"Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ"
tensor([[    0,   113, 17521,  1259,  1589,  1259,    36,  3865, 33825,    43,
          1189,    66,     9,     5,  4451,   296,     6, 11292,   849,   487,
          8064,   849,   104,   863,   113,    22, 17521, 

In [22]:
# Show which concrete tokenizer class AutoTokenizer.from_pretrained returned
type(alv_pretrained)

transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast

In [23]:
%%time
# To process the whole corpus
def preprocess_function(examples):
    preprocessed_text = sentiment_utils.preprocess_text(examples['text'])
    return alv_pretrained(preprocessed_text)

dataset_alv = dataset.map(preprocess_function, batched=True)

dataset_alv

Map:   0%|          | 0/45615 [00:00<?, ? examples/s]

Map:   0%|          | 0/12284 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

CPU times: user 8.98 s, sys: 125 ms, total: 9.1 s
Wall time: 2.13 s


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})