In [1]:
# Load IPython's autoreload extension for this kernel session.
%load_ext autoreload
# Mode 1: before running code, reload only the modules registered via %aimport.
%autoreload 1
# Import sentiment_utils and register it for automatic reloading.
%aimport sentiment_utils

[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Standard library
import sys  # required by the version printout below; was missing on a fresh kernel

# Third-party
import datasets
import pandas as pd
# import sklearn
from tqdm import tqdm
import torch
from transformers import AutoTokenizer

# Record interpreter and library versions next to the results for reproducibility.
print('python:'.ljust(16), sys.version.split('\n')[0])
# print('Scikit-learn:'.ljust(16), sklearn.__version__)
print('PyTorch:'.ljust(16), torch.__version__)

python:          3.10.11 (main, Apr  7 2023, 07:24:53) [Clang 14.0.0 (clang-1400.0.29.202)]
PyTorch:         1.13.1


## Device

In [3]:
# Choose the compute device: the GPU when CUDA is usable, the CPU otherwise
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'
print(f'Using {DEVICE} device')

Using cpu device


## Hyperparameters & Constants

In [4]:
# Hyperparameters (all currently commented out)
# BATCH_SIZE = 64
# EPOCHS = 15  # select from: 2**n - 1 = [1, 3, 7, 15, ...]
# SCHEDULER_GAMMA = 0.7

# Constants: dataset name and configuration handed to datasets.load_dataset
DATASET_NAME, DATASET_CONF = 'tweet_eval', 'sentiment'
# Number of target classes (presumably the sentiment labels of the dataset above)
CLASSES = 3

# Constants: checkpoint name used to load the pretrained tokenizer
HUGGINGFACE_MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

# Constants: filesystem locations
WORKING_PATH = './sentiment-data/'  # prefix of the vectorizer pickle path
MODEL_PATH = '../app/models/'

# Actions
# DO_LR_RANGE_TEST=False

## Random state

In [5]:
# One fixed seed shared by every torch random number generator (CPU and CUDA)
RANDOM_STATE = 2147483647
# random.seed(RANDOM_STATE)
# np.random.seed(RANDOM_STATE)
for seed_rng in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(RANDOM_STATE)

## Load & show data

In [6]:
# Load the configured dataset and display the resulting DatasetDict
dataset = datasets.load_dataset(path=DATASET_NAME, name=DATASET_CONF)
dataset

Found cached dataset tweet_eval (/Users/admin/.cache/huggingface/datasets/tweet_eval/sentiment/1.1.0/12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 45615
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 12284
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [7]:
# Preview the first five training rows (returned as a dict of column lists)
dataset['train'][:5]

{'text': ['"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"',
  '"Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ"',
  'Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.',
  "Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays",
  '@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017"'],
 'label': [2, 1, 1, 1, 2]}

## Tokenization

In [8]:
# Show what the custom tokenizer does to a few raw training tweets.
tokenizer = sentiment_utils.Tokenizer()

# Slice the rows first, then select the column: only five rows are read,
# instead of materializing the entire 'text' column just to keep five items.
for text in dataset['train'][:5]['text']:
    print(text)
    print(tokenizer(text, return_str=True))
    print()

"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"
" qt @user origin draft 7th book , remu lupin surviv battl hogwarts. #happybirthdayremuslupin "

"Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ"
" ben smith / smith ( concuss ) remain lineup thursday , curti #nhl #sj "

Sorry bout the stream last night I crashed out but will be on tonight for sure. Then back to Minecraft in pc tomorrow night.
sorri bout stream last night crash tonight sure. back minecraft pc tomorrow night .

Chase Headley's RBI double in the 8th inning off David Price snapped a Yankees streak of 33 consecutive scoreless innings against Blue Jays
chase headley ' rbi doubl 8th inning david price snap yanke streak 33 consecut scoreless inning blue jay

@user Alciato: Bee will invest 150 million in January, another 200 in the Summer and plans to bring Messi by 2017"
@user alciato : bee invest 150 million januari , ano

## Vectorization
### Document vectorizers

In [20]:
%%time
n_features = 50000
svd_components = 100

# Get vectorizers and data
(
    bow_vectorizer, x_train_bow, x_valid_bow, x_test_bow,
    tfidf_vectorizer, x_train_tfidf, x_valid_tfidf, x_test_tfidf,
    hashing_vectorizer, x_train_hashing, x_valid_hashing, x_test_hashing,
    svd_bow_vectorizer, x_train_svd_bow, x_valid_svd_bow, x_test_svd_bow,
    svd_tfidf_vectorizer, x_train_svd_tfidf, x_valid_svd_tfidf, x_test_svd_tfidf,
    svd_hashing_vectorizer, x_train_svd_hashing, x_valid_svd_hashing, x_test_svd_hashing,
) = sentiment_utils.get_bow_and_tfifd(
    dataset,
    n_features,
    svd_components,
    file=WORKING_PATH + 'bow_tfidf_' + str(n_features) + '_' + str(svd_components) + '.pickle',
    saving=True,
)

# Print shapes
print('Full size data shapes:')
print('BOW:        ', x_train_bow.shape, x_valid_bow.shape, x_test_bow.shape)
print('TFIDF:      ', x_train_tfidf.shape, x_valid_tfidf.shape, x_test_tfidf.shape)
print('Hashing BOW:', x_train_hashing.shape, x_valid_hashing.shape, x_test_hashing.shape)
print()
print('SVD-truncated data shapes:')
print('BOW:        ', x_train_svd_bow.shape, x_valid_svd_bow.shape, x_test_svd_bow.shape)
print('TFIDF:      ', x_train_svd_tfidf.shape, x_valid_svd_tfidf.shape, x_test_svd_tfidf.shape)
print('Hashing BOW:', x_train_svd_hashing.shape, x_valid_svd_hashing.shape, x_test_svd_hashing.shape)
print()
print('Explained variance for SVD:')
print(f'BOW:         {(svd_bow_vectorizer.explained_variance_ratio_.sum() * 100):.2f} %')
print(f'TFIDF:       {(svd_tfidf_vectorizer.explained_variance_ratio_.sum() * 100):.2f} %')
print(f'Hashing BOW: {(svd_hashing_vectorizer.explained_variance_ratio_.sum() * 100):.2f} %')
print()

Full size data shapes:
BOW:         (45615, 50000) (2000, 50000) (12284, 50000)
TFIDF:       (45615, 50000) (2000, 50000) (12284, 50000)
Hashing BOW: (45615, 50000) (2000, 50000) (12284, 50000)

SVD-truncated data shapes:
BOW:         (45615, 100) (2000, 100) (12284, 100)
TFIDF:       (45615, 100) (2000, 100) (12284, 100)
Hashing BOW: (45615, 100) (2000, 100) (12284, 100)

Explained variance for SVD:
BOW:         51.53 %
TFIDF:       16.50 %
Hashing BOW: 41.75 %

CPU times: user 27.7 ms, sys: 83.3 ms, total: 111 ms
Wall time: 166 ms


In [14]:
# Smoke-test every fitted vectorizer on a single toy document.
text = ['Hello, world!']

# Raw text goes into the BOW and hashing vectorizers; TF-IDF is applied on top
# of the BOW counts, and each SVD projects its matching full-size vector.
bow_output = bow_vectorizer.transform(text)
hashing_output = hashing_vectorizer.transform(text)
tfidf_output = tfidf_vectorizer.transform(bow_output)
svd_bow_output = svd_bow_vectorizer.transform(bow_output)
svd_tfidf_output = svd_tfidf_vectorizer.transform(tfidf_output)
svd_hashing_output = svd_hashing_vectorizer.transform(hashing_output)

# Full-size vectors: dense view followed by the shape
for sparse_vec in (bow_output, tfidf_output, hashing_output):
    print(sparse_vec.todense())
    print(sparse_vec.shape)
print()
# SVD-truncated vectors: first six components followed by the shape
for dense_vec in (svd_bow_output, svd_tfidf_output, svd_hashing_output):
    print(dense_vec[0][:6])
    print(dense_vec.shape)

[[1 0 0 ... 0 0 0]]
(1, 50000)
[[0.26755602 0.         0.         ... 0.         0.         0.        ]]
(1, 50000)
[[0. 0. 0. ... 0. 0. 0.]]
(1, 50000)

[ 0.30957865  1.00127087  0.11494237  0.87701774 -0.22821021 -0.21423857]
(1, 100)
[ 0.14219744  0.17554127  0.14005281 -0.0453262  -0.15248644 -0.09945275]
(1, 100)
[ 0.3206325   0.18681136  0.3500979  -0.13938562  0.38438379 -0.25106814]
(1, 100)


### Arbitrary length vectorizers

In [74]:
# Arbitrary length vectorizers (ALV)
# Tokenizer that ships with the pretrained checkpoint; it yields one token id
# per sub-word, so the encoded length follows the length of the tweet.
alv_pretrained = AutoTokenizer.from_pretrained(HUGGINGFACE_MODEL_NAME)

# Slice the rows first, then select the column: only two rows are read,
# instead of materializing the entire 'text' column just to keep two items.
for text in dataset['train'][:2]['text']:
    print(text)
    preprocessed_text = sentiment_utils.preprocess_text(text)
    # NOTE(review): in the recorded output each tweet's token ids appear twice
    # in a row, so preprocess_text seems to return the text duplicated.
    # Verify sentiment_utils.preprocess_text.
    print(alv_pretrained(preprocessed_text, return_tensors='pt')['input_ids'])
    print()

"QT @user In the original draft of the 7th book, Remus Lupin survived the Battle of Hogwarts. #HappyBirthdayRemusLupin"
tensor([[    0,   113,  1864,   565,   787, 12105,    96,     5,  1461,  2479,
             9,     5,   262,   212,  1040,     6,  8022,   687, 26110,   179,
          5601,     5,  9846,     9, 42210,     4,   849, 21136, 44728,  1208,
         31157,   687,   574,   658,   179,   113,    22,  1864,   565,   787,
         12105,    96,     5,  1461,  2479,     9,     5,   262,   212,  1040,
             6,  8022,   687, 26110,   179,  5601,     5,  9846,     9, 42210,
             4,   849, 21136, 44728,  1208, 31157,   687,   574,   658,   179,
           113,     2]])

"Ben Smith / Smith (concussion) remains out of the lineup Thursday, Curtis #NHL #SJ"
tensor([[    0,   113, 17521,  1259,  1589,  1259,    36,  3865, 33825,    43,
          1189,    66,     9,     5,  4451,   296,     6, 11292,   849,   487,
          8064,   849,   104,   863,   113,    22, 17521, 

In [None]:
%%time
