In [1]:
import nltk
from nltk.corpus import gutenberg, webtext, reuters
import random
from lib.process_text import clean_text, tokenize_document, remove_stopwords, lemmatize_doc
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score

# Download the corpora
nltk.download('gutenberg')
nltk.download('webtext')
nltk.download('reuters')
nltk.download('stopwords')

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\leocb\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package webtext to
[nltk_data]     C:\Users\leocb\AppData\Roaming\nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\leocb\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leocb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# gutenberg
sentences_gutenberg = []
for fid in range(8):
    story = gutenberg.fileids()[fid]
    sentences_gutenberg.extend(gutenberg.sents(story))
    
print(len(sentences_gutenberg))

52659


In [3]:
sentences_webtext = webtext.sents()
print(sentences_webtext)
len(sentences_webtext)

[['Cookie', 'Manager', ':', '"', 'Don', "'", 't', 'allow', 'sites', 'that', 'set', 'removed', 'cookies', 'to', 'set', 'future', 'cookies', '"', 'should', 'stay', 'checked', 'When', 'in', 'full', 'screen', 'mode', 'Pressing', 'Ctrl', '-', 'N', 'should', 'open', 'a', 'new', 'browser', 'when', 'only', 'download', 'dialog', 'is', 'left', 'open', 'add', 'icons', 'to', 'context', 'menu', 'So', 'called', '"', 'tab', 'bar', '"', 'should', 'be', 'made', 'a', 'proper', 'toolbar', 'or', 'given', 'the', 'ability', 'collapse', '/', 'expand', '.'], ['[', 'XUL', ']', 'Implement', 'Cocoa', '-', 'style', 'toolbar', 'customization', '.'], ...]


25733

In [4]:
sentences_reuters = reuters.sents()
print(sentences_reuters)
len(sentences_reuters)

[['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'FROM', 'U', '.', 'S', '.-', 'JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'between', 'the', 'U', '.', 'S', '.', 'And', 'Japan', 'has', 'raised', 'fears', 'among', 'many', 'of', 'Asia', "'", 's', 'exporting', 'nations', 'that', 'the', 'row', 'could', 'inflict', 'far', '-', 'reaching', 'economic', 'damage', ',', 'businessmen', 'and', 'officials', 'said', '.'], ['They', 'told', 'Reuter', 'correspondents', 'in', 'Asian', 'capitals', 'a', 'U', '.', 'S', '.', 'Move', 'against', 'Japan', 'might', 'boost', 'protectionist', 'sentiment', 'in', 'the', 'U', '.', 'S', '.', 'And', 'lead', 'to', 'curbs', 'on', 'American', 'imports', 'of', 'their', 'products', '.'], ...]


54716

In [5]:
all_sentences = list(sentences_gutenberg + sentences_webtext + sentences_reuters)
random.seed(123)
sentences_random = random.sample(all_sentences, 9713)
print(len(sentences_random))

9713


In [6]:
# clean text
sentences = [clean_text(" ".join(l)) for l in sentences_random]
print(sentences[:10])

['as dinner was not to be ready in less than two hours from their arrival elinor determined to employ the interval in writing to her mother and sat down for that purpose ', 'do you want money or a bagel ', ' and he ran unto eli and said here am i for thou calledst me ', 'conchemco inc lt ckc sets quarterly qtly div cts vs cts prior pay april six record march ', 'that doesn t make him a social worker ', ' after him repaired nehemiah the son of azbuk the ruler of the half part of bethzur unto the place over against the sepulchres of david and to the pool that was made and unto the house of the mighty ', 'she always watched them as long as she could delighted to fancy she understood what they might be talking of as they walked along in happy independence or equally delighted to see the admiral s hearty shake of the hand when he encountered an old friend and observe their eagerness of conversation when occasionally forming into a little knot of the navy mrs croft looking as intelligent and

In [7]:
# tokenize
token_docs = []
for doc in sentences:
    token_docs.append(tokenize_document(doc))
print(token_docs[0])

['as', 'dinner', 'was', 'not', 'to', 'be', 'ready', 'in', 'less', 'than', 'two', 'hours', 'from', 'their', 'arrival', 'elinor', 'determined', 'to', 'employ', 'the', 'interval', 'in', 'writing', 'to', 'her', 'mother', 'and', 'sat', 'down', 'for', 'that', 'purpose']


In [8]:
# remove stop-words
sentences = remove_stopwords(token_docs)
print(sentences[0])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leocb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
100%|██████████| 9713/9713 [00:00<00:00, 38066.55it/s]

['dinner', 'ready', 'less', 'two', 'hours', 'arrival', 'elinor', 'determined', 'employ', 'interval', 'writing', 'mother', 'sat', 'purpose']





In [9]:
# lemmatize
not_amazon_reviews = []

for doc in sentences:
    not_amazon_reviews.append(lemmatize_doc(doc))

print(not_amazon_reviews[0])

['dinner', 'ready', 'le', 'two', 'hour', 'arrival', 'elinor', 'determined', 'employ', 'interval', 'writing', 'mother', 'sat', 'purpose']


In [10]:
# load amazon reviews
df_amazon_reviews = pd.read_parquet('data/amazon_reviews.parquet')
df_amazon_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9713 entries, 0 to 9712
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Ratings        9713 non-null   int64 
 1   Comment        9713 non-null   object
 2   Review_tokens  9713 non-null   object
dtypes: int64(1), object(2)
memory usage: 227.8+ KB


In [11]:
# create training and test sets
len(not_amazon_reviews)

y = [0]*len(not_amazon_reviews) + [1]*len(df_amazon_reviews)

In [12]:
len(not_amazon_reviews)

9713

In [13]:
amazon_reviews = df_amazon_reviews["Review_tokens"]
not_amazon_reviews = pd.Series(not_amazon_reviews)

In [14]:
assert len(amazon_reviews) + len(not_amazon_reviews) == len(y)

In [15]:
df_reviews = pd.DataFrame(columns=["Sentence","Label"])
df_reviews["Sentence"] = pd.concat([not_amazon_reviews,amazon_reviews])
df_reviews["Label"] = y
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19426 entries, 0 to 9712
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  19426 non-null  object
 1   Label     19426 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 455.3+ KB


In [16]:
print(df_reviews)

                                               Sentence  Label
0     [dinner, ready, le, two, hour, arrival, elinor...      0
1                                  [want, money, bagel]      0
2                [ran, unto, eli, said, thou, calledst]      0
3     [conchemco, inc, lt, ckc, set, quarterly, qtly...      0
4                                [make, social, worker]      0
...                                                 ...    ...
9708                            [absolutely, brilliant]      1
9709  [superb, phone, th, iphone, feel, se, thinnest...      1
9710                                             [nice]      1
9711                            [loving, good, product]      1
9712                            [niceelegant, electric]      1

[19426 rows x 2 columns]


In [17]:
# shuffle the DataFrame's rows
df_reviews_shuffle = df_reviews.sample(frac=1, random_state=123)

# Reset the index of the shuffled DataFrame
df_reviews_shuffle = df_reviews_shuffle.reset_index(drop=True)

print(df_reviews_shuffle)

                                                Sentence  Label
0                                                 [nice]      1
1      [man, cell, baby, butter, thang, cause, coming...      0
2      [performance, phone, good, problem, batteryit,...      1
3                              [nice, one, online, fast]      1
4              [old, man, hiccup, old, lady, kill, dead]      0
...                                                  ...    ...
19421                            [make, better, andriod]      1
19422  [cloud, tarried, long, upon, tabernacle, many,...      0
19423                                      [good, phone]      1
19424                                             [good]      1
19425  [excellent, phone, latest, feature, iphone, se...      1

[19426 rows x 2 columns]


In [18]:
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.utils.data import DataLoader
from transformers import AdamW

# load the DistilBERT tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# split with stratification
train_texts, test_texts, train_labels, test_labels = train_test_split(list(df_reviews_shuffle['Sentence']), list(df_reviews_shuffle['Label']), stratify=list(df_reviews_shuffle['Label']), test_size=0.2)

# untokenize
train_texts_joined = [' '.join(tokens) for tokens in train_texts]
test_texts_joined = [' '.join(tokens) for tokens in test_texts]

# convert texts to BERT input format with padding and truncation
train_encodings = tokenizer(train_texts_joined, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts_joined, truncation=True, padding=True, max_length=512)


  from .autonotebook import tqdm as notebook_tqdm


In [19]:
print(type(train_encodings))
print(type(test_encodings))

<class 'transformers.tokenization_utils_base.BatchEncoding'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>


In [20]:
print(train_encodings.keys())
print(test_encodings.keys())

dict_keys(['input_ids', 'attention_mask'])
dict_keys(['input_ids', 'attention_mask'])


In [21]:
print(len(train_encodings['input_ids'][0]))
print(len(test_encodings['input_ids'][0]))

344
238


In [22]:
print(train_encodings['input_ids'][0][:10])
print(test_encodings['attention_mask'][0][:10])

[101, 18059, 7367, 1050, 2094, 8991, 5294, 2224, 13151, 3674]
[1, 1, 1, 1, 1, 0, 0, 0, 0, 0]


In [23]:
# convert encodings to PyTorch dataset
import torch
from torch.utils.data import Dataset

class ReviewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ReviewsDataset(train_encodings, train_labels)
test_dataset = ReviewsDataset(test_encodings, test_labels)

In [24]:
# initialize distilbert model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# train model
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=128,  # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset            # evaluation dataset
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 69%|██████▊   | 500/729 [4:49:55<2:09:46, 34.00s/it]

{'loss': 0.1537, 'learning_rate': 5e-05, 'epoch': 2.06}


100%|██████████| 729/729 [7:05:09<00:00, 34.99s/it]  

{'train_runtime': 25509.3695, 'train_samples_per_second': 1.828, 'train_steps_per_second': 0.029, 'train_loss': 0.11257640160977922, 'epoch': 3.0}





TrainOutput(global_step=729, training_loss=0.11257640160977922, metrics={'train_runtime': 25509.3695, 'train_samples_per_second': 1.828, 'train_steps_per_second': 0.029, 'train_loss': 0.11257640160977922, 'epoch': 3.0})

In [25]:
# save model
trainer.save_model("data/review_classifier")

# save tokenizer
tokenizer.save_pretrained("data/review_classifier_tokenizer")

('data/review_classifier_tokenizer\\tokenizer_config.json',
 'data/review_classifier_tokenizer\\special_tokens_map.json',
 'data/review_classifier_tokenizer\\vocab.txt',
 'data/review_classifier_tokenizer\\added_tokens.json',
 'data/review_classifier_tokenizer\\tokenizer.json')

In [26]:
trainer.evaluate()

100%|██████████| 31/31 [06:35<00:00, 12.77s/it]


{'eval_loss': 0.04180752858519554,
 'eval_runtime': 409.474,
 'eval_samples_per_second': 9.49,
 'eval_steps_per_second': 0.076,
 'epoch': 3.0}

In [34]:
# get predictions
predictions = trainer.predict(test_dataset)

# the predictions are in a tuple with the first item being the prediction scores
prediction_scores = predictions[0]

# the scores are in the format of (num_samples, num_classes), 
# and we need to get the score for the positive class
positive_class_scores = prediction_scores[:, 1]

# calculate AUC-ROC
auc_roc = roc_auc_score(test_dataset.labels, positive_class_scores)

print(f"AUC-ROC: {auc_roc}")

# Get the predicted classes
preds = predictions[0].argmax(-1)

# Calculate metrics
accuracy = accuracy_score(test_dataset.labels, preds)
precision = precision_score(test_dataset.labels, preds)
recall = recall_score(test_dataset.labels, preds)
f1 = f1_score(test_dataset.labels, preds)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

100%|██████████| 31/31 [06:46<00:00, 13.10s/it]

AUC-ROC: 0.999091847981418
Accuracy: 0.9866186309830159
Precision: 0.9851205746536685
Recall: 0.9881626351003603
F1-score: 0.986639260020555





In [69]:
# save tokenizer
from transformers import AutoTokenizer, AutoModel
from nltk.corpus import stopwords

text = "This iPhone is great, I love this product!"

tokenizer = AutoTokenizer.from_pretrained("data/review_classifier_tokenizer")

tokenized_text = tokenize_document(text)
print(tokenized_text)

def remove_stopwords_sentence(tokens):
    for word in tokens:
        if word in stopwords.words('english'):
            tokens.remove(word)
    
    return tokens

text_nostop = remove_stopwords_sentence(tokenized_text)
print(text_nostop)

text_lemmatized = lemmatize_doc(text_nostop)
print(text_lemmatized)

tokens_bert = tokenizer(' '.join(text_lemmatized), return_tensors='pt')
print(tokens_bert)

['This', 'book', 'is', 'pretty', 'good', ',', 'I', 'recommend', 'it']
['This', 'book', 'pretty', 'good', ',', 'I', 'recommend']
['This', 'book', 'pretty', 'good', ',', 'I', 'recommend']
{'input_ids': tensor([[  101,  2023,  2338,  3492,  2204,  1010,  1045, 16755,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [70]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("data/review_classifier_tokenizer")

# Load the model
model_load = AutoModelForSequenceClassification.from_pretrained("data/review_classifier")

# Get the model's predictions
predictions = model_load(**tokens_bert)

print(predictions.logits)

import torch.nn.functional as F

# Apply softmax to output
probabilities = F.softmax(predictions.logits, dim=-1)

print(probabilities)

import torch

# Apply softmax to output to get probabilities
probabilities = torch.nn.functional.softmax(predictions.logits, dim=-1)

# Get the predicted class
predicted_class = torch.argmax(probabilities, dim=-1)

print(predicted_class)


tensor([[-1.1112,  0.6538]], grad_fn=<AddmmBackward0>)
tensor([[0.1462, 0.8538]], grad_fn=<SoftmaxBackward0>)
tensor([1])


In [79]:
float(max(probabilities[0]))

0.8538343906402588

In [80]:
int(predicted_class)

1