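**Project 3: Amazon reviews sentiment analysis.** This dataset consists of a few million Amazon customer reviews (input text) and star
ratings (output labels), originally published in fastText format for learning sentiment analysis.
This notebook fine-tunes a BERT classifier (`bert-base-uncased`) on the first 100,000 reviews of the training file.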

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [None]:
# Download the Amazon reviews archive from Kaggle (the recorded run saved amazonreviews.zip to /content).
!kaggle datasets download -d bittlingmayer/amazonreviews

Downloading amazonreviews.zip to /content
 97% 480M/493M [00:04<00:00, 131MB/s]
100% 493M/493M [00:05<00:00, 103MB/s]


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import bz2

In [None]:
import zipfile

# Unpack the downloaded archive. The context manager closes the zip file even if
# extraction raises, which the previous explicit open()/close() pair did not.
with zipfile.ZipFile('/content/amazonreviews.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [None]:
# Open the bz2-compressed training file; load_extract iterates it line by line as bytes.
# NOTE(review): no visible cell closes this handle.
data_file = bz2.BZ2File('/content/train.ft.txt.bz2')


In [None]:
def load_extract(file, label_prefix='__label__'):
    """Parse a fastText-formatted byte stream into labels and review texts.

    Every non-blank line is expected to look like ``__label__<N> <text>``.
    The label is parsed from the tag itself rather than from a fixed character
    offset, so blank lines are skipped and multi-digit labels are handled.

    Args:
        file: Iterable yielding UTF-8 encoded ``bytes`` lines (e.g. a BZ2File).
        label_prefix: Prefix that precedes the numeric label on every line.

    Returns:
        A tuple ``(labels, texts)`` where ``labels`` is a NumPy array holding
        ``N - 1`` for each line (so labels 1/2 become 0/1) and ``texts`` is a
        list of the stripped review strings, in file order.

    Raises:
        ValueError: If a non-blank line does not start with ``label_prefix``
            or its label is not an integer.
    """
    texts, labels = [], []
    for line in file:
        x = line.decode('utf-8').strip()
        if not x:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        tag, _, text = x.partition(' ')
        if not tag.startswith(label_prefix):
            raise ValueError(f'Line does not start with {label_prefix!r}: {x[:50]!r}')
        # Shift labels down by one so they start at 0.
        labels.append(int(tag[len(label_prefix):]) - 1)
        texts.append(text.strip())
    print('Done !')
    return np.array(labels), texts

In [None]:
# Parse the whole training file into a label array and a list of review strings.
labels, texts = load_extract(data_file)

Done !


In [None]:
# Look at a single parsed review to sanity-check the extracted text.
texts[996]

'Brain Based Learning: The New Paradigm of Teaching: I am currently reading the Kindle version of this book. The book itself is very easy to read and full of material to try in the class room. Unfortunately, there are no page numbers that correspond to the paper version of the book. As this is a text book, it is impossible to cite material and the page from which it was taken. This is really unacceptable for a text book. It is one thing if it were a literature book, and even then, it would be better to have the physical page number of the actual book. If I had known that there were no page numbers contained in this Kindle version, I would NOT have purchased it. I would have purchased a used hard copy.'

In [None]:
# Number of reviews parsed from the file.
len(texts)

3600000

In [None]:
# Distinct label values present after the "minus one" shift in load_extract.
set(labels)

{0, 1}

In [None]:
# Assemble the parsed columns into a DataFrame. The mapping is passed inline
# instead of being bound to the name `dict`, which would shadow the built-in
# for the rest of the notebook.
df = pd.DataFrame({"text": texts, "label": labels})
df.head(5)

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,1
1,The best soundtrack ever to anything.: I'm rea...,1
2,Amazing!: This soundtrack is my favorite music...,1
3,Excellent Soundtrack: I truly like this soundt...,1
4,"Remember, Pull Your Jaw Off The Floor After He...",1


In [None]:
# Keep only the first 100,000 reviews. Take an explicit copy so the column
# assignments in later cells modify an independent frame instead of a slice of
# the full one (which triggers pandas' SettingWithCopyWarning).
df = df[:100000].copy()
df

Unnamed: 0,text,label
0,Stuning even for the non-gamer: This sound tra...,1
1,The best soundtrack ever to anything.: I'm rea...,1
2,Amazing!: This soundtrack is my favorite music...,1
3,Excellent Soundtrack: I truly like this soundt...,1
4,"Remember, Pull Your Jaw Off The Floor After He...",1
...,...,...
99995,Good: very good excelent fantastic wonderful m...,1
99996,THE REAL SUPERMAN: THESE ARE THE REAL ADVENTUR...,0
99997,It gets one star because zero was not an optio...,0
99998,Don't Spend Your Money: I think I now know mor...,0


In [None]:
# Class balance of the 100,000-row subset.
df['label'].value_counts()

1    51267
0    48733
Name: label, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Stratified 85/15 split of the row indices; the fixed random_state makes the
# split repeatable and stratification keeps the label ratio in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=42,
    stratify=df.label.values,
)

In [None]:
# Placeholder split tag for every row; overwritten with 'train'/'val' below.
df['data_type'] = 'not_set'

In [None]:
# Preview the frame with the placeholder data_type column.
df.head()

Unnamed: 0,text,label,data_type
0,Stuning even for the non-gamer: This sound tra...,1,not_set
1,The best soundtrack ever to anything.: I'm rea...,1,not_set
2,Amazing!: This soundtrack is my favorite music...,1,not_set
3,Excellent Soundtrack: I truly like this soundt...,1,not_set
4,"Remember, Pull Your Jaw Off The Floor After He...",1,not_set


In [None]:
# Tag each row with its split, using the index arrays returned by train_test_split.
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [None]:
# Row counts per (label, split) pair, to confirm both splits contain both classes.
df.groupby(['label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
label,data_type,Unnamed: 2_level_1
0,train,41423
0,val,7310
1,train,43577
1,val,7690


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m100.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.1-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.12.1 tokenizers-0.13.2 transformers-4.26.1


In [None]:
import torch
from tqdm.notebook import tqdm
import numpy as np 
import pandas as pd

In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [None]:
# WordPiece tokenizer matching the uncased BERT checkpoint (input is lower-cased).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Display the frame now that every row carries its 'train'/'val' tag.
df

Unnamed: 0,text,label,data_type
0,Stuning even for the non-gamer: This sound tra...,1,val
1,The best soundtrack ever to anything.: I'm rea...,1,train
2,Amazing!: This soundtrack is my favorite music...,1,val
3,Excellent Soundtrack: I truly like this soundt...,1,train
4,"Remember, Pull Your Jaw Off The Floor After He...",1,train
...,...,...,...
99995,Good: very good excelent fantastic wonderful m...,1,train
99996,THE REAL SUPERMAN: THESE ARE THE REAL ADVENTUR...,0,train
99997,It gets one star because zero was not an optio...,0,val
99998,Don't Spend Your Money: I think I now know mor...,0,train


In [None]:
# Tokenizer settings shared by both splits. `padding='max_length'` replaces the
# deprecated `pad_to_max_length=True`, and `truncation=True` makes explicit the
# truncation that was previously applied implicitly (with a warning), so every
# sequence still comes out exactly 256 tokens long.
encode_options = {
    'add_special_tokens': True,
    'return_attention_mask': True,
    'padding': 'max_length',
    'truncation': True,
    'max_length': 256,
    'return_tensors': 'pt',
}

# Select each split once so the texts and labels are guaranteed to line up.
train_rows = df[df.data_type == 'train']
val_rows = df[df.data_type == 'val']

# The tokenizer API is documented for lists of strings, so pass plain lists.
encoded_data_train = tokenizer.batch_encode_plus(train_rows.text.tolist(), **encode_options)
encoded_data_val = tokenizer.batch_encode_plus(val_rows.text.tolist(), **encode_options)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_rows.label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_rows.label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Bundle (input ids, attention mask, label) per example for each split.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [None]:
# Number of training examples.
len(dataset_train)

85000

In [None]:
# Inspect the validation tensors: input ids, attention masks, labels (in that order).
dataset_val.tensors

(tensor([[  101, 24646,  5582,  ...,     0,     0,     0],
         [  101,  6429,   999,  ...,     0,     0,     0],
         [  101,  2019,  7619,  ...,     0,     0,     0],
         ...,
         [  101,  2980,  2980,  ...,     0,     0,     0],
         [  101,  2298,  1010,  ...,     0,     0,     0],
         [  101,  2009,  4152,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 tensor([1, 1, 1,  ..., 1, 1, 0]))

In [None]:
from transformers import BertForSequenceClassification

In [None]:
# Pretrained BERT encoder with a freshly initialised classification head.
# NOTE(review): num_labels is not passed, so the library default is used —
# presumably 2, matching the binary labels; confirm.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    output_attentions=False,
    output_hidden_states=False,
)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

In [None]:
# Show the training dataset object.
dataset_train

<torch.utils.data.dataset.TensorDataset at 0x7fde15a21850>

In [None]:
batch_size = 4

# Training batches are drawn in random order each epoch.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size
)

# Validation is read in a fixed order: shuffling adds nothing to evaluation and
# a sequential pass keeps predictions aligned with the dataset rows. A larger
# batch is used here than for training.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=32
)

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
# Use PyTorch's AdamW; the `transformers.AdamW` class is deprecated in favour of it.
# weight_decay is set to 0.0 explicitly because torch defaults to 0.01, whereas the
# transformers implementation used previously defaulted to 0.0.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr = 1e-5,
    eps = 1e-8,
    weight_decay = 0.0
)



In [None]:
epochs = 2

# Learning-rate schedule with zero warmup steps, spanning every optimizer step
# of the run (batches per epoch x number of epochs).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps = len(dataloader_train)*epochs
)

In [None]:
import numpy as np
from sklearn.metrics import f1_score

In [None]:
def f1_score_func(preds, labels):
    """Return the weighted F1 score of arg-max predictions against the true labels.

    Args:
        preds: 2-D array of per-class scores; the predicted class is the
            arg-max along axis 1.
        labels: Array of true class ids.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(true_classes, predicted_classes, average='weighted')

In [None]:
def accuracy_per_class(preds, labels, label_names=None):
    """Print, for every class present in `labels`, how many examples were predicted correctly.

    Args:
        preds: 2-D array of per-class scores; the predicted class is the
            arg-max along axis 1.
        labels: Array of true class ids.
        label_names: Optional mapping ``class id -> display name``. When
            omitted, a module-level ``label_dict`` (``name -> id``) is inverted
            and used if one exists; otherwise the raw class id is printed.
            Previously the function unconditionally read ``label_dict``, which
            this notebook never defines, so every call raised NameError.
    """
    if label_names is None:
        name_to_id = globals().get('label_dict')
        label_names = {v: k for k, v in name_to_id.items()} if name_to_id else {}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        # Restrict both arrays to the examples whose true class is `label`.
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_names.get(label, label)}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [None]:
import random

# Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) random generators.
# NOTE(review): this cell runs after the model was instantiated and after the
# train/val split, so it only affects randomness from this point on (e.g. the
# training sampler's shuffling) — confirm that is intended.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Use the GPU when one is available, otherwise the CPU, and move the model there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

cuda


In [None]:
def evaluate(dataloader_val):
    """Score the global `model` on every batch of `dataloader_val`.

    The model is switched to eval mode and run without gradient tracking on
    the global `device`.

    Returns:
        A tuple ``(loss_val_avg, predictions, true_vals)``: the loss averaged
        over batches, the logits of all batches concatenated along axis 0, and
        the matching true labels concatenated along axis 0 (both NumPy arrays).
    """
    model.eval()

    running_loss = 0
    logit_chunks = []
    label_chunks = []

    for batch in tqdm(dataloader_val):
        batch = tuple(b.to(device) for b in batch)
        input_ids, attention_mask, batch_labels = batch[0], batch[1], batch[2]

        with torch.no_grad():
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=batch_labels,
            )

        # With labels supplied, the first two outputs are the loss and the logits.
        loss, logits = outputs[0], outputs[1]
        running_loss += loss.item()

        logit_chunks.append(logits.detach().cpu().numpy())
        label_chunks.append(batch_labels.cpu().numpy())

    loss_val_avg = running_loss / len(dataloader_val)
    predictions = np.concatenate(logit_chunks, axis=0)
    true_vals = np.concatenate(label_chunks, axis=0)

    return loss_val_avg, predictions, true_vals

In [None]:
# Fine-tune for `epochs` passes over the training data, saving a checkpoint and
# reporting validation loss / weighted F1 after every epoch.
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        outputs = model(**inputs)
        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Clip the gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Show the batch loss as returned by the model. It was previously divided
        # by len(batch), but `batch` is the 3-tuple (input ids, attention mask,
        # labels), so that divided by 3 rather than by the batch size.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    # Pickle the whole model object; predict_bias later reloads it with torch.load.
    torch.save(model, f'BERT_ft_Epoch{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_val)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (weighted): {val_f1}')


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/21250 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.24761146459515507


  0%|          | 0/469 [00:00<?, ?it/s]

Validation loss: 0.18311524538890217
F1 Score (weighted): 0.9562708863059585


Epoch 2:   0%|          | 0/21250 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.12239697688344818


  0%|          | 0/469 [00:00<?, ?it/s]

Validation loss: 0.22649897111483341
F1 Score (weighted): 0.9586713889065351


In [None]:
import warnings
from transformers import BertTokenizer
import torch
from transformers import BertForSequenceClassification
# Silence FutureWarning, then every warning category.
# NOTE(review): the second, blanket filter makes the first one redundant and
# also hides any other warning raised from here on.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings('ignore')

# Maps a predicted class index to its display name.
# NOTE(review): this rebinds `labels`, which earlier held the array returned by load_extract.
labels = {
    0:'Negative Review',
    1:'Positive Review'
    }
# Re-create the tokenizer with the same checkpoint and casing used for training.
tokenizer = BertTokenizer.from_pretrained(
'bert-base-uncased',
do_lower_case=True
)

In [None]:
# Checkpoints already loaded by predict_bias, keyed by file path.
_MODEL_CACHE = {}


def _load_cached_model(path, device):
    """Load the pickled model at `path` once, put it in eval mode, and reuse it afterwards."""
    if path not in _MODEL_CACHE:
        loaded = torch.load(path, map_location=device)
        # The checkpoint was pickled while the model was in training mode, so
        # dropout would stay active (and predictions would be random) without this.
        loaded.eval()
        _MODEL_CACHE[path] = loaded
    return _MODEL_CACHE[path]


def predict_bias(Review):
    """Classify a single review as negative or positive with the saved epoch-1 model.

    Args:
        Review: The review text to classify.

    Returns:
        The display name from the global `labels` mapping for the class with
        the highest logit.
    """
    device = torch.device('cpu')
    print(device)
    model = _load_cached_model('/content/BERT_ft_Epoch1.model', device)

    # Truncate to the same 256-token limit used when encoding the training data,
    # so long reviews neither crash the model nor exceed what it was tuned on.
    encoded_review = tokenizer(Review, return_tensors="pt", truncation=True, max_length=256)

    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    # Inference only: skip gradient bookkeeping.
    with torch.no_grad():
        output = model(input_ids=input_ids, attention_mask=attention_mask)

    categoryIndex = int(torch.argmax(output.logits))
    category = labels[categoryIndex]
    return category

In [None]:
# Classify one hand-written example review and print the predicted category.
Review = "The lyrics are just exceptional !! I love the feeling and inflection in  voice."
Category = predict_bias(Review)
print(Category)

cpu
Positive Review
