# Setup
### Import necessary packages and libraries, set model to train on GPU, and load BERT pre-trained model

In [1]:
# import necessary packages and libraries
import torch
import pandas as pd
import numpy as np

In [2]:
# Run model under GPU for faster training and testing
import tensorflow as tf

# Get the GPU device name.
device = tf.test.gpu_device_name()

# Confirm
if device == '/device:GPU:0':
    print('Found GPU at: {}'.format(device))
else:
    raise SystemError('GPU device not found')

Found GPU at: /device:GPU:0


In [3]:
# Assign PyTorch to run off GPU
gpu = torch.device("cuda")
print('GPU Found:', torch.cuda.get_device_name(0))
torch.cuda.empty_cache()

GPU Found: NVIDIA GeForce RTX 3060


In [4]:
# Display Pytorch and CUDA versions
print('PyTorch version: {}'.format(torch.__version__))
print('CUDA version: {}'.format(torch.version.cuda))

# limit how much GPU memory PyTorch can use
torch.cuda.set_per_process_memory_fraction(1.0)

PyTorch version: 2.0.0
CUDA version: 11.8


In [5]:
# load BERT tokenizer and model using GPU
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',     # 12 layer BERT model with uncased vocabulary
                                                      num_labels=2)            # 2 for binary classification
model.cuda()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [6]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

print('---- Embedding Layer ----\n')

for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n---- First Transformer ----\n')

for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n---- Output Layer ----\n')

for p in params[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

---- Embedding Layer ----

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

---- First Transformer ----

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

# Preparation - Tokenization

In [7]:
#load datasets
df_total = pd.read_csv('amazon_review_polarity_csv/train.csv', header=None, names=['label', 'title', 'review'])

'''
The Amazon reviews polarity dataset is constructed by Xiang Zhang (xiang.zhang@nyu.edu). 
It is used as a text classification benchmark in the following paper: 
Xiang Zhang, Junbo Zhao, Yann LeCun. Character-level Convolutional Networks for Text Classification. Advances in Neural Information Processing Systems 28 (NIPS 2015).

I am using the subset of data provided by Kaggle user 'kritanjalijain', but it too large for my computer to train in a 
reasonable amount of time. I will only be using a random 10% of the data.

'''

df_total.head()

Unnamed: 0,label,title,review
0,2,Stuning even for the non-gamer,This sound track was beautiful! It paints the ...
1,2,The best soundtrack ever to anything.,I'm reading a lot of reviews saying that this ...
2,2,Amazing!,This soundtrack is my favorite music of all ti...
3,2,Excellent Soundtrack,I truly like this soundtrack and I enjoy video...
4,2,"Remember, Pull Your Jaw Off The Floor After He...","If you've played the game, you know how divine..."


In [8]:
# Preprocess dataframes to be of reviews with max_length = 200

max_length = 200

df_total = df_total[df_total['review'].apply(lambda x: len(x) <= max_length)]

print('Length of truncated dataset: {:,}'.format(len(df_total)))

Length of truncated dataset: 853,214


In [9]:
# Randomly select 10% of the dataset
df_train = df_total.sample(frac=0.05, random_state=42)
print('Length of dataset for training and validation: {:,}'.format(len(df_train)))

Length of dataset for training and validation: 42,661


In [10]:
# Get the lists of reviews and their labels.
reviews = df_train.review.values

# Subtracting 1 to convert to a binary dataset with values 1 and 0
# Original dataset had 1's and 2's which is not recognized by BERT's binary classification model
labels = df_train.label.values-1   
print(type(labels))

<class 'numpy.ndarray'>


In [11]:
# Visualize tokenization of review
# Print the original sentence.
print('  Original: ', reviews[0])

# Print the sentence split into tokens.
print('  Tokenized: ', tokenizer.tokenize(reviews[0]))

# Print the sentence mapped to token ids.
print('  Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(reviews[0])))

  Original:  If you want an eye opening, heart opener this is the book for you. One I will read again, and again, and again.
  Tokenized:  ['if', 'you', 'want', 'an', 'eye', 'opening', ',', 'heart', 'opener', 'this', 'is', 'the', 'book', 'for', 'you', '.', 'one', 'i', 'will', 'read', 'again', ',', 'and', 'again', ',', 'and', 'again', '.']
  Token IDs:  [2065, 2017, 2215, 2019, 3239, 3098, 1010, 2540, 16181, 2023, 2003, 1996, 2338, 2005, 2017, 1012, 2028, 1045, 2097, 3191, 2153, 1010, 1998, 2153, 1010, 1998, 2153, 1012]


In [12]:
# Determine max review length
max_len = 0

for text in reviews:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(text, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max review length in dataset: ', max_len)

Max review length in dataset:  108


In [13]:
# Tokenize training dataset
input_ids = []
attention_masks = []

for text in reviews:
    if len(text) > max_length:
        text = text[:max_length]                                                       # truncate longer sequences
    encoded_dict = tokenizer.encode_plus(text,                                         # text to encode
                                         add_special_tokens=True,                      # add '[CLS]' and '[SEP]'
                                         truncation=True,
                                         max_length=max_length,                        # truncate all sentences
                                         padding='max_length',                         # pad to max_length
                                         return_attention_mask=True,                   # Construct attn. masks
                                         return_tensors='pt')                          # Return pytorch tensors
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

print('Original Text: ', reviews[0])
print('Token IDs:', input_ids[0])

Original Text:  If you want an eye opening, heart opener this is the book for you. One I will read again, and again, and again.
Token IDs: tensor([  101,  2065,  2017,  2215,  2019,  3239,  3098,  1010,  2540, 16181,
         2023,  2003,  1996,  2338,  2005,  2017,  1012,  2028,  1045,  2097,
         3191,  2153,  1010,  1998,  2153,  1010,  1998,  2153,  1012,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
   

# Preparation - Pre-Training Setup

In [16]:
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

# Combine the training inpurts into a TensorDataset
dataset = TensorDataset(input_ids, attention_masks, labels)

# Calculate size of each dataset
training_size = int(0.9 * len(dataset))
validation_size = len(dataset) - training_size

# Split total dataset into training and validation sets
training_set, validation_set = random_split(dataset, [training_size, validation_size])

# Create a dataloader to batch and shuffle the data (split by training and testing data)
# Randomly sample datasets using RandomSampler


print('{:>8,} training samples'.format(training_size))
print('{:>8,} validation samples'.format(validation_size))

  38,394 training samples
   4,267 validation samples


In [19]:
# Determine batch size for training
batch_size = 20

# Create DataLoaders for training and validation sets
train_dataloader = DataLoader(training_set, sampler=RandomSampler(training_set), batch_size=batch_size)
validation_dataloader = DataLoader(validation_set, sampler=SequentialSampler(validation_set), batch_size=batch_size)

In [20]:
from transformers import AdamW
from torch.nn import CrossEntropyLoss

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(),
                              lr = 5e-5,        # learning rate default value
                              eps = 1e-8)       # epsilon default value

loss_fn = torch.nn.BCELoss()                    # measures difference between predicted vs. actual probability dist for binary classification

In [21]:
from transformers import get_linear_schedule_with_warmup

# Number of training epochs
epochs = 3

# Total number of training steps
total_steps = len(train_dataloader)*epochs

# Create learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

In [22]:
# Define helper function for calculating accuracy
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
# Citation: BERT Fine-Tuning Tutorial with PyTorch (Chris McCormick, Nick Ryan; 2020)

In [23]:
# Define helper function for formatting elapsed time as hh:mm:ss
import time
import datetime

def format_time(elapsed):
    '''
    Takes time in seconds and returns string in format hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))
# Citation: BERT Fine-Tuning Tutorial with PyTorch (Chris McCormick, Nick Ryan; 2020)
# I found these functions to be quite intuitive for visualizing the training process so I wanted to add them into this notebook and credit them

# Training!

In [25]:
# Switch model to training mode
model.train()

# Set up training metrics
training_stats = []     # training, validation loss, accuracy, timings
total_t0 = time.time()  # total training time for entire run

for epoch in range(epochs):
    
    #-----------------
    #    TRAINING
    #-----------------
    
    # reset total loss per epoch
    total_train_loss = 0.0     

    print('')
    print('----- Epoch {:} / {:} -----'.format(epoch+1, epochs))
    print('Training...')

    # Measure how long the training epoch takes
    t0 = time.time()     
    
    for step, batch in enumerate(train_dataloader): 
        
        # Progress update every 400 batches because my computer is slow and I want to know if it's working or not...
        if step % 500 == 0 and not step == 0:
            elapsed = format_time(time.time()-t0)
            print('  Batch {:>4,} of {:>4,}. Elapsed: {:}'.format(step, len(train_dataloader), elapsed))
            
        # Separate into 3 pytorch tensors
        batch_input_ids = batch[0].to(gpu)
        batch_attention_masks = batch[1].to(gpu)
        batch_labels = batch[2].to(gpu)
        
        # Forward pass (evaluate the model on training batch)
        output = model(batch_input_ids,
                       attention_mask=batch_attention_masks, 
                       labels=batch_labels)
        loss = output.loss
        
        # Total training loss over all batches; `.item()` returns the python value from the tensor
        total_train_loss += loss.item()
        
        # Backward pass (calculate gradients)
        loss.backward()
        
        # Clip the norm of the gradients to 1.0 to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        # Update model parameters
        optimizer.step()
        
        # Update learning rate
        scheduler.step()
        
        # Clear previously calculated gradients
        optimizer.zero_grad()
        model.zero_grad()
        
    # Calculate average loss over all batches
    avg_train_loss = total_train_loss / len(train_dataloader)
    
    # Measure how long the epoch took to train
    train_time = format_time(time.time()-t0)    # format_time(time.time()-t0)
    
    print('')
    print('    Average training loss: {0:.2f}'.format(avg_train_loss))
    print('    Training epoch took: {:}'.format(train_time))
    
    
    #--------------------
    #     VALIDATION
    #--------------------
    
    print('')
    print('Running Validation...')
    
    t0 = time.time()
    
    # Switch model to evaluation mode
    model.eval()
    
    # Tracking metrics
    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    
    # Evaluate data for 1 epoch
    for batch in validation_dataloader:
        
        # Separate into 3 pytorch tensors
        batch_input_ids = batch[0].to(gpu)
        batch_attention_masks = batch[1].to(gpu)
        batch_labels = batch[2].to(gpu)
        
        with torch.no_grad():
            output = model(batch_input_ids,
                           attention_mask=batch_attention_masks,
                           labels=batch_labels)
        
        # Get loss and logits output by model
        loss = output.loss
        logits = output.logits
        
        #Move logits and labels to CPI
        logits = logits.detach().cpu().numpy()
        label_ids = batch_labels.to('cpu').numpy()
        
        # Total training loss over all abtches
        total_eval_loss += loss.item()
        
        # Calculate accuracy for this batch; total for all batches
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    
    # Report final accuracy for evaluation run
    avg_eval_accuracy = total_eval_accuracy / len(validation_dataloader)
    print('    Accuracy: {0:.2f}'.format(avg_eval_accuracy))
    
    # Calculate average loss over all batches
    avg_eval_loss = total_eval_loss / len(validation_dataloader)
    
    # Calculate time for validation
    eval_time = format_time(time.time()-t0)
    print('    Validation Loss: {0:.2f}'.format(avg_eval_loss))
    print('    Validation Time: {:}'.format(eval_time))
    
    # Record all statistics from this epoch
    training_stats.append(
        {
            'epoch': epoch+1,
            'Training Loss': avg_train_loss,
            'Evaluation Loss': avg_eval_loss,
            'Evaluation Accuracy': avg_eval_accuracy,
            'Training Time': train_time,
            'Validation Time': eval_time
        }
    )

print('')
print('Training complete')
print('Total training time: {:} (h:mm:ss)'.format(format_time(time.time()-total_t0)))


----- Epoch 1 / 3 -----
Training...
  Batch  500 of 1,920. Elapsed: 0:03:17
  Batch 1,000 of 1,920. Elapsed: 0:06:43
  Batch 1,500 of 1,920. Elapsed: 0:10:06

    Average training loss: 0.21
    Training epoch took: 0:12:55

Running Validation...
    Accuracy: 0.94
    Validation Loss: 0.18
    Validation Time: 0:00:27

----- Epoch 2 / 3 -----
Training...
  Batch  500 of 1,920. Elapsed: 0:03:18
  Batch 1,000 of 1,920. Elapsed: 0:06:52
  Batch 1,500 of 1,920. Elapsed: 0:10:29

    Average training loss: 0.10
    Training epoch took: 0:13:30

Running Validation...
    Accuracy: 0.94
    Validation Loss: 0.20
    Validation Time: 0:00:30

----- Epoch 3 / 3 -----
Training...
  Batch  500 of 1,920. Elapsed: 0:03:31
  Batch 1,000 of 1,920. Elapsed: 0:06:51
  Batch 1,500 of 1,920. Elapsed: 0:10:10

    Average training loss: 0.04
    Training epoch took: 0:12:58

Running Validation...
    Accuracy: 0.94
    Validation Loss: 0.27
    Validation Time: 0:00:28

Training complete
Total training 

In [26]:
# Display training summary

# Create a DataFrame from training stats
df_stats = pd.DataFrame(data=training_stats)

# Initially, the intention was to do multiple epochs, but since it took so long to do one and the accuracy came out 
# high, I decided to only do 1 for this execise. With how many data points were in the training and testing files,
# doing 1 epoch should suffice. With low loss during training and evaluation, i feel comfortable with how this data 
# was trained
df_stats = df_stats.set_index('epoch').round(2)

df_stats

Unnamed: 0_level_0,Training Loss,Evaluation Loss,Evaluation Accuracy,Training Time,Validation Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.21,0.18,0.94,0:12:55,0:00:27
2,0.1,0.2,0.94,0:13:30,0:00:30
3,0.04,0.27,0.94,0:12:58,0:00:28


In [27]:
# SAVE THE MODEL!!!!!!
import pickle
import os

with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)
    
print('Model saved: ' + os.getcwd() + '\model.pkl')

Model saved: C:\Users\leeha\PycharmProjects\twitter_bias\model.pkl


In [28]:
# Load the model. This is now a major checkpoint after the model has been trained and validated
import pickle
import os

with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

print('Model loaded: ' + os.getcwd() + '\model.pkl')

Model loaded: C:\Users\leeha\PycharmProjects\twitter_bias\model.pkl


# Model Testing

In [34]:
import pandas as pd

# Load the dataset into a pandas dataframe.
df_test = pd.read_csv('amazon_review_polarity_csv/test.csv', header=None, names=['label', 'title', 'review'])

# Truncate testing dataset
df_test = df_test.sample(frac=0.1, random_state=42)

# Report the number of sentences.
print('Number of test reviews: {:,}\n'.format(df_test.shape[0]))

# Create sentence and label lists
reviews = df_test.review.values
labels = df_test.label.values - 1

Number of test reviews: 40,000



In [35]:
# Tokenize all of the sentences and map the tokens to thier word IDs.
input_ids = []
attention_masks = []

max_len = 0

for text in reviews:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(text, add_special_tokens=True)

    # Update the maximum sentence length.
    max_len = max(max_len, len(input_ids))

print('Max sentence length in subset: ', max_len)

Max sentence length in subset:  332


In [36]:
# For every sentence...
input_ids = []
attention_masks = []

# Set max length based on previous cell result

max_length = 260 

for text in reviews:
    if len(text) > max_length:
        text = text[:max_length]    # truncate longer sequences
    encoded_dict = tokenizer.encode_plus(text,                                         # text to encode
                                         add_special_tokens=True,                      # add '[CLS]' and '[SEP]'
                                         truncation=True,
                                         max_length=max_length,                        # truncate all sentences
                                         padding='max_length',                          # pad to max_length
                                         return_attention_mask=True,                   # Construct attn. masks
                                         return_tensors='pt')                          # Return pytorch tensors
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

In [37]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

# Set the batch size.  
batch_size = 20  

# Create the DataLoader.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_dataloader = DataLoader(prediction_data, sampler=SequentialSampler(prediction_data), batch_size=batch_size)

In [38]:
# Prediction on test set

print('Predicting labels for {:,} test sentences...'.format(len(input_ids)))

# Put model back into evaluation mode
model.eval()

# Set tracking variables for predictions 
predictions , true_labels = [], []

# Run predictions 
for batch in prediction_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(gpu) for t in batch)

    # Unpack the inputs from our dataloader
    batch_input_ids, batch_input_mask, batch_labels = batch

    # Telling the model not to compute or store gradients, saving memory and 
    # speeding up prediction
    with torch.no_grad():
      # Forward pass, calculate logit predictions.
      result = model(batch_input_ids, 
                     token_type_ids=None, 
                     attention_mask=batch_input_mask,
                     return_dict=True)

    logits = result.logits

    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = batch_labels.to('cpu').numpy()

    # Store predictions and true labels
    predictions.append(logits)
    true_labels.append(label_ids)
    
# Print when done
print('---- DONE')    

Predicting labels for 40,000 test sentences...
---- DONE


# Calculating Accuracy, Precision, and Recall

Accuracy: Comparing predicted labels to true labels
<br>
Precision: Measures proportion of true positives among all positive predictions
<br>
Recall: Measures the proportion of true positives that were correctly predicted

In [74]:
# Lets see what we're working with...
print('Prediction datatype: ', type(predictions))
print('Prediction sample: ', predictions[:1])

print('\n\n')

print('True label datatype: ', type(true_labels))
print('True label sample: ', true_labels[:1])

Prediction datatype:  <class 'list'>
Prediction sample:  [array([[-2.1500902,  3.0271022],
       [ 3.1468945, -3.380125 ],
       [-1.788168 ,  2.6663263],
       [-2.3615484,  3.1593673],
       [ 3.1779842, -3.4317043],
       [ 3.189553 , -3.4439228],
       [ 3.0414546, -3.2914834],
       [-2.1212173,  2.9977617],
       [-2.3401198,  3.1620266],
       [ 3.0207577, -3.2524347],
       [-2.401752 ,  3.191451 ],
       [ 3.1120658, -3.3415437],
       [ 3.2073267, -3.481062 ],
       [ 3.153405 , -3.4042554],
       [ 3.074229 , -3.2698395],
       [ 3.1316717, -3.37944  ],
       [-2.582051 ,  3.239312 ],
       [ 3.1050615, -3.3385754],
       [-2.2124133,  3.0744362],
       [ 3.1532168, -3.4060702]], dtype=float32)]



True label datatype:  <class 'list'>
True label sample:  [array([1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0],
      dtype=int64)]


In [73]:
# Calculate precision, accuaracy, and recall scores
from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy_set = []
precision_set = []
recall_set = []

for i in range(len(true_labels)):
    pred_labels = np.argmax(predictions[i], axis=1).flatten()
    
    accuracy_set.append(accuracy_score(true_labels[i], pred_labels))
    precision_set.append(precision_score(true_labels[i], pred_labels))
    recall_set.append(recall_score(true_labels[i], pred_labels, pos_label=1))

print('Accuracy score: {:>12.4}'.format(sum(accuracy_set) / len(accuracy_set)))
print('Precision score: {:>12.4}'.format(sum(precision_set) / len(precision_set)))
print('Recall score: {:>15.4}'.format(sum(recall_set) / len(recall_set)))

Accuracy score:        0.891
Precision score:       0.9027
Recall score:          0.8784


# Conclusion

Pretty good scores! I'm quite happy with the results since I used a random sample of a large dataset, which could be prone to errors and using too much of one class. With the full dataset, I would hope for higher scores, but this is definitely a satisfactory score.