# BERT Training

In [None]:
#pip install --upgrade pip

In [None]:
# #for config setup 
# !pip install "sagemaker==2.198" "transformers==4.21.1" "datasets==2.9" "torch==1.11.0"

In [None]:
!pip install -q "sagemaker==2.198" "transformers==4.26.0" "datasets==2.9" "torch==1.13.1" "tqdm"

In [7]:
import boto3
import datasets
import numpy as np
import pandas as pd
import sagemaker
import torch
import transformers
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from datasets.filesystems import S3FileSystem
from sagemaker.huggingface import HuggingFace, TrainingCompilerConfig
from sagemaker.tuner import (
    CategoricalParameter,
    ContinuousParameter,
    HyperparameterTuner,
    IntegerParameter,
)
from tqdm import tqdm
from transformers import TrainingArguments, Trainer,AutoConfig,AutoTokenizer

In [59]:
# Confirm the installed datasets version matches the "datasets==2.9" pin from the install cell
print(datasets.__version__)

2.9.0


## BERT Training

In [4]:
# Set up the SageMaker session, execution role and working bucket
sess = sagemaker.Session()
sagemaker_session_bucket = 'sagemaker-studio-ai-lab-3'

# Fall back to the session's default bucket only when no explicit bucket is configured
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

role = sagemaker.get_execution_role()

# Recreate the session so that it is bound to the chosen bucket
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

for label, value in (
    ("role arn", role),
    ("bucket", sess.default_bucket()),
    ("session region", sess.boto_region_name),
):
    print(f"sagemaker {label}: {value}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /root/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::956796298028:role/service-role/AmazonSageMaker-ExecutionRole-20211005T133138
sagemaker bucket: sagemaker-studio-ai-lab-3
sagemaker session region: us-east-1


In [8]:
# Load every review shard from S3 and stack them into one DataFrame
n_shards = 70  # shards are read as data_0.json ... data_69.json

dfs = [
    pd.read_json(f's3://sagemaker-studio-ai-lab-3/final-data/data_{load_num}.json')
    for load_num in tqdm(range(0, n_shards))
]
reviews = pd.concat(dfs, ignore_index=True)

100%|██████████| 70/70 [13:53<00:00, 11.90s/it]


In [9]:
# Subsample so that every star rating is equally represented for BERT
SAMPLES_PER_CLASS = 60000
SEED = 42  # fixed so the balanced subset is identical after a kernel restart

# Show the raw class distribution before balancing (previously computed but never displayed)
display(reviews['r_stars'].value_counts())

# NOTE(review): sampling with replacement can duplicate reviews, and duplicates may later
# land on both sides of the train/test split -- confirm this is acceptable.
balanced_df = (
    reviews
    .groupby('r_stars', as_index=False, group_keys=False)
    .apply(lambda s: s.sample(SAMPLES_PER_CLASS, replace=True, random_state=SEED))
    .sample(frac=1, random_state=SEED)  # shuffle the rows so classes are interleaved
    [['r_stars', 'r_text']]
    # .assign builds a new frame, avoiding SettingWithCopyWarning on the column slice;
    # subtracting 1 turns the star ratings into zero-based labels for the model
    .assign(r_stars=lambda frame: frame['r_stars'] - 1)
)

In [10]:
# Wrap the balanced frame as a Hugging Face Dataset and hold out 30% of the rows as the test split
dataset = Dataset.from_pandas(balanced_df, preserve_index=False)
splits = dataset.train_test_split(test_size=0.3, seed=42)  # seeded so the split is reproducible

In [11]:
# Tokenize both splits and expose them as PyTorch tensors
tokenizer_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

MAX_LENGTH = 512       # every review is padded/truncated to this many tokens
MAP_BATCH_SIZE = 1000  # rows tokenized per call; bounds peak memory instead of one giant batch


def tokenize(batch):
    """Tokenize the `r_text` column of a batch, padded and truncated to MAX_LENGTH tokens."""
    return tokenizer(batch['r_text'], padding='max_length', truncation=True, max_length=MAX_LENGTH)


def prepare_split(split):
    """Tokenize one split, rename `r_stars` to `labels`, and set the torch output format.

    Returns the processed dataset exposing only `input_ids`, `attention_mask` and `labels`.
    """
    prepared = split.map(tokenize, batched=True, batch_size=MAP_BATCH_SIZE)
    prepared = prepared.rename_column("r_stars", "labels")
    prepared.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    return prepared


train_dataset = prepare_split(splits['train'])
test_dataset = prepare_split(splits['test'])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Sanity check: character length of the first raw training review, then its tokenized record.
# Only a cell's last expression is displayed, so the length is printed explicitly.
print(len(splits['train'][0]['r_text']))
train_dataset[0]

In [18]:
#upload to s3 for training
import botocore
from datasets.filesystems import S3FileSystem

# Destination URIs for both tokenized splits inside the session bucket
s3_prefix = f'samples/datasets/tokenized_reviews_v4'
training_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/train'
test_input_path = f's3://{sess.default_bucket()}/{s3_prefix}/test'

# Write the splits through one shared S3 filesystem handle
s3 = S3FileSystem()
train_dataset.save_to_disk(training_input_path, fs=s3)
test_dataset.save_to_disk(test_input_path, fs=s3)

print(f'Uploaded training data to {training_input_path}')
print(f'Uploaded testing data to {test_input_path}')



Saving the dataset (0/2 shards):   0%|          | 0/210000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/90000 [00:00<?, ? examples/s]

Uploaded training data to s3://sagemaker-studio-ai-lab-3/samples/datasets/tokenized_reviews_v4/train
Uploaded testing data to s3://sagemaker-studio-ai-lab-3/samples/datasets/tokenized_reviews_v4/test


In [None]:
# Train the BERT classifier without the Training Compiler config
training_job_name = f'LA-bert-base-60k-v2'

# Hyperparameters handed to the train.py entry point
hyperparameters = dict(
    epochs=3,
    train_batch_size=12,  # 32
    model_name='bert-base-uncased',
    num_labels=5,
)
# Scale the 5e-5 learning rate by (train_batch_size / 32)
hyperparameters["learning_rate"] = float("5e-5") / 32 * hyperparameters["train_batch_size"]
volume_size = 100

# Model artifacts and the packaged source code share the dataset prefix in the session bucket
s3_uri_template = 's3://{}/{}/{}'
estimator_kwargs = {
    'entry_point': 'train.py',
    'source_dir': './scripts',
    'output_path': s3_uri_template.format(sagemaker_session_bucket, s3_prefix, 'bert_model'),
    'code_location': s3_uri_template.format(sagemaker_session_bucket, s3_prefix, 'custom_code'),
    'instance_type': 'ml.g4dn.12xlarge',
    'instance_count': 1,
    'role': role,
    'transformers_version': '4.26.0',
    'pytorch_version': '1.13.1',
    'py_version': 'py39',
    'hyperparameters': hyperparameters,
    'base_job_name': training_job_name,
    'volume_size': volume_size,
}
huggingface_estimator = HuggingFace(**estimator_kwargs)

# Start the training job with the uploaded train/test splits as input channels
huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

In [63]:
# Summarize the estimator's training job: container image, model artifact location and job name
job_summary = (
    ("container image used for training job", huggingface_estimator.image_uri),
    ("s3 uri where the trained model is located", huggingface_estimator.model_data),
    ("latest training job name for this estimator", huggingface_estimator.latest_training_job.name),
)
for description, value in job_summary:
    print(f"{description}: \n{value}\n")

container image used for training job: 
None

s3 uri where the trained model is located: 
s3://sagemaker-studio-ai-lab-3/samples/datasets/tokenized_reviews_v2/bert_model/LA-bert-45k-v1-2023-12-02-03-29-43-447/output/model.tar.gz

latest training job name for this estimator: 
LA-bert-45k-v1-2023-12-02-03-29-43-447



Next steps: either optimize the model's efficiency, or go straight to pulling predictions and then using them as input, because that will be hard enough on its own.

Hyperparameter tuning: do this if there is time.

In [None]:
# Search space for the tuning job. The *Parameter classes come from sagemaker.tuner,
# which must be imported in the notebook's import cell.
# NOTE(review): the learning-rate range (1e-4 .. 0.1) does not include the value computed for
# the baseline training job (5e-5 / 32 * 12) -- confirm the bounds are intended.
hyperparameter_ranges = {
    "learning_rate": ContinuousParameter(0.0001, 0.1),
    "warmup_steps": IntegerParameter(100, 500),
    "optimizer": CategoricalParameter(["AdamW", "Adafactor"]),
    "weight_decay": ContinuousParameter(0.00, 0.001),
}

# The tuner minimizes the "loss" metric, scraped from the training logs by the regex below
# NOTE(review): the regex must match the exact line format train.py logs -- verify against the script.
objective_metric = "loss"
objective_type = "Minimize"
metric_definitions = [{"Name": "loss", "Regex": r"loss = ([0-9\.]+)"}]

In [None]:
# Run a small, sequential hyperparameter sweep (3 jobs, one at a time) on the training estimator.
# The original referenced an undefined `huggingface_model`; the estimator in this notebook is
# `huggingface_estimator`.
tuner = HyperparameterTuner(
    estimator=huggingface_estimator,
    objective_metric_name=objective_metric,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=3,
    max_parallel_jobs=1,
)
# Tune on the same uploaded splits the estimator is fitted on, rather than hard-coded older paths
tuner.fit(inputs={"train": training_input_path, "test": test_input_path})

### scratch work

In [None]:
# #defining hyperparameters and training the BERT model using config (supposed to be 50% improvement?)
# training_job_name=f'LA-bert-45k-v1'
# hyperparameters = {'epochs':3, 
#                    'train_batch_size': 24,
#                    'model_name': 'bert-base-uncased',
#                    'num_labels': 5
#                   }
# compiler_config=TrainingCompilerConfig()
# volume_size = 50
# huggingface_estimator = HuggingFace(entry_point='train.py', 
#                                     source_dir='./scripts',
#                                     output_path = 's3://{}/{}/{}'.format(sagemaker_session_bucket, s3_prefix, 'bert_model'),
#                                     code_location = 's3://{}/{}/{}'.format(sagemaker_session_bucket, s3_prefix, 'custom_code'),
#                                     instance_type='ml.g4dn.2xlarge', #ml.p2.xlarge
#                                     instance_count=1,
#                                     role=role,
#                                     transformers_version='4.21.1',
#                                     pytorch_version= '1.11.0',
#                                     py_version='py38',
#                                     hyperparameters=hyperparameters,
#                                     base_job_name = training_job_name,
#                                     volume_size = volume_size,
#                                     disable_profiler=True,
#                                     debugger_hook_config=False,
#                                     compiler_config=compiler_config
#                                    )
# huggingface_estimator.fit({'train': training_input_path, 'test': test_input_path})

In [None]:
# # BERT Model for text encoding
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased', do_lower_case=True)
# X_train, X_test, y_train, y_test = train_test_split(reviews['r_text'][0:10000], reviews['r_stars'][0:10000], test_size=0.2, random_state=42)

# X_all_encoded = tokenizer.batch_encode_plus(reviews['r_text'].tolist(), padding=True, truncation=True, max_length = 128, return_tensors='tf')
# X_train_encoded = tokenizer.batch_encode_plus(X_train.tolist(), padding=True,  truncation=True, max_length = 128, return_tensors='tf')
# X_test_encoded = tokenizer.batch_encode_plus(X_test.tolist(),  padding=True,  truncation=True, max_length = 128, return_tensors='tf')

In [None]:
# #example of using tokenizer on one sentence
# tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
# tokenized_output_default = tokenizer(list(reviews['r_text'][0:100]), truncation=True, max_length = 128)
# input_ids = tokenized_output_default['input_ids'][2]
# tokens = tokenizer.convert_ids_to_tokens(input_ids)

# print(reviews['r_text'][2])
# print(tokens)
# print(input_ids)

# #print("Default (is_split_into_words=False):", tokenized_output_default)

In [None]:
# def tokenize_and_align_labels(reviews):
#     tokenized_inputs = tokenizer(list(reviews["r_text"]), truncation=True, max_length = 128, padding = True)
#     labels = []
#     for i, label in enumerate(examples[f"ner_tags"]):
#         word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
#         previous_word_idx = None
#         label_ids = []
#         for word_idx in word_ids:  # Set the special tokens to -100.
#             if word_idx is None:
#                 label_ids.append(-100)
#             elif word_idx != previous_word_idx:  # Only label the first token of a given word.
#                 label_ids.append(label[word_idx])
#             else:
#                 label_ids.append(-100)
#             previous_word_idx = word_idx
#         labels.append(label_ids)

#     tokenized_inputs["labels"] = labels
#     return tokenized_inputs

In [None]:
# labels = ['1','2','3','4','5']
# id2label = {idx:label for idx, label in enumerate(labels)}
# label2id = {label:idx for idx, label in enumerate(labels)}

# tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# def preprocess_data(examples):
#     text = examples["r_text"]
#     encoding = tokenizer(text, padding=True, truncation=True, max_length=128)
#     labels_batch = {label: examples[examples] for label in labels} #label:text key-value pairs
#     labels_matrix = np.zeros((len(text), len(labels)))
#     for idx, label in enumerate(labels):
#         labels_matrix[:, idx] = labels_batch[label]
#     encoding["labels"] = labels_matrix.tolist()
#     return encoding

# encoded_dataset = dataset.select(range(100)).map(preprocess_data, batched=True, remove_columns=dataset.column_names)

In [None]:
# #raw_datasets = Dataset.from_pandas(reviews)
# raw_datasets.select(range(100))['r_text']
# dataset = Dataset.from_pandas(reviews)
# list_of_word_lists = [sentence.split() for sentence in dataset['r_text'][0:10]]
# list_of_word_lists

In [None]:
# #tokenizer for BERT
# model_checkpoint = "bert-base-cased"
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# def align_labels_with_tokens(labels, word_ids):
#     new_labels = []
#     current_word = None
#     for word_id in word_ids:
#         if word_id != current_word:
#             current_word = word_id
#             label = -100 if word_id is None else labels[word_id]
#             new_labels.append(label)
#         elif word_id is None:
#             new_labels.append(-100)
#         else:
#             label = -100
#             new_labels.append(label)
#     return new_labels

# def tokenize_and_align_labels(examples):
#     tokenized_inputs = tokenizer(examples["r_text"], truncation=True, max_length = 128, padding = True, is_split_into_words=True)
#     all_labels = examples["r_stars"]
#     new_labels = []
#     for i, labels in enumerate(all_labels):
#         word_ids = tokenized_inputs.word_ids(i)
#         new_labels.append(align_labels_with_tokens(labels, word_ids))
#     tokenized_inputs["labels"] = new_labels
#     return tokenized_inputs

# #Tokenize the Dataset
# tokenized_datasets = raw_datasets.select(range(100)).map(
#     tokenize_and_align_labels,
#     batched=True,
#     remove_columns=raw_datasets.column_names,
# )
# tokenized_datasets.set_format("torch", columns = ['input_ids', 'attention_mask', 'labels'])

# # def tokenize_and_align_labels(reviews):
# #     tokenized_inputs = tokenizer(reviews['r_text'].tolist(), truncation=True, max_length = 128,padding=True, return_tensors='pt')
# #     all_labels = reviews['r_stars'].tolist()
# #     new_labels = []
# #     for i, labels in enumerate(all_labels):
# #         # word_ids = tokenized_inputs.word_ids(i)
# #         # new_labels.append(align_labels_with_tokens(labels, word_ids))
# #         new_labels.append(align_labels_with_tokens(labels, tokenized_inputs['input_ids'][i].tolist()))
# #     tokenized_inputs["labels"] = new_labels
# #     return tokenized_inputs
# # tokenized_datasets = tokenize_and_align_labels(reviews)