# Exploring Transfer Learning Performance in NLP: A Cross-Dataset Generalization Study

This experiment aims to answer the following research questions:


1.   Which BERT family model works best for transfer learning in NLP classification tasks?
2.   How does model performance vary when trained on related versus unrelated datasets?
3.   What are the optimal fine-tuning strategies for transfer learning?
4.   How much data is needed to achieve effective transfer learning effects,that is, 1,000, 5,000 records or 10,000 records?
5.   Does the sequence of data training in continuous learning matter?

This jupyter notebook covers the first sequence(A) using **distilbert** where the baseline is on the primary Amazon Reviews dataset on Movies and TV, followed by further training of the baseline model on the secondary data set CDs and Vinyl dataset from a similar domain i.e. entertainment, and finally this model is evaluated on the Grocery and Gourmet review dataset which is a different domain in the food industry.

# Load Data

In [1]:
!pip install datasets
!pip install evaluate
!pip install optuna

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [2]:
import pandas as pd
import optuna
import os
import datasets
from datasets import Dataset
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
from sklearn.metrics import accuracy_score,precision_recall_fscore_support,f1_score
from sklearn.model_selection import ParameterGrid
from sklearn.utils import shuffle
from transformers import DataCollatorWithPadding
import torch


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import json

def jsonl_to_df(file_path):
    """Loads a JSONL file into a Pandas DataFrame.

    Args:
        file_path: The path to the JSONL file.

    Returns:
        A Pandas DataFrame containing the data from the JSONL file, or None if an error occurs.
    """
    try:
        data = []
        with open(file_path, 'r') as f:
            for line in f:
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Skipping invalid JSON line: {line.strip()}")
                    print(f"Error: {e}")
        df = pd.DataFrame(data)
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None


In [5]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    # eval_metrics=metric.compute(predictions=predictions, references=labels)
    # f1 = f1_score(labels, predictions, average='weighted')
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": accuracy, "f1": f1}
    # return eval_metrics

def prep_dataset(df):
    df = shuffle(df)
    df = df[df['rating'] != 3]
    df_subset = df[['text', 'rating']][:5000]
    df_subset['label'] = df['rating'].apply(lambda x: 1 if x > 4 else 0)
    df_subset = df_subset.drop('rating', axis=1)
    return df_subset


In [6]:
# Movies and TV Dataset
# Load file from jsonl to df :
file_path = '/content/drive/MyDrive/ColabNotebooks/W266/final_project_a/shuffle_100k.jsonl'
df = jsonl_to_df(file_path)

# Create binary classification
df_subset = prep_dataset(df)


In [7]:
#  CDs and TV data set
# Load file from jsonl to df :
file_path = '/content/drive/MyDrive/ColabNotebooks/W266/final_project_a/shuffle_2_CDs_100k.jsonl'
df_cds = jsonl_to_df(file_path)

# Create binary classification
df_cds_subset = prep_dataset(df_cds)


In [8]:
#  shuffle df_cds and create sample data set
# Load file from jsonl to df :
file_path = '/content/drive/MyDrive/ColabNotebooks/W266/final_project_a/shuffle_3_Food_100k.jsonl'
df_food = jsonl_to_df(file_path)

df_food_subset = prep_dataset(df_food)

# distilbert-base-uncased

In [9]:
BEST_MODEL_SAVE_PATH ="drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_best"
MODEL_PATH = "drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_baseline"

In [13]:
# Load tokenizer for Distilbert

MAX_SEQUENCE_LENGTH = 50

# Function to preprocess (tokenize) data
def tokenize_function(example):
    review_text = example['text']
    encoded = tokenizer.batch_encode_plus(
            review_text,
            max_length=MAX_SEQUENCE_LENGTH,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=True,
            return_tensors="pt"
        )

    return encoded


# Apply Tokenization to each dataset
tokens_movies = Dataset.from_pandas(df_subset).map(tokenize_function, batched=True)
tokens_cds = Dataset.from_pandas(df_cds_subset).map(tokenize_function, batched=True)
tokens_food = Dataset.from_pandas(df_food_subset).map(tokenize_function, batched=True)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [14]:
# Baseline
model_name = "distilbert-base-uncased"

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

split_datasets = tokens_movies.train_test_split(test_size=0.2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch"
)


# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

# Save the trained model
trainer.save_model(MODEL_PATH)
tokenizer.save_pretrained(MODEL_PATH)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mhngondoki[0m ([33mhngondoki-uc-berkeley[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.438334,0.801,0.798787
2,0.401800,0.458558,0.817,0.806329
3,0.401800,0.600247,0.805,0.804006


('drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_baseline/tokenizer_config.json',
 'drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_baseline/special_tokens_map.json',
 'drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_baseline/vocab.txt',
 'drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_baseline/added_tokens.json',
 'drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_baseline/tokenizer.json')

## Hyperparameter tuning

In [15]:
#Fine tuning
best_accuracy = 0.0
best_trainer = None

def objective(trial):
    global best_accuracy, best_trainer

    # Load the saved model
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

    # Define hyperparameters to be optimized
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16])
    num_train_epochs = trial.suggest_int("num_train_epochs", 1, 3)
    weight_decay = trial.suggest_float("weight_decay", 0.01, 0.1)

    # Define training arguments
    training_args = TrainingArguments(
        output_dir=BEST_MODEL_SAVE_PATH,
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        per_device_eval_batch_size=16,
        num_train_epochs=num_train_epochs,
        weight_decay=weight_decay,
        evaluation_strategy="epoch",
    )

    # Create Trainer instance
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split_datasets["train"],
        eval_dataset=split_datasets["test"],
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
    )

    # Train the model
    trainer.train()

    result = trainer.evaluate()
    accuracy = result["eval_accuracy"]

    # Save the best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_trainer = trainer

    return accuracy

# Create Optuna study
study = optuna.create_study(direction="maximize")  # Maximize the evaluation metric
study.optimize(objective, n_trials=10)  # Run 10 trials

if best_trainer:
    best_trainer.save_model(BEST_MODEL_SAVE_PATH)
    tokenizer.save_pretrained(BEST_MODEL_SAVE_PATH)
    print(f"Best model saved to: {BEST_MODEL_SAVE_PATH}")

# Print best hyperparameters
print("Best trial:")
trial = study.best_trial
print(f"  Value: {trial.value}")
print(f"  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-04-11 16:53:34,694] A new study created in memory with name: no-name-bbced2b7-4bae-4582-8bdd-4c8abb665f4f
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1988,0.900347,0.803,0.800059
2,0.1445,0.99076,0.798,0.796853


[I 2025-04-11 16:54:52,982] Trial 0 finished with value: 0.798 and parameters: {'learning_rate': 1.159231385877423e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.08494255722595466}. Best is trial 0 with value: 0.798.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.74815,0.794,0.785705
2,0.143700,0.950319,0.804,0.800166
3,0.143700,1.098484,0.79,0.789532


[I 2025-04-11 16:56:26,155] Trial 1 finished with value: 0.79 and parameters: {'learning_rate': 3.51825199720751e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.09988016513644682}. Best is trial 0 with value: 0.798.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.758569,0.793,0.78416
2,0.144300,0.94914,0.802,0.798126
3,0.144300,1.099557,0.79,0.789768


[I 2025-04-11 16:57:59,388] Trial 2 finished with value: 0.79 and parameters: {'learning_rate': 3.586574519862683e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.017160442515758254}. Best is trial 0 with value: 0.798.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.749978,0.79,0.784726
2,0.153500,0.968926,0.795,0.792973


[I 2025-04-11 16:59:02,294] Trial 3 finished with value: 0.795 and parameters: {'learning_rate': 4.9484273493080636e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 2, 'weight_decay': 0.059355138460800705}. Best is trial 0 with value: 0.798.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2482,0.831113,0.806,0.801941


[I 2025-04-11 16:59:41,695] Trial 4 finished with value: 0.806 and parameters: {'learning_rate': 3.642061228971356e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 1, 'weight_decay': 0.0999385134503945}. Best is trial 4 with value: 0.806.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.215,0.957517,0.805,0.802831
2,0.1491,1.062309,0.791,0.790416


[I 2025-04-11 17:01:18,413] Trial 5 finished with value: 0.791 and parameters: {'learning_rate': 1.859491046070508e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 2, 'weight_decay': 0.06146825320696877}. Best is trial 4 with value: 0.806.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.754224,0.793,0.78516
2,0.135500,0.962954,0.798,0.795627
3,0.135500,1.101369,0.783,0.783119


[I 2025-04-11 17:02:44,356] Trial 6 finished with value: 0.783 and parameters: {'learning_rate': 2.897296957284457e-05, 'per_device_train_batch_size': 16, 'num_train_epochs': 3, 'weight_decay': 0.07333279993764837}. Best is trial 4 with value: 0.806.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2537,0.920833,0.798,0.798
2,0.1677,0.97557,0.798,0.788872
3,0.096,1.16253,0.786,0.784784


[I 2025-04-11 17:04:43,055] Trial 7 finished with value: 0.786 and parameters: {'learning_rate': 2.8224487615230582e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 3, 'weight_decay': 0.04973238940448446}. Best is trial 4 with value: 0.806.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2662,0.792872,0.808,0.802625


[I 2025-04-11 17:05:28,582] Trial 8 finished with value: 0.808 and parameters: {'learning_rate': 4.480167354427163e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 1, 'weight_decay': 0.058923003338536305}. Best is trial 8 with value: 0.808.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2626,0.819142,0.803,0.800562


[I 2025-04-11 17:06:15,548] Trial 9 finished with value: 0.803 and parameters: {'learning_rate': 4.0374315275608466e-05, 'per_device_train_batch_size': 8, 'num_train_epochs': 1, 'weight_decay': 0.04467587514464077}. Best is trial 8 with value: 0.808.


Best model saved to: drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_best
Best trial:
  Value: 0.808
  Params: 
    learning_rate: 4.480167354427163e-05
    per_device_train_batch_size: 8
    num_train_epochs: 1
    weight_decay: 0.058923003338536305


## Train on CD and Vinyl Dataset


In [16]:
# Load best model
model = AutoModelForSequenceClassification.from_pretrained(BEST_MODEL_SAVE_PATH)
tokenizer = AutoTokenizer.from_pretrained(BEST_MODEL_SAVE_PATH)
TRAINING_ARGS=f"{BEST_MODEL_SAVE_PATH}/training_args.bin"
CD_MODEL_PATH = "drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_CD"

# Split the data
split_datasets = tokens_cds.train_test_split(test_size=0.2)

training_args = torch.load(TRAINING_ARGS, weights_only=False)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets["train"],
    eval_dataset=split_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# save model

model.save_pretrained(CD_MODEL_PATH)
tokenizer.save_pretrained(CD_MODEL_PATH)


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4653,0.408678,0.828,0.805303


('drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_CD/tokenizer_config.json',
 'drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_CD/special_tokens_map.json',
 'drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_CD/vocab.txt',
 'drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_CD/added_tokens.json',
 'drive/MyDrive/ColabNotebooks/W266/final_project_a/distilbert_Movies_CD/tokenizer.json')

## Evaluate on Food

In [17]:

model = AutoModelForSequenceClassification.from_pretrained(CD_MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(CD_MODEL_PATH)

# Train
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

results = trainer.evaluate(eval_dataset=tokens_food)
print("\nEvaluation Results:", results)

  trainer = Trainer(



Evaluation Results: {'eval_loss': 0.376629114151001, 'eval_model_preparation_time': 0.0026, 'eval_accuracy': 0.8468, 'eval_f1': 0.8410996113464048, 'eval_runtime': 9.0061, 'eval_samples_per_second': 555.179, 'eval_steps_per_second': 69.397}
