In [23]:
import torch
training_device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
training_device
import numpy as np

In [24]:
import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict

df = pd.read_csv("data/accuracy.csv")
df = df[["transcription", "most_common_value"]]
df = df.rename(columns={'transcription': 'text', 'most_common_value': 'label'})

# Apply the mapping to the 'labels' column
#df['label'] = df['label'].map(label_mapping)
df.dropna(subset=['label'], inplace=True)
df.reset_index(drop=True, inplace=True)

df.head()

Unnamed: 0,text,label
0,My customer is the good man and he takes care...,3.0
1,. So mobile computer is...,2.0
2,At the many times you have to manage the peop...,0.0
3,Only my appearance and grandpa in my family. ...,3.0
4,The,2.0


In [25]:
df["label"].value_counts()

2.0    2153
3.0    1832
0.0    1346
4.0     126
1.0      47
Name: label, dtype: int64

In [26]:
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['text', 'label'],
    num_rows: 5504
})

In [27]:
from datasets import ClassLabel, Value, Sequence
new_features = ds.features.copy()
new_features["label"] = ClassLabel(names=[0, 1, 2, 3, 4])
ds = ds.cast(new_features)

# Step 1: Initial train/test split with stratification
train_test_ds = ds.train_test_split(test_size=0.20, seed=20)

# Step 2: Split the test set into half test, half validation
test_valid_split = train_test_ds['test'].train_test_split(test_size=0.5, seed=20)

# Step 3: Combine everything into a single DatasetDict
ds = DatasetDict({
    'train': train_test_ds['train'],
    'test': test_valid_split['train'],    # This becomes the test set
    'validation': test_valid_split['test']  # This becomes the validation set
})
ds

Casting the dataset:   0%|          | 0/5504 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 4403
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 550
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 551
    })
})

In [28]:
# Verify label distribution
from collections import Counter

print("Train label counts:", Counter(ds['train']['label']))
print("Test label counts:", Counter(ds['test']['label']))
print("Validation label counts:", Counter(ds['validation']['label']))

Train label counts: Counter({2: 1737, 3: 1453, 0: 1081, 4: 94, 1: 38})
Test label counts: Counter({2: 217, 3: 185, 0: 128, 4: 15, 1: 5})
Validation label counts: Counter({2: 199, 3: 194, 0: 137, 4: 17, 1: 4})


In [29]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [30]:
tokenizer(ds["train"][0]["text"])

{'input_ids': [101, 1045, 2293, 2035, 1997, 11901, 1997, 3059, 2833, 2066, 10733, 2030, 2053, 26156, 1998, 2061, 2006, 1012, 1045, 2572, 2025, 1037, 5470, 1997, 2822, 2833, 2138, 1045, 2424, 2070, 10447, 1999, 2859, 2428, 6881, 2066, 2027, 2024, 5983, 2673, 2021, 2027, 2079, 2031, 2070, 11937, 21756, 2477, 2205, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [31]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

In [32]:
tokenized_train = ds["train"].map(tokenize_function, batched=True)

Map:   0%|          | 0/4403 [00:00<?, ? examples/s]

In [33]:
tokenized_test = ds["test"].map(tokenize_function, batched=True)

Map:   0%|          | 0/550 [00:00<?, ? examples/s]

In [34]:
tokenized_validation = ds["validation"].map(tokenize_function, batched=True)

Map:   0%|          | 0/551 [00:00<?, ? examples/s]

In [35]:
unique_labels = set(ds['train']['label'])
num_labels = len(unique_labels)
num_labels

5

In [36]:
from transformers import DistilBertForSequenceClassification, TrainingArguments, Trainer
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
import numpy as np 
import evaluate

metric = evaluate.load("accuracy")

In [38]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [39]:
args = TrainingArguments(
    output_dir="../../model_saved/distilbert-base-ft-speaking-accuracy",
    evaluation_strategy= "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
    save_total_limit=1,   
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)



In [40]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [41]:
trainer.train()

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=690, training_loss=1.0139530790024909, metrics={'train_runtime': 137.5975, 'train_samples_per_second': 319.991, 'train_steps_per_second': 5.015, 'total_flos': 1458212902003200.0, 'train_loss': 1.0139530790024909, 'epoch': 10.0})

In [42]:
print(trainer.evaluate())

{'eval_loss': 1.1719675064086914, 'eval_accuracy': 0.43272727272727274, 'eval_runtime': 0.5175, 'eval_samples_per_second': 1062.703, 'eval_steps_per_second': 17.39, 'epoch': 10.0}


In [43]:
predictions = trainer.predict(tokenized_test)
logits = predictions.predictions

predic_ = np.argmax(logits, axis=-1)
ref = predictions.label_ids
#print(predic_)
#print(predictions.predictions, predictions.label_ids)

from sklearn.metrics import cohen_kappa_score
ck = round(cohen_kappa_score(predic_, ref, weights="quadratic"), 2)
print("cohen kappa==> ",ck)

from sklearn.metrics import classification_report
print(classification_report(ref, predic_))

cohen kappa==>  0.17
              precision    recall  f1-score   support

           0       0.34      0.12      0.17       128
           1       0.00      0.00      0.00         5
           2       0.42      0.58      0.49       217
           3       0.47      0.53      0.50       185
           4       0.00      0.00      0.00        15

    accuracy                           0.43       550
   macro avg       0.25      0.24      0.23       550
weighted avg       0.40      0.43      0.40       550



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [44]:
"""
import boto3

s3 = boto3.client('s3')

# Define the bucket name and file details
bucket_name = "sagemaker-eu-central-1-505049265445"
file_path = "/home/ec2-user/tmps/llm_fine_tuning/model_saved/distilbert-base-ft-speaking-accuracy_onnx/config.json"
object_name = "models/config.json"  # S3 key for the file

# Upload the file
s3.upload_file(file_path, bucket_name, object_name)
print(f"File {file_path} uploaded to s3://{bucket_name}/{object_name}")
"""

'\nimport boto3\n\ns3 = boto3.client(\'s3\')\n\n# Define the bucket name and file details\nbucket_name = "sagemaker-eu-central-1-505049265445"\nfile_path = "/home/ec2-user/tmps/llm_fine_tuning/model_saved/distilbert-base-ft-speaking-accuracy_onnx/config.json"\nobject_name = "models/config.json"  # S3 key for the file\n\n# Upload the file\ns3.upload_file(file_path, bucket_name, object_name)\nprint(f"File {file_path} uploaded to s3://{bucket_name}/{object_name}")\n'