# 1. load library

In [1]:
import pandas as pd
from transformers import BertTokenizer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
import torch
# from datasets import load_dataset
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

  from .autonotebook import tqdm as notebook_tqdm





# 2. Load dataset

In [2]:
dataset = pd.read_csv("hf://datasets/tellikoroma/mentalhealth/mh_data.csv")

# 3. get dataset dan convert ke dataframe

In [47]:
dataset.head()

Unnamed: 0,tag,pattern,response
0,greeting,Hi,Hello there. Tell me how are you feeling today?
1,greeting,Hi,Hi there. What brings you here today?
2,greeting,Hi,Hi there. How are you feeling today?
3,greeting,Hi,Great to see you. How do you feel currently?
4,greeting,Hi,Hello there. Glad to see you're back. What's g...


# 4. Data Preprocessing

## 4.1 Cek Null value

In [49]:
dataset.isna().sum()

tag         0
pattern     0
response    0
dtype: int64

## 4.2 split data

In [57]:
train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)

# Convert DataFrames to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df[['pattern', 'tag']])
val_dataset = Dataset.from_pandas(val_df[['pattern', 'tag']])


# 5. Modelling

## 5.1 Tokenization

In [58]:
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['pattern'], padding='max_length', truncation=True)

# Tokenize datasets
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 637887/637887 [02:50<00:00, 3746.23 examples/s]
Map: 100%|██████████| 159472/159472 [00:41<00:00, 3862.19 examples/s]


In [61]:
# Optional: Map tags to integers for classification
tag2id = {tag: idx for idx, tag in enumerate(dataset['tag'].unique())}
train_dataset = tokenized_train_dataset.map(lambda x: {'label': tag2id[x['tag']]})
val_dataset = tokenized_val_dataset.map(lambda x: {'label': tag2id[x['tag']]})

Map: 100%|██████████| 637887/637887 [00:44<00:00, 14464.47 examples/s]
Map: 100%|██████████| 159472/159472 [00:09<00:00, 15974.84 examples/s]


In [62]:
# Create DatasetDict
tokenized_datasets = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

## 5.2 Create Model

In [64]:
# Initialize model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(tag2id))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation']
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return torch._C._cuda_getDeviceCount() > 0


## 5.3 train model

In [65]:
# Train the model
trainer.train()

  0%|          | 39/239208 [34:12<3639:31:40, 54.78s/it]

# 6. Test