Nama : M. Iqbal Baskoro
# PROJECT 3 : VOICE RECOGNITION
## Klasifikasi Maksud Ucapan (Intent Classification)

#### TUJUAN :
Membangun model yang dapat mengenali dan
mengklasifikasikan maksud (intent) dari sebuah rekaman suara.

#### DATASET:
https://huggingface.co/datasets/PolyAI/minds14

#### <Strong>Data Field</Strong>

| Nama Kolom | Tipe Data | Keterangan |
|:----------:|:---------:|:----------:|
|`path `| String | Path to the audio file |
|`audio `| Dict | Audio object including loaded audio array, sampling rate and path to audio |
|`transcription `| String | Transcription of the audio file |
|`english_transcription `| String | English transcription of the audio file |
|`intent_class `| Integer | Class id of intent |
|`lang_id `| Integer | Id of language |

#### <strong>Penggunaan Bahasa Pada Dataset</strong>
-`all`

#### <strong>Pengolahan Dataset</strong>
-`Pembagian Dataset` = <strong>Training</strong>(70%), <strong>Validasi</strong>(15%), dan <strong>Test</strong>(15%) <br>
-`Audio Preprocessing` = <strong>Resampling</strong>, Audio diseragamkan ke sampling rate 16000 Hz <br>


#### MODEL :
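Training model menggunakan `Wav2Vec2` pra-latih (`facebook/wav2vec2-base-960h`) dengan lapisan dasar dibekukan dan classification head yang dilatih <br>
Feature Extraction dilakukan oleh `Wav2Vec2FeatureExtractor` langsung dari gelombang audio mentah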

In [None]:
from datasets import load_dataset

# Request every locale ("all") of MINDS-14; only the "train" split is asked for here.
# NOTE(review): trust_remote_code=True allows the dataset's own loading script to run
# locally - confirm the source is trusted before executing this cell.
load_kwargs = {"name": "all", "split": "train", "trust_remote_code": True}

print("Memuat dataset MINDS-14...")
dataset = load_dataset("PolyAI/minds14", **load_kwargs)
print("\nDataset berhasil dimuat:")
print(dataset)

: 

In [None]:
from pathlib import Path

import pandas as pd
from IPython.display import display

# Flatten the Hugging Face dataset into a DataFrame for tabular inspection.
df = dataset.to_pandas()

# Persist a CSV snapshot. The target folder is created first because
# DataFrame.to_csv does not create missing parent directories and would
# otherwise fail on a fresh checkout.
csv_path = "../data/minds14.csv"
Path(csv_path).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(csv_path, index=False)

# Report the path that was actually written instead of repeating it as a literal.
print(f"Data sudah disimpan ke {csv_path}")

In [None]:
# Display basic information about the DataFrame
df.info()

Pengolahan Dataset <br>
-Pembagian Dataset = Training (70%), Validation (15%), dan Test (15%) <br>
-Audio Preprocessing = Resampling, Audio diseragamkan ke sampling rate 16000 Hz agar sesuai dengan model Wav2Vec2. <br>
-Ekstraksi Fitur = Model Wav2Vec2 akan mengekstraksi fitur audio secara internal dari data gelombang mentah. <br>
-Normalisasi = Fitur audio akan dinormalisasi secara internal oleh feature extractor model.

In [None]:
# Preview the first rows of the DataFrame (shown as the cell's output).
df.head()

In [None]:
from IPython.display import Audio, display

# Use the first record as the running example for the inspection cells.
sample = dataset[0]
print("\nStruktur satu sampel data:")
for field_name in sample:
    print(f"  - {field_name}: {sample[field_name]}")

# Human-readable class names stored on the `intent_class` feature.
intent_names = dataset.features['intent_class'].names
num_intents = len(intent_names)
print(f"\nDaftar Intent ({num_intents} kelas):")
print(intent_names)

sample_audio = sample['audio']
sample_intent = intent_names[sample['intent_class']]

print("\nContoh sample transkrip, intent, dan audio")
print(f"Transkripsi : '{sample['english_transcription']}'")
print(f"Intent : {sample_intent}")

# Inline audio player fed with the sample's waveform and its own sampling rate.
display(Audio(sample_audio['array'], rate=sample_audio['sampling_rate']))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
import pandas as pd

sns.set_theme(style="whitegrid")

# Translate every integer class id into its readable name, then count samples
# per intent so the bars can be ordered from most to least frequent.
intent_labels = [intent_names[i] for i in dataset['intent_class']]
value_counts = pd.Series(intent_labels).value_counts()

plt.figure(figsize=(12, 8))
# `hue` mirrors `y` so the palette assigns one colour per intent.
ax = sns.countplot(y=intent_labels, hue=intent_labels, order=value_counts.index, palette="viridis")

# Annotate every bar with its count. Because `hue` is set, the bars can be
# spread over several containers (seaborn may create one per hue level), so
# labelling only ax.containers[0] would leave most bars without a number.
for container in ax.containers:
    ax.bar_label(container, fmt=' %d', label_type='edge', padding=5)

plt.title('Distribusi Kelas Intent dalam Dataset', fontsize=16)
plt.xlabel('Jumlah Sampel', fontsize=12)
plt.ylabel('Intent', fontsize=12)
# Leave 10% headroom on the x-axis so the edge labels are not clipped.
plt.xlim(0, value_counts.max() * 1.1)
plt.tight_layout()
plt.show()

In [None]:
sns.set_theme(style="darkgrid")

# Waveform and sampling rate of the example clip selected in the inspection cell.
y = sample['audio']['array']
sr = sample['audio']['sampling_rate']

# Draw on an explicit figure/axes pair instead of the implicit pyplot state.
wave_fig, wave_ax = plt.subplots(figsize=(12, 4))
librosa.display.waveshow(y, sr=sr, color='lightblue', ax=wave_ax)
wave_ax.set_title(f'Waveform Audio Sampel\nIntent: {intent_names[sample["intent_class"]]}', fontsize=14)
wave_ax.set_xlabel('Waktu (detik)', fontsize=12)
wave_ax.set_ylabel('Amplitudo', fontsize=12)
wave_fig.tight_layout()
plt.show()

In [None]:
from transformers import Wav2Vec2FeatureExtractor

# Pretrained checkpoint name; the feature extractor is loaded from it here.
model_checkpoint = "facebook/wav2vec2-base-960h"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_checkpoint)

# Two-way lookup between intent names and class ids (ids are stored as strings).
intent_names = dataset.features['intent_class'].names
label2id = {label: str(idx) for idx, label in enumerate(intent_names)}
id2label = {str(idx): label for idx, label in enumerate(intent_names)}
num_labels = len(id2label)

print("Nama-nama Intent :", intent_names)
print("\nJumlah Label :", num_labels)

In [None]:
def preprocess_function(examples):
    """Turn a batch of dataset rows into feature-extractor outputs plus labels.

    Args:
        examples: Batch mapping that provides at least the ``audio`` column
            (each entry exposing an ``"array"`` item) and the
            ``intent_class`` column.

    Returns:
        The feature extractor's output for the batch, with the intent ids
        attached under the ``"label"`` key.
    """
    target_rate = feature_extractor.sampling_rate
    # Upper bound of five seconds worth of samples; longer clips are cut
    # because truncation=True is passed below.
    max_samples = int(target_rate * 5.0)

    waveforms = [clip["array"] for clip in examples["audio"]]
    features = feature_extractor(
        waveforms,
        sampling_rate=target_rate,
        max_length=max_samples,
        truncation=True,
    )
    # Expose the class ids under the "label" key instead of "intent_class".
    features["label"] = examples["intent_class"]
    return features

In [None]:
# Imported under an alias on purpose: a bare `Audio` would overwrite
# IPython.display.Audio, which the sample-playback cell relies on, and that
# cell would break when re-run after this one.
from datasets import Audio as DatasetsAudio

# Have every clip decoded at 16 kHz.
dataset = dataset.cast_column("audio", DatasetsAudio(sampling_rate=16000))

# 70% train; the remaining 30% is halved into validation and test (15% each).
train_test_valid = dataset.train_test_split(test_size=0.3, seed=42)
test_valid = train_test_valid['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = train_test_valid['train']
val_dataset = test_valid['train']
test_dataset = test_valid['test']


def _encode(split):
    """Run preprocess_function batch-wise on a split and drop the original columns."""
    return split.map(preprocess_function, remove_columns=dataset.column_names, batched=True)


encoded_train_dataset = _encode(train_dataset)
encoded_val_dataset = _encode(val_dataset)
encoded_test_dataset = _encode(test_dataset)

print(f"Ukuran data train : {len(train_dataset)}")
print(f"Ukuran data validasi : {len(val_dataset)}")
print(f"Ukuran data test : {len(test_dataset)}")

In [None]:
from transformers import Wav2Vec2ForSequenceClassification

# Sequence-classification model built from the checkpoint, configured with the
# intent label mappings.
model = Wav2Vec2ForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)

# Freeze the Wav2Vec2 backbone: gradients are disabled for every parameter
# under model.wav2vec2, leaving the remaining parameters trainable.
model.wav2vec2.requires_grad_(False)
print("Lapisan dasar berhasil dibekukan.")

# Tally parameter counts, listing each parameter that will still be trained.
trainable_params = 0
all_param = 0
for name, param in model.named_parameters():
    size = param.numel()
    all_param += size
    if not param.requires_grad:
        continue
    trainable_params += size
    print(f"  - {name}")

trained_pct = 100 * trainable_params / all_param
summary = (
    f"\nTotal Parameter : {all_param}"
    f"\nParameter Training : {trainable_params}"
    f"\nPersentase yang dilatih : {trained_pct:.2f}%"
)
print(summary)

In [None]:
import numpy as np
import evaluate
from transformers import TrainingArguments, Trainer

accuracy_metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (scores whose argmax over
            axis 1 is taken as the predicted class) and ``label_ids``
            (the reference class ids).

    Returns:
        Whatever ``accuracy_metric.compute`` returns for those predictions
        and references.
    """
    scores = eval_pred.predictions
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy_metric.compute(
        predictions=predicted_ids,
        references=eval_pred.label_ids,
    )

# Local import: torch is only needed here to probe for a CUDA device.
import torch

# Training configuration: evaluate and checkpoint once per epoch, keep at most
# two checkpoints, and reload the one with the best accuracy when training ends.
training_args = TrainingArguments(
    output_dir="./wav2vec2-minds14-intent-classification",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Two accumulation steps of 8 samples per device per optimizer update.
    gradient_accumulation_steps=2,
    eval_strategy="epoch",
    num_train_epochs=5,
    # Request mixed precision only when a CUDA device is present; an
    # unconditional fp16=True is rejected on CPU-only setups, which would make
    # this cell fail before training even starts.
    fp16=torch.cuda.is_available(),
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=0.001,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
)

In [None]:
# Collect the Trainer inputs in one place: the model, its training arguments,
# the encoded train/validation splits and the accuracy callback.
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` - confirm against the installed version before changing.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": encoded_train_dataset,
    "eval_dataset": encoded_val_dataset,
    "tokenizer": feature_extractor,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Run the training loop, bracketed by start/end status messages.
print("\nMulai Training")
trainer.train()
print("\nTraining Selesai")

In [None]:
# Final check on the held-out test split; the returned metrics are printed as-is.
print("\nEvaluasi model pada test set...")
test_results = trainer.evaluate(eval_dataset=encoded_test_dataset)
print(test_results)