In [60]:
import os
import librosa
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

# Pfad zum TIMIT-Train-Ordner 
TIMIT_DIR = "/home/aaron/automated_speech_recognition/TIMIT/TIMIT/TRAIN"
 # //wsl.localhost/Ubuntu/home/aaron
 # Beispiel: "/Users/deinname/Daten/TIMIT/TRAIN"

# //wsl.localhost/Ubuntu/home/aaron/automated_speech_recognition/TIMIT/TIMIT

# Einstellungen für Audioverarbeitung
SAMPLE_RATE = 16000   # Einheitliche Samplingrate (16 kHz ist Standard für ASR)
N_MFCC = 13           # Anzahl der MFCC-Koeffizienten pro Frame

# Funktion zum Extrahieren von MFCC + Delta + Delta-Delta aus einer WAV-Datei

# this funcion uses the librosa package to extract the MFCC
def extract_mfcc(wav_path, sample_rate=SAMPLE_RATE, n_mfcc=N_MFCC):
    # load audio signal (resample if necessary)
    signal, sr = librosa.load(wav_path, sr=sample_rate)

    # extract mfcc with librosa
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

    # calculate first delta
    delta = librosa.feature.delta(mfcc)

    # calculate second delta
    delta2 = librosa.feature.delta(mfcc, order=2)

    # combine all features (static, 1-delat, 2-delta) to a 39 matrix (features per timeslot)
    combined = np.vstack([mfcc, delta, delta2])  # shape: [39, time]

    #transpone the matrix for [time_steps, 39]
    return combined.T

def load_transcript(txt_path):
    """
    Lädt die komplette Transkriptionszeile aus .TXT oder .WRD
    TIMIT .TXT-Dateien enthalten typischerweise eine Zeile wie:
      THIS IS A SAMPLE .
    """
    with open(txt_path, 'r', encoding='utf8') as f:
        # ganze Zeile, ohne Zeilenende
        return f.read().strip()

def load_phoneme_labels(phn_path):
    """
    Lädt die PHN-Datei und gibt eine einfache Liste von Phonemen zurück,
    z.B. ['sil', 'sh', 'iy', 'hv', ...]
    """
    phonemes = []
    with open(phn_path, 'r', encoding='utf8') as f:
        for line in f:
            _, _, label = line.strip().split()
            phonemes.append(label)
    return phonemes

In [61]:
def load_timit_data(timit_root):
    data = []  # Liste für alle Datenpunkte

    for root, _, files in os.walk(timit_root):
        print("Durchsuche Ordner:", root)
        print("Dateien:", files)
        for file in files:
            if file.lower().endswith(".wav"):
                wav_path = os.path.join(root, file)
                txt_path = wav_path.replace(".wav", ".TXT").replace(".WAV", ".TXT")
                phn_path = wav_path.replace(".wav", ".PHN").replace(".WAV", ".PHN")  

                if not os.path.exists(txt_path):
                    print("Keine Transkript-Datei gefunden für:", wav_path)
                    continue

                try:
                    mfcc_seq = extract_mfcc(wav_path)
                    # transcript = load_transcript(txt_path)
                    # tokens = list(transcript)
                    
                    phonemes = load_phoneme_labels(phn_path)
                    # phonemes_61 = load_phoneme_labels(phn_path)
                    # phonemes_39 = map_61_to_39(phonemes)

                    data.append({
                        "mfcc": mfcc_seq,
                        # "transcript": transcript,
                        # "tokens": tokens,
                        "phonemes": phonemes,
                        "path": wav_path, 
                        # "phonemes_61": phonemes_61,
                        # "phonemes_39": phonemes_39,
                    })

                except Exception as e:
                    print(f"Fehler bei {wav_path}: {e}")

    return data

# this function only loads the mfccs out of the timt data    
# def load_timit_data(timit_root):
#     data = []  

#     for root, _, files in os.walk(timit_root):
#         print("Durchsuche Ordner:", root)
#         print("Dateien:", files)
#         for file in files:
#             if file.lower().endswith(".wav"):
#                 wav_path = os.path.join(root, file)
                

#                 try:
#                     mfcc_seq = extract_mfcc(wav_path)
                   
#                     data.append({
#                         "mfcc": mfcc_seq,
#                     })

#                 except Exception as e:
#                     print(f"Fehler bei {wav_path}: {e}")

#     return data

In [62]:
dataset = load_timit_data(TIMIT_DIR)
# 🖨️ Beispielausgabe: Anzahl und Beispiel
print(f"Geladene Beispiele: {len(dataset)}")
# print("Beispieltranskript:", dataset[0]["transcript"])
print("Tokenliste:", dataset[0]["tokens"])
print("MFCC Shape:", dataset[0]["mfcc"].shape)

Durchsuche Ordner: /home/aaron/automated_speech_recognition/TIMIT/TIMIT/TRAIN
Dateien: []
Durchsuche Ordner: /home/aaron/automated_speech_recognition/TIMIT/TIMIT/TRAIN/DR6
Dateien: []
Durchsuche Ordner: /home/aaron/automated_speech_recognition/TIMIT/TIMIT/TRAIN/DR6/MEAL0
Dateien: ['SX347.wav:Zone.Identifier', 'SI2177.wav:Zone.Identifier', 'SX107.wav:Zone.Identifier', 'SA1.PHN:Zone.Identifier', 'SI1547.WRD', 'SX347.PHN', 'SI2177.wav', 'SX287.PHN:Zone.Identifier', 'SI1547.wav:Zone.Identifier', 'SI917.PHN:Zone.Identifier', 'SI1547.PHN:Zone.Identifier', 'SI917.TXT', 'SX197.TXT', 'SX347.TXT', 'SX107.wav', 'SI917.wav', 'SX197.wav:Zone.Identifier', 'SI1547.WRD:Zone.Identifier', 'SX197.TXT:Zone.Identifier', 'SI2177.WRD:Zone.Identifier', 'SA2.PHN:Zone.Identifier', 'SA1.WRD:Zone.Identifier', 'SI917.wav:Zone.Identifier', 'SX347.wav', 'SX377.WRD', 'SX377.TXT:Zone.Identifier', 'SX377.TXT', 'SX287.WRD', 'SX287.PHN', 'SX377.wav:Zone.Identifier', 'SA1.WRD', 'SX197.PHN', 'SA2.TXT', 'SX347.TXT:Zone.Iden

KeyError: 'tokens'

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


# # === MFCCs save for PyTorch ===
def save_data(data, output_path):
    torch.save(data, output_path)
    print(f"Gespeichert unter: {output_path}")

# # === PyTorch Dataset ===
class TIMITMFCCDataset(Dataset):
    def __init__(self, data_path):
        self.data = torch.load(data_path)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        mfcc = self.data[idx]["mfcc"]
        return torch.tensor(mfcc, dtype=torch.float32)

# === Collate-Funktion für Padding ===
def pad_collate(batch):
    return pad_sequence(batch, batch_first=True)  # => [batch, max_len, 39]#

In [64]:
# update the location!!
save_data(dataset, "/home/aaron/automated_speech_recognition/data1/timit_mfcc_data.pt")

Gespeichert unter: /home/aaron/automated_speech_recognition/data1/timit_mfcc_data.pt


In [65]:
# Lade die Datei
data = torch.load("/home/aaron/automated_speech_recognition/data1/timit_mfcc_data.pt", weights_only=False)

# analysing the data
print(type(data))            # z.B. <class 'list'>
print(len(data))             # z.B. Anzahl der Beispiele
print(type(data[0]))         # z.B. <class 'dict'>
print(data[0].keys())        # z.B. dict_keys(['mfcc'])
print(data[0]['mfcc'].shape) # z.B. torch.Size([123, 39])




<class 'list'>
4620
<class 'dict'>
dict_keys(['mfcc', 'phonemes', 'path'])
(78, 39)


In [66]:
data[1]

{'mfcc': array([[-7.43217407e+02,  5.11423645e+01,  3.72760468e+01, ...,
          4.05487746e-01, -1.76596975e+00,  4.28943634e-01],
        [-7.14529724e+02,  3.65434113e+01,  3.51341553e+01, ...,
          4.05487746e-01, -1.76596975e+00,  4.28943634e-01],
        [-7.24818176e+02,  5.14628887e+00,  1.87460880e+01, ...,
          4.05487746e-01, -1.76596975e+00,  4.28943634e-01],
        ...,
        [-5.98549316e+02,  8.02442474e+01,  7.14992046e+00, ...,
         -8.71393144e-01, -3.89135652e-03,  1.12605953e+00],
        [-6.42915771e+02,  6.33814735e+01,  1.24211855e+01, ...,
         -8.71393144e-01, -3.89135652e-03,  1.12605953e+00],
        [-6.82504211e+02,  6.10673752e+01,  2.24931469e+01, ...,
         -8.71393144e-01, -3.89135652e-03,  1.12605953e+00]],
       shape=(65, 39), dtype=float32),
 'phonemes': ['h#',
  'b',
  'er',
  'th',
  'dcl',
  'd',
  'ey',
  'pcl',
  'p',
  'aa',
  'r',
  'dx',
  'iy',
  'z',
  'hh',
  'eh',
  'v',
  'kcl',
  'k',
  'ah',
  'pcl',
  'k',

In [34]:
# read a phonme data; data analyseses
def read_phonemes(phn_path):
    phonemes = []
    with open(phn_path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) == 3:
                _, _, phoneme = parts
                phonemes.append(phoneme)
    return phonemes

read_phonemes("/home/aaron/automated_speech_recognition/TIMIT/TIMIT/TRAIN/DR1/FCJF0/SA1.PHN")

['h#',
 'sh',
 'ix',
 'hv',
 'eh',
 'dcl',
 'jh',
 'ih',
 'dcl',
 'd',
 'ah',
 'kcl',
 'k',
 's',
 'ux',
 'q',
 'en',
 'gcl',
 'g',
 'r',
 'ix',
 's',
 'ix',
 'w',
 'ao',
 'sh',
 'epi',
 'w',
 'ao',
 'dx',
 'axr',
 'ao',
 'l',
 'y',
 'ih',
 'axr',
 'h#']

In [None]:
# Funktion zur Textnormalisierung (vereinfachte Variante)
def normalize_text(text):
    # In Kleinbuchstaben umwandeln
    text = text.lower()

    # Sonderzeichen entfernen (nur Buchstaben, Zahlen, Leerzeichen bleiben erhalten)
    text = ''.join(c for c in text if c.isalnum() or c.isspace())

    # Überflüssige Leerzeichen am Anfang/Ende entfernen
    return text.strip()

# Funktion zum Laden der Transkription aus der zugehörigen .TXT-Datei
def load_transcript(txt_path):
    with open(txt_path, 'r') as f:
        # Jede .TXT-Datei enthält eine Zeile wie: 0 3400 she had your dark suit
        # Wir holen nur den tatsächlichen Text (nach dem zweiten Leerzeichen)
        return normalize_text(f.readline().split(' ', 2)[-1])

{'mfcc': array([[-7.43217407e+02,  5.11423645e+01,  3.72760468e+01, ...,
          4.05487746e-01, -1.76596975e+00,  4.28943634e-01],
        [-7.14529724e+02,  3.65434113e+01,  3.51341553e+01, ...,
          4.05487746e-01, -1.76596975e+00,  4.28943634e-01],
        [-7.24818176e+02,  5.14628887e+00,  1.87460880e+01, ...,
          4.05487746e-01, -1.76596975e+00,  4.28943634e-01],
        ...,
        [-5.98549316e+02,  8.02442474e+01,  7.14992046e+00, ...,
         -8.71393144e-01, -3.89135652e-03,  1.12605953e+00],
        [-6.42915771e+02,  6.33814735e+01,  1.24211855e+01, ...,
         -8.71393144e-01, -3.89135652e-03,  1.12605953e+00],
        [-6.82504211e+02,  6.10673752e+01,  2.24931469e+01, ...,
         -8.71393144e-01, -3.89135652e-03,  1.12605953e+00]],
       shape=(65, 39), dtype=float32)}