In [1]:
from datasets import load_dataset, Audio
from transformers import MimiModel, AutoFeatureExtractor

import torch
from snac import SNAC
import torchaudio
import soundfile as sf

In [20]:
filepath = "00001_001_sales_000_1.wav"
target_sample_rate = 24000

# Load the audio file. `sample_rate` is the file's native rate and is NOT
# updated after resampling below; use `target_sample_rate` for the rate of `audio`.
waveform, sample_rate = torchaudio.load(filepath)

# Ensure it's mono *before* resampling: averaging and resampling are both
# linear, so the result is the same up to float rounding, but only a single
# channel has to go through the resampling filter.
if waveform.shape[0] > 1:
    waveform = torch.mean(waveform, dim=0, keepdim=True)

# Resample if necessary
if sample_rate != target_sample_rate:
    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
    waveform = resampler(waveform)

# Drop the leading (now size-1) channel axis so `audio` is a 1-D tensor.
audio = waveform.squeeze(0)

In [22]:
# Sanity check: `audio` should be 1-D after the squeeze(0) in the loading cell.
audio.shape

torch.Size([334383])

In [23]:
# The codec weights and the matching pre-processor are fetched from the same
# checkpoint, so name it once instead of repeating the literal.
MIMI_CHECKPOINT = "kyutai/mimi"

feature_extractor = AutoFeatureExtractor.from_pretrained(MIMI_CHECKPOINT)
model = MimiModel.from_pretrained(MIMI_CHECKPOINT)

In [24]:
# Rate the feature extractor is configured for; compare against target_sample_rate.
feature_extractor.sampling_rate

24000

In [25]:
# pre-process the inputs
# Pass the rate the waveform was actually resampled to. Passing the extractor's
# own `sampling_rate` attribute back to it would make its rate check compare a
# value with itself, so a mismatched waveform could never be detected.
# `audio` is converted to a NumPy array, the extractor's documented input type.
inputs = feature_extractor(
    raw_audio=audio.numpy(),
    sampling_rate=target_sample_rate,
    return_tensors="pt",
)

In [33]:
# Display the raw waveform so it can be compared by eye with the feature
# extractor's `input_values`.
audio

tensor([ 0.0000,  0.0000,  0.0000,  ..., -0.0017, -0.0021, -0.0006])

In [32]:
# Index down to a 1-D tensor (presumably batch 0, channel 0 — TODO confirm the
# axis order) to compare against the raw `audio` tensor.
inputs["input_values"][0][0]

tensor([ 0.0000,  0.0000,  0.0000,  ..., -0.0017, -0.0021, -0.0006])

In [29]:
# explicitly encode then decode the audio inputs
# This is inference only: disable autograd so no graph is recorded through the
# codec, which would otherwise hold on to intermediate activations.
with torch.no_grad():
    encoder_outputs = model.encode(inputs["input_values"])
    # Element [0] of the decoder output is taken as the reconstructed waveform.
    audio_values = model.decode(encoder_outputs.audio_codes)[0]

In [31]:
# --- 4. Save the Reconstructed Audio ---
output_filepath = "mimi.wav"
print(f"Saving reconstructed audio to: {output_filepath}")

# Detach from autograd, move to host memory, drop singleton axes, and hand
# soundfile a plain NumPy array.
audio_to_save = audio_values.detach().cpu().squeeze().numpy()
sf.write(file=output_filepath, data=audio_to_save, samplerate=target_sample_rate)

print("Done.")

Saving reconstructed audio to: mimi.wav
Done.
