In [2]:
# One-time environment setup for this notebook; uncomment on a fresh environment.
# Prefer `%pip install ...` over `!pip install ...` so packages are installed into
# the running kernel's environment, and pin versions for reproducibility.
# !pip install datasets
# !pip install transformers
# !pip install snac
# !pip install soundfile
# torchaudio is pulled from the CPU-only PyTorch wheel index.
# !pip install torchaudio --index-url https://download.pytorch.org/whl/cpu

Collecting datasets
  Downloading datasets-4.2.0-py3-none-any.whl.metadata (18 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-21.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.4.1,>=0.3.0 (from datasets)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting pandas (from datasets)
  Downloading pandas-2.3.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (91 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.6.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting huggingface-hub<2.0,>=0.25.0 (from datasets)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (f

In [1]:
from datasets import load_dataset, Audio
from transformers import MimiModel, AutoFeatureExtractor

import torch
from snac import SNAC
import torchaudio
import soundfile as sf

In [2]:
# Input clip and the sample rate the rest of the notebook works at.
filepath = "00001_001_sales_000_1.wav"
target_sample_rate = 24000

# Read the clip from disk together with its native sample rate.
waveform, sample_rate = torchaudio.load(filepath)

# Convert to the target rate only when the file's rate differs from it.
needs_resampling = sample_rate != target_sample_rate
if needs_resampling:
    resampler = torchaudio.transforms.Resample(
        orig_freq=sample_rate,
        new_freq=target_sample_rate,
    )
    waveform = resampler(waveform)

# Collapse multi-channel audio to one channel by averaging over dim 0.
channel_count = waveform.shape[0]
if channel_count > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# Drop the leading channel dimension so `audio` is a flat sample vector.
audio = torch.squeeze(waveform, 0)

In [3]:
# Sanity check: after the mono downmix and squeeze(0), `audio` should be 1-D (samples only).
audio.shape

torch.Size([334383])

In [4]:
# Both the codec weights and its preprocessing config come from the same Hub checkpoint.
mimi_checkpoint = "kyutai/mimi"

model = MimiModel.from_pretrained(mimi_checkpoint)
feature_extractor = AutoFeatureExtractor.from_pretrained(mimi_checkpoint)

model.safetensors:   0%|          | 0.00/385M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

In [5]:
# Sample rate configured on the feature extractor; it should equal target_sample_rate
# (the rate the clip was resampled to when it was loaded).
feature_extractor.sampling_rate

24000

In [6]:
# Pre-process the waveform into model inputs.
#
# sampling_rate: pass the rate the clip was actually resampled to
# (target_sample_rate) rather than feature_extractor.sampling_rate. Handing the
# extractor its own attribute makes its sample-rate check trivially succeed, so
# a mismatch would go unnoticed if target_sample_rate were ever changed.
#
# raw_audio: the extractor's documented inputs are NumPy arrays / lists, so the
# tensor is converted explicitly. `audio` comes from torchaudio.load on CPU and
# carries no gradient, so .numpy() is valid here.
inputs = feature_extractor(
    raw_audio=audio.numpy(),
    sampling_rate=target_sample_rate,
    return_tensors="pt",
)

In [7]:
# Display the loaded waveform so it can be compared by eye with the extractor output
# (inputs["input_values"][0][0]) shown in the following cell.
audio

tensor([ 0.0000,  0.0000,  0.0000,  ..., -0.0017, -0.0021, -0.0006])

In [8]:
# Element [0][0] of the extractor output (presumably batch 0, channel 0 - confirm),
# shown for side-by-side comparison with the raw `audio` tensor.
inputs["input_values"][0][0]

tensor([ 0.0000,  0.0000,  0.0000,  ..., -0.0017, -0.0021, -0.0006])

In [29]:
# Explicitly encode the inputs to codec tokens, then decode them back to audio.
#
# This is inference only, so it runs under torch.no_grad(): without it autograd
# tracks every operation and keeps intermediate activations alive for a backward
# pass that never happens, which wastes memory on long clips.
with torch.no_grad():
    encoder_outputs = model.encode(inputs["input_values"])
    # Index 0 of the decode output is used as the reconstructed waveform.
    audio_values = model.decode(encoder_outputs.audio_codes)[0]

In [31]:
# Write the reconstructed waveform to disk.
output_filepath = "mimi.wav"

print(f"Saving reconstructed audio to: {output_filepath}")

# Strip autograd tracking, make sure the data is in host memory, remove
# singleton dimensions, and convert to the NumPy array soundfile expects.
audio_to_save = audio_values.detach().cpu().squeeze().numpy()

sf.write(
    file=output_filepath,
    data=audio_to_save,
    samplerate=target_sample_rate,
)
print("Done.")

Saving reconstructed audio to: mimi.wav
Done.
