In [1]:
# Clone the VITS repository
!git clone https://github.com/jaywalnut310/vits.git
%cd vits

# Install required libraries
!pip install -r requirements.txt
!pip install librosa soundfile websockets


Cloning into 'vits'...
remote: Enumerating objects: 81, done.[K
remote: Total 81 (delta 0), reused 0 (delta 0), pack-reused 81 (from 1)[K
Receiving objects: 100% (81/81), 3.33 MiB | 12.55 MiB/s, done.
Resolving deltas: 100% (22/22), done.
/content/vits
Collecting Cython==0.29.21 (from -r requirements.txt (line 1))
  Downloading Cython-0.29.21-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting librosa==0.8.0 (from -r requirements.txt (line 2))
  Downloading librosa-0.8.0.tar.gz (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting matplotlib==3.3.1 (from -r requirements.txt (line 3))
  Downloading matplotlib-3.3.1.tar.gz (38.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.8/38.8 MB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy==1.18.5 (from -r 

In [None]:
import os
import librosa
import soundfile as sf
from sklearn.model_selection import train_test_split

# Define paths
# NOTE(review): data_dir is not assigned in any cell visible in this notebook;
# it has to be defined before this cell can run on a fresh kernel.
audio_path = f'{data_dir}/audio'  # Original audio files location
processed_audio_path = f'{data_dir}/processed_audio'
os.makedirs(processed_audio_path, exist_ok=True)

TARGET_SAMPLE_RATE = 22050  # Hz; every clip is resampled to this rate on load
SPLIT_SEED = 42             # fixed seed so the split is reproducible

# Preprocess audio files (normalize and resample to TARGET_SAMPLE_RATE)
for file_name in sorted(os.listdir(audio_path)):
    if not file_name.endswith('.wav'):
        continue
    audio, sr = librosa.load(os.path.join(audio_path, file_name), sr=TARGET_SAMPLE_RATE)
    norm_audio = librosa.util.normalize(audio)
    sf.write(os.path.join(processed_audio_path, file_name), norm_audio, sr)

# Split into train, validation, and test sets (80/10/10 ratio).
# Only .wav entries are considered, so stray files or directories in the
# output folder cannot end up in the file lists.
audio_files = sorted(
    entry for entry in os.listdir(processed_audio_path) if entry.endswith('.wav')
)
# First carve off 20% as a hold-out, then halve it into validation and test.
train_files, holdout_files = train_test_split(
    audio_files, test_size=0.2, random_state=SPLIT_SEED
)
val_files, test_files = train_test_split(
    holdout_files, test_size=0.5, random_state=SPLIT_SEED
)

# Save file lists for train, val, and test sets
def save_file_list(file_list, filename):
    """Write each entry of ``file_list`` to ``filename``, one entry per line.

    Args:
        file_list: Iterable of entries; each is formatted into the file
            followed by a newline.
        filename: Path of the text file to write. It is opened in ``'w'``
            mode, so existing content is replaced.
    """
    with open(filename, 'w') as out:
        out.writelines(f"{entry}\n" for entry in file_list)

# Persist each split next to the data: (entries, destination list file).
for split_entries, list_path in (
    (train_files, f'{data_dir}/train_files.txt'),
    (val_files, f'{data_dir}/val_files.txt'),
    (test_files, f'{data_dir}/test_files.txt'),
):
    save_file_list(split_entries, list_path)


In [None]:
import json

import os

# Load configuration file
config_path = './configs/config.json'
# No visible cell creates config_path, so on a fresh clone opening it fails
# with FileNotFoundError. When it is absent, start from a base config instead;
# when it exists, behaviour is unchanged.
# NOTE(review): the fallback assumes the upstream checkout ships
# configs/ljs_base.json — confirm this is the intended base config.
base_config_path = config_path if os.path.exists(config_path) else './configs/ljs_base.json'
with open(base_config_path, 'r') as f:
    config = json.load(f)

# Update dataset paths in configuration
# NOTE(review): the list files written by the preprocessing cell contain bare
# file names only; confirm that is the format train.py's data loader expects.
config['data']['training_files'] = f'{data_dir}/train_files.txt'
config['data']['validation_files'] = f'{data_dir}/val_files.txt'
config['data']['sampling_rate'] = 22050  # Ensure sampling rate matches preprocessing

# Save updated config (always to config_path, which the training cell reads)
with open(config_path, 'w') as f:
    json.dump(config, f, indent=4)


In [None]:
# Train the model
!python train.py -c ./configs/config.json -m my_non_hindi_model

# Save checkpoints to Google Drive after certain intervals (e.g., every few hours)
!cp -r logs/my_non_hindi_model {checkpoint_dir}


In [None]:
# Evaluate the model
# Runs evaluate.py with the same config and model name used for training and
# passes ./output as the output path.
# NOTE(review): confirm evaluate.py exists in the cloned checkout and accepts
# these flags — TODO verify before relying on this cell.
!python evaluate.py -c ./configs/config.json -m my_non_hindi_model --output_path ./output


In [None]:
import websockets
import asyncio
import soundfile as sf
from io import BytesIO

async def process_audio(websocket, path=None):
    """Convert every audio payload received on a connection to FLAC and send it back.

    Args:
        websocket: The connection. It is iterated for incoming messages and
            used to send each converted payload.
        path: Unused. Kept for callers that pass the request path as a second
            positional argument; it defaults to ``None`` so the handler also
            works when it is called with the connection only (a required
            ``path`` raises ``TypeError`` in that case).
    """
    async for message in websocket:
        # Load WAV data from received message (decoded entirely in memory)
        wav_data = BytesIO(message)
        audio, sr = sf.read(wav_data)

        # Convert to FLAC at the same sample rate, again in memory
        flac_data = BytesIO()
        sf.write(flac_data, audio, sr, format='FLAC')
        await websocket.send(flac_data.getvalue())

# Run the WebSocket server (for local deployment)
async def run_server(host="localhost", port=8765):
    """Serve process_audio on host:port until the surrounding task is cancelled."""
    async with websockets.serve(process_audio, host, port):
        await asyncio.Future()  # never completes; keeps the server open

try:
    running_loop = asyncio.get_running_loop()
except RuntimeError:
    # No loop is running (plain script): create one and block on the server.
    asyncio.run(run_server())
else:
    # A loop is already running (as in a notebook kernel), where
    # run_until_complete() raises "This event loop is already running".
    # Schedule the server on that loop instead; cancel server_task to stop it.
    server_task = running_loop.create_task(run_server())
