In [1]:
! pip install torch torchaudio librosa



In [2]:
! pip install SpeechRecognition
! pip install pydub
! pip install ffmpeg



In [1]:
import os

def print_files_in_folder(folder_path):
    """
    List the entries of a folder on standard output, one name per line.

    A short diagnostic message is printed instead when the path does not
    exist, is not a directory, or the directory has no entries.

    Args:
      folder_path: The path to the folder.
    """
    if os.path.isdir(folder_path):
        entries = os.listdir(folder_path)
        if entries:
            print(f"Files in {folder_path}:")
            for entry in entries:
                print(entry)
        else:
            print(f"No files found in {folder_path}")
    elif os.path.exists(folder_path):
        # The path exists but is something other than a directory (e.g. a file).
        print(f"{folder_path} is not a directory.")
    else:
        print(f"Folder not found: {folder_path}")

# Example usage: list the contents of the 'recordings' folder (path is relative
# to the notebook's working directory).
folder_path = 'recordings'  # Replace with the path to your folder
print_files_in_folder(folder_path)


Files in recordings:
Bait.mp3
Bait.wav
Intro.mp3
Video 1 (6).mp3
Video 2 (7-22).mp3
Video 3.mp3
Video 4.mp3


In [4]:
## Linux only code
# from pydub import AudioSegment
# import os

# def convert_mp3_to_wav(directory):
#     """
#     Convert all MP3 files in the given directory to WAV format.

#     Args:
#       directory: The path to the directory containing MP3 files.
#     """
#     for filename in os.listdir(directory):
#         if filename.endswith(".mp3"):
#             mp3_path = os.path.join(directory, filename)
#             wav_path = os.path.join(directory, filename.replace(".mp3", ".wav"))

#             # Convert MP3 to WAV
#             audio = AudioSegment.from_mp3(mp3_path)
#             audio.export(wav_path, format="wav")
#             print(f"Converted {filename} to WAV format")

# # Example usage
# convert_mp3_to_wav('recordings')  # Replace with your directory path


In [2]:
import os
import speech_recognition as sr
import json

def transcribe_audio(wav_path):
    """
    Run Google speech recognition over a WAV file and return the result.

    Args:
      wav_path: The path to the WAV audio file.

    Returns:
      The recognized text. When recognition raises UnknownValueError the
      placeholder "Audio Unintelligible" is returned; when it raises
      RequestError the placeholder "Request Failed" is returned.
    """
    engine = sr.Recognizer()

    with sr.AudioFile(wav_path) as wav_source:
        # Capture the audio from the opened file, then hand it to the recognizer.
        recording = engine.record(wav_source)
        try:
            outcome = engine.recognize_google(recording)
        except sr.UnknownValueError:
            outcome = "Audio Unintelligible"
        except sr.RequestError:
            outcome = "Request Failed"

    return outcome

def create_transcripts_file(audio_folder_path):
    """
    Transcribe every '.wav' file in a folder and save the results as JSON.

    The result is written to 'transcripts.json' in the current working
    directory as a mapping of file name -> transcript text.

    Args:
      audio_folder_path: The path to the audio folder.
    """
    results = {
        entry: transcribe_audio(os.path.join(audio_folder_path, entry))
        for entry in os.listdir(audio_folder_path)
        if entry.endswith('.wav')
    }

    with open('transcripts.json', 'w') as out_file:
        json.dump(results, out_file, indent=4)

# Example usage: transcribe the '.wav' files under 'recordings' and write
# 'transcripts.json' into the current working directory.
create_transcripts_file('recordings')


In [8]:
import os
import librosa
import json

def preprocess_data(audio_folder, transcript_file,
                    failed_transcripts=("Audio Unintelligible", "Request Failed")):
    """
    Pair each '.wav' recording in a folder with its saved transcript.

    Args:
      audio_folder: Path to the folder containing the '.wav' files.
      transcript_file: Path to a JSON file mapping file name -> transcript.
      failed_transcripts: Transcript values that mark a failed transcription
        (the placeholders returned by transcribe_audio). Recordings whose
        transcript equals one of these are skipped so that error messages are
        never used as training targets. Pass an empty tuple to keep them.

    Returns:
      A list of dicts with the keys 'audio' and 'sampling_rate' (the two values
      returned by librosa.load) and 'transcript' (the text from the JSON file).
      Files without a transcript, with an empty transcript, or with a failed
      transcript are left out.
    """
    with open(transcript_file, 'r') as f:
        transcripts = json.load(f)

    data = []
    # Sort the listing so the dataset order does not depend on the file system.
    for filename in sorted(os.listdir(audio_folder)):
        if not filename.endswith('.wav'):
            continue

        transcript = transcripts.get(filename)
        if not transcript or transcript in failed_transcripts:
            continue

        path = os.path.join(audio_folder, filename)
        # sr=None asks librosa to keep the file's native sampling rate. The result is
        # bound to `sampling_rate` (not `sr`) to avoid shadowing the notebook's
        # `speech_recognition as sr` alias.
        audio, sampling_rate = librosa.load(path, sr=None)
        data.append({'audio': audio, 'transcript': transcript, 'sampling_rate': sampling_rate})

    return data

# Build the in-memory dataset from the 'recordings' folder and the saved
# 'transcripts.json'; later cells read it through the name `dataset`.
dataset = preprocess_data('recordings', 'transcripts.json')


In [9]:
# Peek at the start of the dataset; the slice simply yields fewer items when
# the dataset holds fewer than five records.
print(dataset[:5])  # Print the first 5 elements of the dataset


[{'audio': array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 'transcript': 'get your free LinkedIn no good AI generated headshot snap no need to spend hours to get ready to pay a photographer to get the headshot script Di is here to save you with the bonus free prompt at the end head to update gaana.com now', 'sampling_rate': 48000}]


In [10]:
import torch.nn as nn

class SimpleTTSModel(nn.Module):
    """
    A stacked LSTM followed by a linear layer applied at every time step.

    Args:
      input_size: Number of features per time step expected by the LSTM.
      hidden_size: Size of the LSTM hidden state (and input size of the linear layer).
      num_layers: Number of stacked LSTM layers.
      output_size: Number of values produced per time step.

    The defaults reproduce the original fixed architecture (100 -> 128 x 2 -> 1),
    and the submodule names `lstm` / `fc` are unchanged, so state_dicts saved
    with the default configuration remain loadable.
    """

    def __init__(self, input_size=100, hidden_size=128, num_layers=2, output_size=1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,  # inputs are laid out as (batch, seq, feature)
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """
        Run the LSTM over `x` and project every time step through the linear layer.

        Args:
          x: Input tensor whose last dimension equals `input_size`.

        Returns:
          A tensor with the same leading dimensions as `x` and a last
          dimension of `output_size`.
        """
        # The LSTM also returns its final (hidden, cell) state, which is not needed here.
        x, _ = self.lstm(x)
        return self.fc(x)


In [11]:
# import torch
# from torch.utils.data import DataLoader
# import torch.nn as nn
# import torch.optim as optim
# import numpy as np

# # Assuming you have a model defined as `SimpleTTSModel`
# # and your dataset prepared as `dataset`

# # A function to convert text to a tensor (this is a placeholder, you'll need to define this based on your model's needs)
# def text_to_tensor(text):
#     # This function should convert text to a tensor
#     # For simplicity, let's assume each character is converted to an ASCII value
#     return torch.tensor([ord(c) for c in text], dtype=torch.float32)

# # A function to prepare your batch
# def prepare_batch(batch):
#     audio_tensors = [torch.tensor(item['audio'], dtype=torch.float32) for item in batch]
#     text_tensors = [text_to_tensor(item['transcript']) for item in batch]

#     # Padding sequences to have same length
#     audio_tensors = nn.utils.rnn.pad_sequence(audio_tensors, batch_first=True)
#     text_tensors = nn.utils.rnn.pad_sequence(text_tensors, batch_first=True)

#     return audio_tensors, text_tensors

# # Custom Dataset class
# class CustomDataset(torch.utils.data.Dataset):
#     def __init__(self, data):
#         self.data = data

#     def __len__(self):
#         return len(self.data)

#     def __getitem__(self, idx):
#         return self.data[idx]

# # Creating a DataLoader
# batch_size = 4  # You can modify this based on your dataset and GPU capabilities
# train_loader = DataLoader(CustomDataset(dataset), batch_size=batch_size, shuffle=True, collate_fn=prepare_batch)

# # Training function
# def train(model, train_loader, epochs):
#     model.train()
#     criterion = nn.MSELoss()
#     optimizer = optim.Adam(model.parameters(), lr=0.001)

#     for epoch in range(epochs):
#         for batch_idx, (audio, transcripts) in enumerate(train_loader):
#             optimizer.zero_grad()

#             # Assuming your model's forward method takes audio and returns predicted transcripts
#             predictions = model(audio)

#             # Ensure predictions and targets are the same shape
#             transcripts = transcripts.to(predictions.device)
#             transcripts = nn.utils.rnn.pad_sequence([transcripts[i][:len(predictions[i])] for i in range(len(transcripts))], batch_first=True)

#             loss = criterion(predictions, transcripts)
#             loss.backward()
#             optimizer.step()

#             if batch_idx % 10 == 0:
#                 print(f'Epoch: {epoch+1}, Batch: {batch_idx}, Loss: {loss.item()}')

# # Initialize your model
# model = SimpleTTSModel()

# # Train the model
# train(model, train_loader, epochs=10)
# torch.save(model.state_dict(), 'tts_model.pth')


# model = SimpleTTSModel()
# model.load_state_dict(torch.load('tts_model.pth'))
# model.eval()


In [None]:
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.optim as optim

# NOTE: SimpleTTSModel is defined in an earlier cell; run that cell before this one.

def text_to_tensor(text):
    """Encode a string as a 1-D float32 tensor holding one code point (`ord`) per character."""
    code_points = list(map(ord, text))
    return torch.tensor(code_points, dtype=torch.float32)

def prepare_batch(batch):
    """
    Collate a list of (audio, text) tensor pairs into two padded batch tensors.

    Args:
      batch: A sequence of items whose element 0 is an audio tensor and whose
        element 1 is a text tensor.

    Returns:
      A tuple (audio_batch, text_batch); each is the result of
      pad_sequence(..., batch_first=True) over the corresponding tensors.
    """
    audio_items = []
    text_items = []
    for sample in batch:
        audio_items.append(sample[0])
        text_items.append(sample[1])

    # Pad each group independently so that every row in a batch has equal length.
    padded_audio = nn.utils.rnn.pad_sequence(audio_items, batch_first=True)
    padded_text = nn.utils.rnn.pad_sequence(text_items, batch_first=True)
    return padded_audio, padded_text

class CustomDataset(torch.utils.data.Dataset):
    """
    Dataset adapter over a sequence of records with 'audio' and 'transcript' keys.

    Indexing returns an (audio, transcript) pair of tensors.
    """

    def __init__(self, dataset):
        # Keep a reference to the underlying records; nothing is copied here.
        self.dataset = dataset

    def __len__(self):
        """Return the number of records in the wrapped sequence."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return record `idx` as (float32 audio tensor, encoded transcript tensor)."""
        record = self.dataset[idx]
        waveform = torch.tensor(record['audio'], dtype=torch.float32)
        encoded_text = text_to_tensor(record['transcript'])
        return waveform, encoded_text


def train(model, train_loader, epochs):
    """
    Train `model` in place with MSE loss and the Adam optimizer (lr=0.001).

    Args:
      model: The torch module to optimize. Batches are moved to the device of
        its parameters before the forward pass.
      train_loader: An iterable yielding (audio, transcript) tensor pairs.
      epochs: Number of full passes over `train_loader`.

    Returns:
      None. The model's parameters are updated in place and the loss of every
      tenth batch is printed.
    """
    model.train()
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Use the device the model already lives on instead of relying on a
    # notebook-level `device` global, which raises NameError whenever train()
    # is called before (or without) that global being assigned.
    device = next(model.parameters()).device
    on_cuda = device.type == 'cuda'

    for epoch in range(epochs):
        for batch_idx, (audio, transcript) in enumerate(train_loader):
            optimizer.zero_grad()

            audio, transcript = audio.to(device), transcript.to(device)
            prediction = model(audio)
            # squeeze(0) only removes the leading dimension when it has size 1.
            # NOTE(review): `transcript` is not squeezed, so the two shapes may not
            # line up for MSELoss — confirm the intended prediction/target layout.
            prediction = prediction.squeeze(0)
            loss = criterion(prediction, transcript)

            loss.backward()
            optimizer.step()

            if batch_idx % 10 == 0:
                print(f'Epoch: {epoch+1}, Batch: {batch_idx}, Loss: {loss.item()}')

            # Drop references to the per-batch tensors before the next iteration.
            del audio, transcript, prediction
            if on_cuda:
                # Only meaningful when training on a GPU.
                torch.cuda.empty_cache()

# Feed one recording per step; prepare_batch pads the tensors of each batch.
batch_size = 1
train_loader = DataLoader(
    CustomDataset(dataset),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=prepare_batch,
)

# Prefer the GPU when one is available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# SimpleTTSModel comes from the earlier model-definition cell.
model = SimpleTTSModel()
model.to(device)

train(model, train_loader, epochs=10)

# Save the trained weights, load them back, and switch to evaluation mode.
torch.save(model.state_dict(), 'tts_model.pth')
model.load_state_dict(torch.load('tts_model.pth'))
model.eval()
