In [1]:
!pip install soundfile pydub SpeechRecognition transformers nltk matplotlib


Collecting SpeechRecognition
  Downloading speechrecognition-3.14.2-py3-none-any.whl.metadata (30 kB)
Downloading speechrecognition-3.14.2-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m57.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.2


In [2]:
!apt-get install -y ffmpeg


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 122 not upgraded.


In [17]:
import os
import numpy as np
import soundfile as sf
import speech_recognition as sr
import pydub
from pydub import AudioSegment
import pandas as pd
import nltk
from transformers import pipeline


In [18]:
from pydub import AudioSegment
import numpy as np
import os

# Add noise function
def add_noise(audio_path, noise_level=0.02):
    # Load the clean audio
    audio = AudioSegment.from_file(audio_path)
    
    # Generate white noise: random values with the same length as the audio
    noise = np.random.normal(0, noise_level, len(audio.get_array_of_samples()))
    
    # Convert the noise into an AudioSegment
    noise_audio = AudioSegment(
        noise.tobytes(), 
        frame_rate=audio.frame_rate,
        sample_width=audio.sample_width, 
        channels=audio.channels
    )
    
    # Overlay noise onto the original audio
    noisy_audio = audio + noise_audio
    return noisy_audio

# Save the noisy audio
def save_noisy_audio(clean_audio_file, output_path):
    noisy_audio = add_noise(clean_audio_file)
    noisy_audio.export(output_path, format="wav")
    print(f"🔊 Noisy audio saved to {output_path}")

# Function to process all audio files in the directory
def process_all_audio_files(input_dir, output_dir, noise_level=0.02):
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Loop through all files in the directory
    for filename in os.listdir(input_dir):
        if filename.endswith(".mp3") or filename.endswith(".wav"):
            clean_audio_path = os.path.join(input_dir, filename)
            noisy_audio_path = os.path.join(output_dir, f"noisy_{filename.replace('.mp3', '.wav').replace('.wav', '.wav')}")
            
            # Add noise and save
            save_noisy_audio(clean_audio_path, noisy_audio_path)

# Example usage to process all files inside the "Common Voice" directory
input_dir = "/kaggle/input/16000hzwav/16000Hz/103/1240"
output_dir = "/kaggle/working/noisy_audio_files"

process_all_audio_files(input_dir, output_dir)


🔊 Noisy audio saved to /kaggle/working/noisy_audio_files/noisy_103-1240-0047_16000Hz.wav
🔊 Noisy audio saved to /kaggle/working/noisy_audio_files/noisy_103-1240-0031_16000Hz.wav
🔊 Noisy audio saved to /kaggle/working/noisy_audio_files/noisy_103-1240-0002_16000Hz.wav
🔊 Noisy audio saved to /kaggle/working/noisy_audio_files/noisy_103-1240-0055_16000Hz.wav
🔊 Noisy audio saved to /kaggle/working/noisy_audio_files/noisy_103-1240-0036_16000Hz.wav
🔊 Noisy audio saved to /kaggle/working/noisy_audio_files/noisy_103-1240-0038_16000Hz.wav
🔊 Noisy audio saved to /kaggle/working/noisy_audio_files/noisy_103-1240-0025_16000Hz.wav
🔊 Noisy audio saved to /kaggle/working/noisy_audio_files/noisy_103-1240-0024_16000Hz.wav
🔊 Noisy audio saved to /kaggle/working/noisy_audio_files/noisy_103-1240-0046_16000Hz.wav
🔊 Noisy audio saved to /kaggle/working/noisy_audio_files/noisy_103-1240-0015_16000Hz.wav
🔊 Noisy audio saved to /kaggle/working/noisy_audio_files/noisy_103-1240-0022_16000Hz.wav
🔊 Noisy audio saved t

In [23]:
import os
import spacy
import speech_recognition as sr

# Load the spaCy model
nlp = spacy.load("en_core_web_sm")

def transcribe_audio(audio_file):
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_file) as source:
        audio = recognizer.record(source)
        try:
            transcription = recognizer.recognize_google(audio)
            return transcription
        except sr.UnknownValueError:
            return ""
        except sr.RequestError:
            return ""

# spaCy processing function
def process_text_spacy(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    pos_tags = [token.pos_ for token in doc]
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return tokens, pos_tags, entities

# Main function to process only the first 3 audio files
def process_first_3_audio_files(input_dir, output_dir, noise_level=0.02):
    os.makedirs(output_dir, exist_ok=True)
    
    all_transcripts = []  # List to accumulate all transcriptions
    
    # Get only first 3 files
    all_files = [f for f in os.listdir(input_dir) if f.endswith(".mp3") or f.endswith(".wav")]
    first_3_files = all_files[:3]
    
    # Loop through first 3 files
    for filename in first_3_files:
        clean_audio_path = os.path.join(input_dir, filename)
        noisy_audio_path = os.path.join(output_dir, f"noisy_{filename.replace('.mp3', '.wav').replace('.wav', '.wav')}")
        
        
        
        # Transcribe the noisy audio
        transcript = transcribe_audio(noisy_audio_path)
        all_transcripts.append(transcript)  # Add the transcript to the list
        
        # Process text with spaCy
        tokens, pos_tags, entities = process_text_spacy(transcript)
        
        # Print results for the current file
        print(f"\nFile: {filename}")
        print(f"Tokens: {tokens}")
        print(f"POS Tags: {pos_tags}")
        print(f"Entities: {entities}")
    
    # Combine all transcriptions
    full_transcript = " ".join(all_transcripts)
    print("\nFull Transcription from the first 3 files:", full_transcript)
    return full_transcript

# Example usage
input_dir = "/kaggle/input/16000hzwav/16000Hz/103/1240"
output_dir = "/kaggle/working/noisy_audio_files"

full_transcript = process_first_3_audio_files(input_dir, output_dir)



File: 103-1240-0047_16000Hz.wav
Tokens: ['and', 'I', 'know', 'another', 'case', 'where', 'an', 'adopted', 'boy', 'used', 'to', 'suck', 'the', 'eggs', 'they', 'could', "n't", 'break', 'him', 'of', 'it', 'if', 'you', 'had', 'asked', 'my', 'advice', 'in', 'the', 'matter', 'which', 'you', 'did', "n't", 'do', 'marilla', 'I', "'d", 'have', 'said', 'for', 'mercy', 'cig', 'not', 'to', 'think', 'of', 'such', 'a', 'thing', 'that', "'s", 'what', 'I', 'want', 'you', 'Google', 'hey', 'Google', 'hey', 'Google', 'hey', 'Google', 'hey', 'Google', 'hey', 'Google', 'hey', 'Google']
POS Tags: ['CCONJ', 'PRON', 'VERB', 'DET', 'NOUN', 'SCONJ', 'DET', 'ADJ', 'NOUN', 'VERB', 'PART', 'VERB', 'DET', 'NOUN', 'PRON', 'AUX', 'PART', 'VERB', 'PRON', 'ADP', 'PRON', 'SCONJ', 'PRON', 'AUX', 'VERB', 'PRON', 'NOUN', 'ADP', 'DET', 'NOUN', 'PRON', 'PRON', 'AUX', 'PART', 'VERB', 'NOUN', 'PRON', 'AUX', 'AUX', 'VERB', 'ADP', 'NOUN', 'PROPN', 'PART', 'PART', 'VERB', 'ADP', 'DET', 'DET', 'NOUN', 'PRON', 'AUX', 'PRON', 'PRON'

In [25]:


# Remove "hey Google" from the transcript
cleaned_transcript = full_transcript.replace("hey Google", "").strip()

# Print the cleaned transcription
print("\nCleaned Full Transcription from the first 3 files:\n", cleaned_transcript)



Cleaned Full Transcription from the first 3 files:
 and I know another case where an adopted boy used to suck the eggs they couldn't break him of it if you had asked my advice in the matter which you didn't do marilla I'd have said for mercy cig not to think of such a thing that's what I want you Google       but there was a saving something about her mouth which if it had been ever so slightly developed might have been considered indicative of a sense of humour we're all pretty well said Mrs Rachel I was kind of afraid you weren't though when I saw Matthew starting off today I thought maybe he was going to the Doctor Who what what it probably was conscious that Mrs Rachel was sitting at her window keeping a sharp eye on everything that passed from Brooks and children up


In [26]:
import nltk
from transformers import pipeline

# Ensure you have nltk tokenization tools downloaded
nltk.download('punkt')

# KWIC Analysis Function
def kwic_analysis(text, keyword):
    words = nltk.word_tokenize(text)  # Tokenize the entire text
    indices = [i for i, word in enumerate(words) if word.lower() == keyword.lower()]
    
    # Create context by looking at 5 words before and after the keyword
    context = [(words[max(i-5, 0):i+6], words[i]) for i in indices]
    
    # Print the context of the keyword in the text
    for c in context:
        print(f"Context: {c[0]}, Keyword: {c[1]}")
    
    return context

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
# Question Answering Function
def answer_question(transcript, question, qa_pipeline):
    # Use the QA pipeline to answer a question based on the provided transcript
    result = qa_pipeline(question=question, context=transcript)
    print(f"Answer: {result['answer']}")
    return result['answer']

# Example keyword for KWIC analysis
kwic_context = kwic_analysis(full_transcript, "speech")

# Example QA pipeline for question answering (ensure you have a QA model loaded)
qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

# Example question for answering
answer = answer_question(full_transcript, "Where is Mrs Rachel Sitting?", qa_pipeline)

Device set to use cuda:0


Answer: at her window


In [46]:
# Check the full transcript
print(f"Full Transcript: {full_transcript[:5000]}...")  # Show the first 500 characters

# Run the QA pipeline on the full transcript
answer = answer_question(full_transcript, "What is the main topic of the transcript?", qa_pipeline)


Full Transcript: Your combined transcription from all files goes here...
Answer: Your combined transcription from all files


In [30]:
import os

def save_results(transcript, kwic_context, answer, file_name):
    results_dir = "/kaggle/working/results"
    
    # Ensure the results directory exists
    os.makedirs(results_dir, exist_ok=True)
    
    # Save the transcript to a file
    with open(f"{results_dir}/{file_name}_transcript.txt", "w") as f:
        f.write(transcript)
    
    # Save the KWIC context to a file
    with open(f"{results_dir}/{file_name}_kwic.txt", "w") as f:
        for context in kwic_context:
            f.write(f"Context: {context[0]}, Keyword: {context[1]}\n")
    
    # Save the answer to a file
    with open(f"{results_dir}/{file_name}_answer.txt", "w") as f:
        f.write(f"Answer: {answer}")

    print(f"Results saved for {file_name}.")
save_results(full_transcript, kwic_context, answer, "noisy_audio_file_name")

Results saved for noisy_audio_file_name.


In [43]:
import spacy

nlp = spacy.load("en_core_web_sm")
def process_all_audio_files(input_dir, output_dir, noise_level=0.02):
    os.makedirs(output_dir, exist_ok=True)
    
    all_transcripts = []  # List to accumulate all transcriptions
    
    # Loop through all files in the directory
    for filename in os.listdir(input_dir):
        if filename.endswith(".mp3") or filename.endswith(".wav"):
            clean_audio_path = os.path.join(input_dir, filename)
            noisy_audio_path = os.path.join(output_dir, f"noisy_{filename.replace('.mp3', '.wav').replace('.wav', '.wav')}")
            
            # Add noise and save noisy audio
            save_noisy_audio(clean_audio_path, noisy_audio_path)
            
            # Transcribe the noisy audio
            transcript = transcribe_audio(noisy_audio_path)
            all_transcripts.append(transcript)  # Add the transcript to the list
            
            # Process text with spaCy
            tokens, pos_tags, entities = process_text_spacy(transcript)
            
            # Print results for the current file
            print(f"Tokens for {filename}:", tokens)
            print(f"POS Tags for {filename}:", pos_tags)
            print(f"Entities for {filename}:", entities)
    
    # After processing all files, combine all transcriptions
    full_transcript = " ".join(all_transcripts)
    print("Full Transcription from all files:", full_transcript)

# Example usage
input_dir = "/kaggle/input/16000hzwav/16000Hz/103/1240"
output_dir = "/kaggle/working/noisy_audio_files"

process_all_audio_files(input_dir, output_dir)


🔊 Noisy audio saved to /kaggle/working/noisy_audio_files/noisy_103-1240-0047_16000Hz.wav
Transcription: they couldn't break him of it if you had asked my advice in the matter which you didn't do Marilla I'd have said for Mercy sick not to think of such a thing that's what
Tokens for 103-1240-0047_16000Hz.wav: ['they', 'could', "n't", 'break', 'him', 'of', 'it', 'if', 'you', 'had', 'asked', 'my', 'advice', 'in', 'the', 'matter', 'which', 'you', 'did', "n't", 'do', 'Marilla', 'I', "'d", 'have', 'said', 'for', 'Mercy', 'sick', 'not', 'to', 'think', 'of', 'such', 'a', 'thing', 'that', "'s", 'what']
POS Tags for 103-1240-0047_16000Hz.wav: [('they', 'PRON'), ('could', 'AUX'), ("n't", 'PART'), ('break', 'VERB'), ('him', 'PRON'), ('of', 'ADP'), ('it', 'PRON'), ('if', 'SCONJ'), ('you', 'PRON'), ('had', 'AUX'), ('asked', 'VERB'), ('my', 'PRON'), ('advice', 'NOUN'), ('in', 'ADP'), ('the', 'DET'), ('matter', 'NOUN'), ('which', 'PRON'), ('you', 'PRON'), ('did', 'AUX'), ("n't", 'PART'), ('do', 'VERB