Start by making sure you have the following packages in your environment:

In [1]:
# ! pip install tqdm pandas pdfplumber requests

In [2]:
# CSV listing the hearings; it is loaded with pandas further down
RAW_DATA = "../data/sc_speech.csv"
# Base directory under which the audio/ and text/ folders are created
DATA_DIR = "../data"

In [3]:
import os
# Output folders for the converted recordings and the cleaned transcripts
AUDIO_DIR = os.path.join(DATA_DIR, "audio")
TEXT_DIR = os.path.join(DATA_DIR, "text")

# Create both folders (and any missing parents) without going through a shell,
# so paths containing spaces work and re-running the cell is harmless
os.makedirs(AUDIO_DIR, exist_ok=True)
os.makedirs(TEXT_DIR, exist_ok=True)

In [4]:
from IPython.display import display, HTML

# Inject a CSS rule that caps <pre> blocks inside `.output_area` at 100px and
# makes them scroll vertically, so long outputs stay compact
display(HTML('''<style>.output_area pre {max-height: 100px; overflow-y: scroll;}</style>'''))

## Cleaning transcripts

Download the transcripts and convert them into a dialogue format (a list of speaker turns).

In [5]:
import json
import os
import re
import subprocess
import time
import warnings
from io import BytesIO

import pandas as pd
import pdfplumber
import requests
from tqdm import tqdm

In [6]:
class Transcript:
    """Download a transcript PDF and turn it into a list of speaker turns.

    The PDF is fetched and parsed eagerly in the constructor; call
    :meth:`format` afterwards to obtain the dialogue.

    Args:
        url: Location of the transcript PDF.
        debug: When true, print how long each stage took.
        timeout: Seconds to wait for the server to respond; forwarded to
            ``requests.get`` so a stalled server cannot hang the run.

    Attributes:
        doc: The opened ``pdfplumber`` document.
        lines: Cleaned text lines taken from every page except the first.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an error status.
    """

    def __init__(self, url, debug=False, timeout=60):
        self.debug = debug
        self.time = time.time()
        self.doc = self.__get_pdf(url, timeout)
        self.lines = self.__parse_doc()

    def __log_time(self, key):
        """Print the time since the last checkpoint (debug only), then reset it."""
        if self.debug:
            elapsed_time = time.time() - self.time
            print(f"{key}: {elapsed_time:.2f} seconds")
        self.time = time.time()

    def __get_pdf(self, pdf_url, timeout=60):
        """Fetch the PDF into memory and open it with pdfplumber."""
        response = requests.get(pdf_url, timeout=timeout)
        response.raise_for_status()
        doc = pdfplumber.open(BytesIO(response.content))
        self.__log_time("download pdf")
        return doc

    def __parse_doc(self):
        """Collect the cleaned, non-empty text lines of every page but the first."""
        # Substring match: any line containing one of these markers is dropped
        SKIP_LIST = ["IST", "LIVE FEED", "END OF", "Transcribed by"]
        number_pattern = re.compile(r'^\d+\s?')
        lines = []

        # skip first page of every document, as irrelevant
        for page in self.doc.pages[1:]:
            # Guard against a page for which no text comes back
            text = (page.extract_text() or "").strip()
            for line in text.split('\n'):
                # clean line numbers
                clean_line = number_pattern.sub('', line)

                # skip lines that are not relevant
                if clean_line and all(key not in clean_line for key in SKIP_LIST):
                    lines.append(clean_line)

        self.__log_time("parse pdf")
        return lines

    def format(self):
        """Group the parsed lines into speaker turns.

        A line containing ``": "`` starts a new turn: the part before the
        first separator is the speaker, the rest is the text. Any other line
        is a continuation and is appended to the current turn. Continuation
        lines that appear before the first speaker are kept in a turn whose
        speaker is the empty string, so no text is lost.

        Returns:
            A list of ``{"speaker": str, "text": str}`` dictionaries in
            document order; empty when there are no lines.
        """
        SEP = ": "
        transcript = []

        for line in self.lines:
            if SEP in line:
                speaker, text = line.split(SEP, 1)
                transcript.append({"speaker": speaker, "text": text})
            elif transcript:
                transcript[-1]["text"] = f"{transcript[-1]['text']} {line}"
            else:
                # No speaker seen yet: open an anonymous turn instead of failing
                transcript.append({"speaker": "", "text": line})

        self.__log_time("format")
        return transcript


In [7]:
# Load the hearing metadata and discard the last row, which is empty
data = pd.read_csv(RAW_DATA).pipe(lambda frame: frame.drop(index=frame.tail(1).index))
# data.head(1)

In [8]:
# # TEST

# urls = data["Transcript Link"].to_list()
# transcript = Transcript(urls[14], debug=True).format()

In [9]:
# # KEYWORD SPOTTING

# pattern = re.compile(r'\b[A-Z]{2,}(?:\s+[A-Z]{2,})*\b')
# for t in transcript:
#     finds = pattern.findall(t["text"])
#     if len(finds)>0:
#         print(finds)

In [10]:
# Register tqdm's progress_apply on pandas objects
tqdm.pandas()
# Download and parse every transcript PDF; the speaker turns are stored as a JSON string per row
data["transcript"] = data["Transcript Link"].progress_apply(lambda url: json.dumps(Transcript(url, debug=False).format()))

100%|██████████| 24/24 [04:36<00:00, 11.52s/it]


Format transcripts into text strings

In [11]:
# Clean and format to optimise for segment quality

def clean_transcript(transcript):
    """Flatten a list of speaker turns into one string.

    Inside a turn, every stop (a full stop, a run of full stops, or the
    ellipsis character) becomes ``";"``. Turns are then joined with ``". "``,
    so a full stop only ever marks a change of speaker.

    Args:
        transcript: Sequence of dictionaries that each have a ``"text"`` key.

    Returns:
        The joined text; an empty string when ``transcript`` is empty.
    """
    repeated_dots = re.compile(r'\.\.+')

    def clean_text(text):
        # Trim trailing whitespace, full stops and ellipses so that the join
        # below supplies the single stop token that ends the turn
        body = text.rstrip(" \t\r\n.…")
        # Collapse dot runs and the ellipsis character into one stop token
        body = repeated_dots.sub(".", body).replace("…", ".")
        # Inner stop tokens become semi-colons
        return body.replace(".", ";")

    # add stop token between speaker changes
    return ". ".join(clean_text(turn["text"]) for turn in transcript)

In [12]:
# Flatten each transcript (stored as a JSON string) into a single cleaned text string.
# NOTE(review): the result is JSON-encoded again, so every "text" value carries surrounding
# quotes and JSON escapes — confirm that consumers of this column expect that.
data["text"] = data["transcript"].apply(lambda transcript: json.dumps(clean_transcript(json.loads(transcript))))
data.columns

Index(['Sr. No.', 'Case Name', 'Case Number', 'Hearing Date',
       'Transcript Link', 'Oral Hearing Link', 'Hearing Duration(in Minutes)',
       'mp3 format link', 'transcript', 'text'],
      dtype='object')

## Preparing data

Prepare the data for segmentation: audio files and texts are stored in a specific directory structure.

In [13]:
# Pair each hearing's audio URL with its cleaned transcript text
samples = list(zip(data["mp3 format link"], data["text"]))

In [14]:
import concurrent.futures
from tqdm import tqdm
import os

def download_and_write(sample, i):
    """Fetch one recording as a mono WAV file and write its transcript text.

    The audio is streamed from ``wget`` straight into ``ffmpeg`` and saved as
    ``{AUDIO_DIR}/{i}.wav``; the text is written unchanged to
    ``{TEXT_DIR}/{i}.txt``. Both tools are started without a shell, so the URL
    and the paths are passed as plain arguments.

    Args:
        sample: ``(url, text)`` pair.
        i: Index used as the file name for both outputs.

    Raises:
        FileNotFoundError: If ``wget`` or ``ffmpeg`` is not installed.

    Warns:
        UserWarning: If the download or the conversion exits with an error.
            The text file is still written in that case.
    """
    url, text = sample

    # Download audio
    audio_file = f'{AUDIO_DIR}/{i}.wav'
    # Remove any file from an earlier run so a failed download cannot leave a stale recording
    try:
        os.remove(audio_file)
    except FileNotFoundError:
        pass

    fetch_cmd = ["wget", "-qO-", str(url)]
    convert_cmd = ["ffmpeg", "-i", "pipe:0", "-ac", "1", audio_file, "-loglevel", "error", "-y"]
    with subprocess.Popen(fetch_cmd, stdout=subprocess.PIPE) as fetch:
        convert = subprocess.run(convert_cmd, stdin=fetch.stdout)
        # Close our end of the pipe so wget stops if ffmpeg exited early
        fetch.stdout.close()
        fetch_status = fetch.wait()
    if fetch_status != 0 or convert.returncode != 0:
        warnings.warn(
            f"audio for sample {i} could not be prepared "
            f"(wget exit {fetch_status}, ffmpeg exit {convert.returncode}): {url}"
        )

    # Write text
    text_file = f'{TEXT_DIR}/{i}.txt'
    with open(text_file, 'w', encoding='utf-8') as file:
        file.write(text)

# spread downloads across threads
def parallel_download(samples):
    """Run ``download_and_write`` for every sample on a thread pool.

    Each sample is paired with its position in ``samples``, and a progress
    bar advances as results come back in order.
    """
    total = len(samples)
    with concurrent.futures.ThreadPoolExecutor() as pool:
        results = pool.map(download_and_write, samples, range(total))
        # Draining the iterator is what drives the progress bar
        for _ in tqdm(results, total=total):
            pass


In [15]:
# Fetch and convert the audio and write the text for every sample;
# files are named after each sample's position in `samples`
parallel_download(samples)

100%|██████████| 24/24 [01:05<00:00,  2.72s/it]



--DATA_DIR

     |----audio
            |---0.wav
            |---1.wav
            
     |----text
            |---0.txt
            |---1.txt