In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
directory_path = '/kaggle/input/business-json/Task_3_aud_files/kaggle/working/Task_3_aud_files'

num_files = len([name for name in os.listdir(directory_path) if os.path.isfile(os.path.join(directory_path, name))])

print(f'Total number of files in the directory: {num_files}')


Total number of files in the directory: 1215


In [4]:
!pip install torch torchaudio transformers



In [22]:
import torchaudio
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer
from transformers import RobertaTokenizerFast
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaForTokenClassification, AdamW, get_linear_schedule_with_warmup
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import pipeline
from torch.nn.utils.rnn import pad_sequence
import warnings
warnings.filterwarnings("ignore")
import ast

In [38]:
class AudioToTextDataset(Dataset):
    def __init__(self, csv_file, audio_dir, processor, device, target_sample_rate=16000,sample_fraction =  1, k_sec = 7):
#         self.data = pd.read_csv(csv_file).sample(frac=sample_fraction, random_state=42).reset_index(drop=True)
        self.data = pd.read_csv(csv_file)
        self.audio_dir = audio_dir
        self.processor = processor
        self.device = device
        self.target_sample_rate = target_sample_rate
        self.cutoff = k_sec
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx, k_seconds=None):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        q_id = self.data.iloc[idx]['QID']
        all_sent_str = self.data.iloc[idx]['all_sentences']
        all_sent = ast.literal_eval(all_sent_str)
        audio_paths = []
        inputs = []
        k_seconds = self.cutoff
        for i in range(len(all_sent)):
            pth = f"{self.audio_dir}/Sentence_{q_id}_{i}.wav"
            audio_paths.append(pth)
            waveform1, sample_rate1 = torchaudio.load(pth)

            if sample_rate1 != self.target_sample_rate:
                resampler1 = torchaudio.transforms.Resample(orig_freq=sample_rate1, new_freq=self.target_sample_rate)
                waveform1 = resampler1(waveform1)

            if k_seconds is not None:
                num_samples = int(self.target_sample_rate * k_seconds)
                waveform1 = waveform1[:, :num_samples]

            inputs1 = self.processor(waveform1.squeeze().numpy(), sampling_rate=self.target_sample_rate, return_tensors="pt", padding=True)
            inputs1 = {key: val.to(self.device) for key, val in inputs1.items()}
            inputs.append(inputs1)

        return inputs, q_id

    def generate_asr_outputs(self, model):
        results = []
        model.eval()

        for idx in range(len(self)):
            inputs_list, q_id = self[idx]

            # Process each sentence independently
            sentence_transcriptions = []
            sent_ids = []
            for j,inputs in enumerate(inputs_list):
                with torch.no_grad():
                    logits = model(**inputs).logits
                predicted_ids = torch.argmax(logits, dim=-1)
                transcription = self.processor.batch_decode(predicted_ids)[0]
                sentence_transcriptions.append(transcription)
                sent_ids.append(j)
            # Append transcription for each input sentence
            results.append({
                'QID': q_id,
                'transcriptions': sentence_transcriptions,
                'Sentence_ids': sent_ids
            })

            if idx == 0:
                print(sentence_transcriptions)
            
        return pd.DataFrame(results)
            
        # Return the results as a DataFrame or a list
#         return results
#             results.append(
#             {
#                 'Sentence_1_transcript': transcription1,
#                 'Sentence_2_transcript': transcription2,
#                 'QID': q_id
#             }
#             )
#         return pd.DataFrame(results)


In [40]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
asr_model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h").to(device)



Using device: cuda


Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You sho

In [41]:
csv_file = "/kaggle/input/business-json/Task_3_data.csv"
audio_dir = "/kaggle/input/business-json/Task_3_aud_files/kaggle/working/Task_3_aud_files"
df = pd.read_csv(csv_file)
print(df.iloc[0]['all_sentences'])

dataset = AudioToTextDataset(csv_file, audio_dir, processor, device)

# dataset.generate_asr_outputs(asr_model)
df_asr_outputs = dataset.generate_asr_outputs(asr_model)

# df_train, df_test = train_test_split(df_asr_outputs, test_size=0.2, random_state=42)

# print(df_train.head())

['Nissan has Yoshisuke Aikawa as CEO.', 'Chevrolet is a type of Auto showroom.', 'Telstra is a type of Telecom store.', 'DATS petrol stations has origin country Belgium.', 'Halliburton has Erle P. Halliburton as CEO.', 'Public Bank Berhad has Teh Hong Piow as CEO.', 'Barclays is a type of Bank.', 'Total S.A. was established on 1924-03-28.', 'Great Canadian Dollar Store was established on 1993-01-01.', "Freddy's Frozen Custard is a type of Restaurant.", 'Franprix is a type of Retail store.', 'Mango was established on 1984-01-01.', 'Indian Oil Corporation was established on 1964-01-01.', 'Kasikornbank has Choti Lamsam as CEO.']
['NISSON HAS OSHASU ACAWAY I SEO', 'CHEVRULIT IS A TYPE OF ARTOCHORUM', 'TELSTRAY IS A TYPE OF TELECHUM STORE', "DAD'S PATROL STATIONS HAS ORIGIN COUNTRY BELGIUM", 'HALIBURTON HES ERLUP HALIBURTON ASSEO', 'PUBLIC BANK BERHAD HASTEJON PIO AS SEO', "BARCLAY'S IS A TYPE OF BANK", 'TOTAL SAY WAS ESTABLISHED ON NINETEEN TWENTY FOUR MINUSTS THREE MINUSTS TWENTY EIGHT', 

In [44]:
print(df_asr_outputs.head())

   QID                                     transcriptions  \
0    0  [NISSON HAS OSHASU ACAWAY I SEO, CHEVRULIT IS ...   
1    1  [HARDY IS A TYPE OF ARTOSHORROOM, FUGIT HAS HE...   
2    2  [NISSON HASIOSHA SUK BAKE AWAY AS SEO, IN DAY ...   
3    3  [CAM MOTORS HAS HEADQUARTERS IN SEAL, UNDAY MO...   
4    4  [NISSON HAS YOSHA SUC ACAWAY AS SEO, PORSCHI I...   

                                     Sentence_ids  
0  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]  
1                     [0, 1, 2, 3, 4, 5, 6, 7, 8]  
2      [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]  
3                        [0, 1, 2, 3, 4, 5, 6, 7]  
4              [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  


In [45]:
df_asr_outputs.to_csv("task_3_asr_out.csv")

In [32]:
entities_csv = "/kaggle/input/business-json/data (3).csv" 
entities_df = pd.read_csv(entities_csv) 
train_df = df_train.copy()
merged_df = pd.merge(train_df, entities_df, left_on="Sentence_ID", right_on="Sentence_id", how="inner")
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

def tokenize_and_align_labels(text, entities):
    tokens = tokenizer(text, truncation=True, return_offsets_mapping=True, return_tensors="pt")
    labels = ["O"] * len(tokens.input_ids[0]) 

    for entity in eval(entities):
        mention = entity["mention"]
        link = entity["link"]

        start_idx = text.find(mention)
        end_idx = start_idx + len(mention)

        for idx, (start, end) in enumerate(tokens["offset_mapping"][0]):
            if start >= start_idx and end <= end_idx:
                if start == start_idx:
                    labels[idx] = "B-" + link  
                else:
                    labels[idx] = "I-" + link  
                    
    return tokens.input_ids[0], labels


tokenized_texts_and_labels = []
for idx, row in merged_df.iterrows():
    text = row["ASR_Output"]
    entities = row["entities"]  
    input_ids, labels = tokenize_and_align_labels(text, entities)
    tokenized_texts_and_labels.append((input_ids, labels))


train_data = pd.DataFrame(tokenized_texts_and_labels, columns=["input_ids", "labels"])


print(train_data.head())

                                           input_ids  \
0  [tensor(0), tensor(2688), tensor(387), tensor(...   
1  [tensor(0), tensor(387), tensor(1889), tensor(...   
2  [tensor(0), tensor(771), tensor(2118), tensor(...   
3  [tensor(0), tensor(25912), tensor(1301), tenso...   
4  [tensor(0), tensor(495), tensor(2606), tensor(...   

                                              labels  
0  [I-Q422, I-Q422, I-Q422, I-Q422, I-Q907, O, O,...  
1  [I-Q983, I-Q983, I-Q983, I-Q983, I-Q983, I-Q98...  
2  [I-Q5, I-Q5, I-Q5, I-Q5, O, O, O, O, O, O, O, ...  
3  [I-Q400, I-Q400, I-Q400, I-Q400, O, O, O, O, O...  
4  [I-Q596, I-Q596, I-Q596, I-Q596, I-Q596, I-Q59...  


In [33]:
unique_labels = sorted({label for labels in train_data['labels'] for label in labels})
label_map = {label: idx for idx, label in enumerate(unique_labels)}

train_data['labels'] = train_data['labels'].apply(lambda x: [label_map[label] for label in x])

class NERDataset(Dataset):
    def __init__(self, df):
        self.input_ids = df['input_ids'].values
        self.labels = df['labels'].values

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return torch.tensor(self.input_ids[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

def collate_fn(batch):
    input_ids, labels = zip(*batch)
    
    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id)
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)  
    
    return input_ids_padded, labels_padded


train_df, val_df = train_test_split(train_data, test_size=0.1, random_state=42)

train_dataset = NERDataset(train_df)
val_dataset = NERDataset(val_df)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=16, collate_fn=collate_fn)

In [34]:
bert_model = RobertaForTokenClassification.from_pretrained("roberta-base", num_labels=len(unique_labels))
bert_model = bert_model.to(device)

optimizer = AdamW(bert_model.parameters(), lr=2e-5, eps=1e-8)
total_steps = len(train_dataloader) * 4 
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [35]:
epochs = 10
for epoch in range(epochs):
    bert_model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        bert_model.zero_grad()

        outputs = bert_model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(bert_model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Average Training Loss: {avg_train_loss}")

    bert_model.eval()
    val_loss = 0

    with torch.no_grad():
        for batch in val_dataloader:
            input_ids, labels = batch
            input_ids = input_ids.to(device)
            labels = labels.to(device)

            outputs = bert_model(input_ids=input_ids, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch + 1}, Average Validation Loss: {avg_val_loss}")

print("Training complete.")

Epoch 1, Average Training Loss: 3.078959424956506
Epoch 1, Average Validation Loss: 2.653327992984227
Epoch 2, Average Training Loss: 2.457566067952068
Epoch 2, Average Validation Loss: 2.331597183431898
Epoch 3, Average Training Loss: 2.2940263868379995
Epoch 3, Average Validation Loss: 2.2603525604520525
Epoch 4, Average Training Loss: 2.195709489974655
Epoch 4, Average Validation Loss: 2.2232161590031216
Epoch 5, Average Training Loss: 2.178873149286799
Epoch 5, Average Validation Loss: 2.2232161590031216
Epoch 6, Average Training Loss: 2.1746659889942457
Epoch 6, Average Validation Loss: 2.2232161590031216
Epoch 7, Average Training Loss: 2.1874842683808144
Epoch 7, Average Validation Loss: 2.2232161590031216
Epoch 8, Average Training Loss: 2.1750024497008122
Epoch 8, Average Validation Loss: 2.2232161590031216
Epoch 9, Average Training Loss: 2.183828016289142
Epoch 9, Average Validation Loss: 2.2232161590031216
Epoch 10, Average Training Loss: 2.178804987618903
Epoch 10, Average Va

In [36]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

test_dataset = NERDataset(train_data)  
test_dataloader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

bert_model.eval()

predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_dataloader:
        input_ids, labels = batch
        input_ids = input_ids.to(device)
        labels = labels.to(device)

        outputs = bert_model(input_ids=input_ids)
        logits = outputs.logits

        preds = torch.argmax(logits, dim=-1)
        
        for i in range(input_ids.size(0)):
            sentence = tokenizer.decode(input_ids[i], skip_special_tokens=True)
            
            pred_labels = preds[i].cpu().numpy()
            label_ids = labels[i].cpu().numpy()
            
            pred_labels = [unique_labels[id] for id in pred_labels if id != -100]
            true_labels = [unique_labels[id] for id in label_ids if id != -100]
            
            predictions.append({
                "sentence": sentence,
                "predicted_entities": pred_labels
            })
            true_labels.append({
                "sentence": sentence,
                "true_entities": true_labels
            })

predictions_df = pd.DataFrame(predictions)
true_labels_df = pd.DataFrame(true_labels)

print("Predictions and true labels saved.")

Predictions and true labels saved.


In [37]:
print(predictions_df)

                                               sentence  \
0                 IDBUD BANK HAS HEADQUARTERS IN MUMBAY   
1        BAN IN A REPUBLIC IS A TYPE OF APPERIL'S STORE   
2     WALGREENS WAS ESTABLISHED ON NINETEEN O ONEMIN...   
3     ATAZON WAS ESTABLISHED ON NINETEEN SEVENTY NIN...   
4     DAD'S PATROL STATIONS IS A TYPE OF PATROL STATION   
...                                                 ...   
2095       HE O BIGILO IS FROM UNITED STATES OF AMERICA   
2096     BRICKOAT HAS HEADQUARTERS IN LONGPOMP SIR ORGE   
2097  CODA AUTO PRODUCES CAR CODA FABIA CODA RAPI TW...   
2098    AT ANT HAS HEADQUARTERS AND WOULD TAKE RE TOWER   
2099  CASIO BELONGS TO THE ELECTRONIC'S INDUSTRY IND...   

                                     predicted_entities  
0     [I-Q1085, I-Q892, I-Q1085, I-Q892, O, O, O, O,...  
1     [I-Q94, I-Q94, I-Q94, I-Q94, I-Q94, I-Q94, I-Q...  
2     [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
3     [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
4

In [50]:
asr_processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")

roberta_tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")


gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

knowledge_df = predictions_df.copy()

def qa_pipeline(audio_path, question):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    audio, _ = librosa.load(audio_path, sr=16000)
    
    audio_input = asr_processor(audio, return_tensors="pt", padding=True, sampling_rate=16000).to(device)
    with torch.no_grad():
        logits = asr_model(**audio_input).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = asr_processor.decode(predicted_ids[0].cpu())
    print(f"Transcription: {transcription}")
    
    inputs = roberta_tokenizer(transcription, return_tensors="pt", truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=-1).squeeze().cpu().tolist()
    

    labels = roberta_tokenizer.decode(predictions)
    print(labels)
    entity_ids = {label.split('-')[1] for label in labels if label.startswith('I-')}

    relevant_sentences = []
    for entity_id in entity_ids:
        relevant_knowledge = knowledge_df[knowledge_df['predicted_entities'].apply(lambda x: any(e for e in x if e.endswith(entity_id)))]['sentence'].tolist()
        relevant_sentences.extend(relevant_knowledge)
    
    knowledge_text = " ".join(relevant_sentences)
    print(knowledge_text)
    input_text = f"Question: {question}\nContext: {knowledge_text}"
    inputs = gpt2_tokenizer.encode(input_text, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = gpt2_model.generate(inputs, max_length=150, num_return_sequences=1, early_stopping=True)
    answer = gpt2_tokenizer.decode(outputs[0].cpu(), skip_special_tokens=True)
    
    return answer

audio_path = "/kaggle/input/business-json/aud_files/kaggle/working/TTS/aud_files/sentence_1.wav"
question = "What is Carl's Jr.?"
answer = qa_pipeline(audio_path, question)
print(f"Answer: {answer}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Transcription: CARL'S JUNIOR IS A TYPE OF RESTORENT
 shooting light light light light light light light light light light light light light light shooting

Answer: Question: What is Carl's Jr.?
Context:  Carl's Jr. is a fictional character in the popular TV series, The Simpsons.  He is a young man who is raised by his father, Carl, and his mother, Mary.  He is a very intelligent man, and he is a very good friend of his father.  He is a very good friend of his mother, and he is a very good friend of his father.  He is a very good friend of his father's family, and he is a very good friend of his mother's family.  He is a very good friend of his father's family, and he is a very good friend of his mother's family. 
