In [1]:
import numpy as np 
import pandas as pd 
import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/squad-translated/translated_squad2.0_ru.csv
/kaggle/input/squad-translated/translated_squad2.0_hi.csv
/kaggle/input/squad-translated/translated_squad2.0_ja.csv


In [2]:
!pip install gtts
!pip install pydub
!pip install transformers torch
!pip install torchaudio

Collecting gtts
  Downloading gTTS-2.5.2-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.2-py3-none-any.whl (29 kB)
Installing collected packages: gtts
Successfully installed gtts-2.5.2


In [30]:
from gtts import gTTS
import io
import numpy as np
from scipy.io import wavfile
from pydub import AudioSegment
from transformers import BertTokenizer, BertForQuestionAnswering, BertConfig
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
import torch
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import pandas as pd
import ast
import warnings
import logging
import torch.nn as nn

logging.getLogger("transformers.tokenization_utils_base").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [20]:

def tokenize_input(context, question, answer=None, max_length=100):
    inputs = tokenizer.encode_plus(
        question,
        context,
        add_special_tokens=True,  
        return_tensors="pt",       
        padding='max_length',      
        max_length=max_length,    
        truncation=True,
        return_overflowing_tokens=False
    )
    
    if answer:
        answer_dict = ast.literal_eval(answer)
        
        if answer_dict['text'] and answer_dict['answer_start']:
            answer_text = answer_dict['text'][0]  
            answer_start = answer_dict['answer_start'][0]  
            
            context_tokens_before_answer = tokenizer.encode(context[:answer_start], add_special_tokens=False)
            answer_tokens = tokenizer.encode(answer_text, add_special_tokens=False)
            
            start_position = len(context_tokens_before_answer) + 1 
            end_position = start_position + len(answer_tokens) - 1
        else:
            start_position = 0
            end_position = 0
        
        inputs.update({'start_positions': torch.tensor([start_position]),
                       'end_positions': torch.tensor([end_position])})
    
    return inputs

In [21]:
class QADataset(Dataset):
    def __init__(self, contexts, questions, answers):
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
    
    def __len__(self):
        return len(self.questions)
    
    def __getitem__(self, idx):
        context = self.contexts[idx]
        question = self.questions[idx]
        answer = self.answers[idx]
        inputs = tokenize_input(context, question, answer)
        return inputs

In [22]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForQuestionAnswering.from_pretrained('bert-base-multilingual-cased').to(device)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
df_hi = pd.read_csv('/kaggle/input/squad-translated/translated_squad2.0_hi.csv')
df_ru = pd.read_csv('/kaggle/input/squad-translated/translated_squad2.0_ru.csv')
df_ja =  pd.read_csv('/kaggle/input/squad-translated/translated_squad2.0_ja.csv')

contexts =  df_hi['context']
questions = df_hi['question']
answers = df_hi['answers']

In [24]:
train_dataset = QADataset(contexts, questions, answers)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)



optimizer = AdamW(model.parameters(), lr=0.0001)

model.train()
for epoch in range(3): 
    for i, batch in enumerate(train_dataloader):
        
        batch['input_ids'] = batch['input_ids'].squeeze(1)
        batch['attention_mask'] = batch['attention_mask'].squeeze(1)
        batch['token_type_ids'] = batch['token_type_ids'].squeeze(1)
        batch['start_positions'] = batch['start_positions'].squeeze(1)
        batch['end_positions'] = batch['end_positions'].squeeze(1)
        batch.to(device)
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        
        print(f'for batch {i+1}/{len(train_dataloader)}, epoch {epoch + 1}: loss = {loss.item()}')


for batch 1/326, epoch 1: loss = 4.461539268493652
for batch 2/326, epoch 1: loss = 4.41354513168335
for batch 3/326, epoch 1: loss = 4.601458549499512
for batch 4/326, epoch 1: loss = 4.66028356552124
for batch 5/326, epoch 1: loss = 4.483882904052734
for batch 6/326, epoch 1: loss = 4.41224479675293
for batch 7/326, epoch 1: loss = 4.699667453765869
for batch 8/326, epoch 1: loss = 4.517522811889648
for batch 9/326, epoch 1: loss = 4.50960111618042
for batch 10/326, epoch 1: loss = 4.7938127517700195
for batch 11/326, epoch 1: loss = 4.669885635375977
for batch 12/326, epoch 1: loss = 4.535154342651367
for batch 13/326, epoch 1: loss = 4.446372985839844
for batch 14/326, epoch 1: loss = 4.585618019104004
for batch 15/326, epoch 1: loss = 4.556934356689453
for batch 16/326, epoch 1: loss = 4.641994476318359
for batch 17/326, epoch 1: loss = 4.504547595977783
for batch 18/326, epoch 1: loss = 4.452451229095459
for batch 19/326, epoch 1: loss = 4.444398880004883
for batch 20/326, epoch 

In [None]:
# def text_to_audio_array(text, lang='en'):
#     audio_stream = io.BytesIO()
#     tts = gTTS(text=text, lang=lang)
#     tts.write_to_fp(audio_stream)
    
#     audio_stream.seek(0)
    
#     audio_segment = AudioSegment.from_file(audio_stream, format="mp3")
    
#     wav_stream = io.BytesIO()
#     audio_segment.export(wav_stream, format="wav")
#     wav_stream.seek(0)

#     sample_rate, audio_data = wavfile.read(wav_stream)
    
#     return sample_rate, audio_data

# text = "Hello, how are you?"
# sample_rate, audio_data = text_to_audio_array(text, lang='en')

# audio_tensor = torch.tensor(audio_data, dtype=torch.float32)
# if sample_rate != 16000:
#     resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
#     audio_tensor = resampler(audio_tensor)


# audio_tensor = audio_tensor.unsqueeze(0)

# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
# model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# inputs = processor(audio_tensor.squeeze(), sampling_rate=16000, return_tensors="pt")

# with torch.no_grad():
#     features = model(**inputs).last_hidden_state

# print(f"Features Shape: {features.shape}")


In [34]:
class QADataset(Dataset):
    def __init__(self, contexts, questions, answers, wav2vec_processor, wav2vec_model, tokenizer, lang='en'):
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
        self.processor = wav2vec_processor
        self.model = wav2vec_model
        self.tokenizer = tokenizer
        self.lang = lang

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        context = self.contexts[idx]
        question = self.questions[idx]
        answer = self.answers[idx]

        # Tokenize and pad input
        inputs = self.tokenizer(context, question, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
        
        # Generate context audio features
        context_audio_features = self.text_to_audio_features(context)
        
        # Create attention mask
        attention_mask = inputs['attention_mask'].squeeze()  # Squeeze to remove extra dimensions

        return {
            'input_ids': inputs['input_ids'].squeeze(), 
            'attention_mask': attention_mask, 
            'context_audio_features': context_audio_features
        }
    def text_to_audio_array(self, text):
        audio_stream = io.BytesIO()
        tts = gTTS(text=text, lang=self.lang)
        tts.write_to_fp(audio_stream)
        
        audio_stream.seek(0)
        
        audio_segment = AudioSegment.from_file(audio_stream, format="mp3")
        
        wav_stream = io.BytesIO()
        audio_segment.export(wav_stream, format="wav")
        wav_stream.seek(0)

        sample_rate, audio_data = wavfile.read(wav_stream)
        
        return sample_rate, audio_data
    
    def text_to_audio_features(self, text):
        sample_rate, audio_data = self.text_to_audio_array(text)
        
        audio_tensor = torch.tensor(audio_data, dtype=torch.float32)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            audio_tensor = resampler(audio_tensor)
        
        audio_tensor = audio_tensor.unsqueeze(0)
        
        inputs = self.processor(audio_tensor.squeeze(), sampling_rate=16000, return_tensors="pt")

        with torch.no_grad():
            inputs.to(device)
            features = self.model(**inputs).last_hidden_state
        
        return features

In [47]:
import torch
import torchaudio
import numpy as np

class QADataset(Dataset):
    def __init__(self, contexts, questions, answers, wav2vec_processor, wav2vec_model, tokenizer, lang='en', max_audio_length=16000):
        self.contexts = contexts
        self.questions = questions
        self.answers = answers
        self.processor = wav2vec_processor
        self.model = wav2vec_model
        self.tokenizer = tokenizer
        self.lang = lang
        self.max_audio_length = max_audio_length

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        context = self.contexts[idx]
        question = self.questions[idx]
        answer = self.answers[idx]

        inputs = self.tokenizer(context, question, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
        
        context_audio_features = self.text_to_audio_features(context)
        
        context_audio_features = self.pad_or_truncate_audio_features(context_audio_features)

        attention_mask = inputs['attention_mask'].squeeze()  

        return {
            'inputs': inputs, 
            'attention_mask': attention_mask, 
            'context_audio_features': context_audio_features
        }
    
    def text_to_audio_array(self, text):
        audio_stream = io.BytesIO()
        tts = gTTS(text=text, lang=self.lang)
        tts.write_to_fp(audio_stream)
        
        audio_stream.seek(0)
        
        audio_segment = AudioSegment.from_file(audio_stream, format="mp3")
        
        wav_stream = io.BytesIO()
        audio_segment.export(wav_stream, format="wav")
        wav_stream.seek(0)

        sample_rate, audio_data = wavfile.read(wav_stream)
        
        return sample_rate, audio_data
    
    def text_to_audio_features(self, text):
        sample_rate, audio_data = self.text_to_audio_array(text)
        
        audio_tensor = torch.tensor(audio_data, dtype=torch.float32)
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            audio_tensor = resampler(audio_tensor)
        
        audio_tensor = audio_tensor.unsqueeze(0)
        
        inputs = self.processor(audio_tensor.squeeze(), sampling_rate=16000, return_tensors="pt")

        with torch.no_grad():
            inputs = inputs.to(device)
            features = self.model(**inputs).last_hidden_state
        
        return features

    def pad_or_truncate_audio_features(self, features):
        num_frames = features.size(1)
        if num_frames > self.max_audio_length:
            features = features[:, :self.max_audio_length, :]
        elif num_frames < self.max_audio_length:
            pad_length = self.max_audio_length - num_frames
            pad = torch.zeros(features.size(0), pad_length, features.size(2), device=features.device)
            features = torch.cat((features, pad), dim=1)
        return features


In [48]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model1 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h").to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

dataset = QADataset(contexts, questions, answers, processor, model,tokenizer)



Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# # print(f"Text Inputs: {sample['inputs']}")
# %time
# import time
# s = time.time()
# sample = dataset[0]
# print(f"Context Audio Features Shape: {sample['context_audio_features'].shape}")
# e = time.time()

# print(e - s)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.68 µs
Context Audio Features Shape: torch.Size([1, 16000, 768])
4.345856666564941


In [49]:

class MultimodalTransformer(nn.Module):
    def __init__(self, text_model_name='bert-base-multilingual-cased', audio_feature_dim=768, hidden_dim=512):
        super(MultimodalTransformer, self).__init__()
        self.text_model = transformers.AutoModel.from_pretrained(text_model_name)
        self.audio_linear = nn.Linear(audio_feature_dim, hidden_dim)
        self.cross_attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=8)
        self.fc = nn.Linear(hidden_dim, 1) 

    def forward(self, input_ids, attention_mask, audio_features):
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.last_hidden_state

        audio_features = self.audio_linear(audio_features)
        audio_features = audio_features.unsqueeze(0) 

        text_features = text_features.permute(1, 0, 2)  
        attn_output, _ = self.cross_attention(text_features, audio_features, audio_features)
        output = self.fc(attn_output.mean(dim=0))  

        return output


In [53]:
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
import transformers

batch_size = 4
num_epochs = 1
learning_rate = 1e-5

train_dataset = QADataset(
    contexts=contexts, 
    questions=questions, 
    answers=answers, 
    wav2vec_processor=processor, 
    wav2vec_model=model1,
    tokenizer = tokenizer
)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultimodalTransformer().to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()  

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        inputs = batch['inputs']
        context_audio_features = batch['context_audio_features']
        inputs['input_ids'] = inputs['input_ids'].squeeze(1)
        inputs['attention_mask'] = inputs['attention_mask'].squeeze(1)
        inputs['token_type_ids'] = inputs['token_type_ids'].squeeze(1)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        audio_features = context_audio_features.to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, audio_features)
    
        targets = torch.zeros_like(outputs)  
        loss = criterion(outputs, targets)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        print(loss.item())
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")


print("Training complete!")


AssertionError: For batched (3-D) `query`, expected `key` and `value` to be 3-D but found 5-D and 5-D tensors respectively