In [1]:
import re
import torch
from torch import nn
import spacy
import pandas as pd
from datasets import load_dataset
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
ds = load_dataset("Pinkstack/thinking-multilingual-30-23-small-690")


In [3]:
ds


DatasetDict({
    train: Dataset({
        features: ['language', 'instruction', 'output', 'conversations'],
        num_rows: 690
    })
})

In [4]:
ds = ds["train"]


In [5]:
dsData = pd.DataFrame(ds)


In [6]:
dsData


Unnamed: 0,language,instruction,output,conversations
0,en,Evaluate the following integral using the Cauc...,"<think>\nAlright, I need to evaluate the integ...","[{'from': 'human', 'value': 'Evaluate the foll..."
1,ar,تقييم التكامل التالي باستخدام صيغة Cauchy Inte...,<think>حسنًا ، أحتاج إلى تقييم التكامل\n\n\ [\...,"[{'from': 'human', 'value': 'تقييم التكامل الت..."
2,zh-cn,使用Cauchy积分公式评估以下积分：\n\n$ \ int_ {| z | = 1} \ ...,<think>好吧，我需要评估积分\n\n\ [\n\ int_ {| z | = 1} \...,"[{'from': 'human', 'value': '使用Cauchy积分公式评估以下积..."
3,cs,Vyhodnoťte následující integrál pomocí integrá...,"<think>Dobře, musím zhodnotit integrál\n\n\ [\...","[{'from': 'human', 'value': 'Vyhodnoťte násled..."
4,da,Evaluer følgende integral ved hjælp af Cauchy ...,"<think>Okay, jeg er nødt til at evaluere integ...","[{'from': 'human', 'value': 'Evaluer følgende ..."
...,...,...,...,...
685,es,Suponga que f (x) = (4x - 5)^2 y g (x) = cos (...,"<think>Muy bien, necesito encontrar la derivad...","[{'from': 'human', 'value': 'Suponga que f (x)..."
686,sv,Anta att f (x) = (4x - 5)^2 och g (x) = cos (7...,"<think>Okej, jag måste hitta derivatet av komp...","[{'from': 'human', 'value': 'Anta att f (x) = ..."
687,th,สมมติว่า f (x) = (4x - 5)^2 และ g (x) = cos (7...,<think>เอาล่ะฉันต้องค้นหาอนุพันธ์ของฟังก์ชั่นค...,"[{'from': 'human', 'value': 'สมมติว่า f (x) = ..."
688,tr,Diyelim ki f (x) = (4x - 5)^2 ve g (x) = cos (...,"<think>Pekala, f (g (x)) kompozit fonksiyonunu...","[{'from': 'human', 'value': 'Diyelim ki f (x) ..."


In [7]:
dsData = dsData[["language","instruction","output"]]


In [None]:
dsData = dsData[(dsData["language"] == "ar") | (dsData["language"] == "en")]
dsData = dsData.reset_index()


In [9]:
dsData


Unnamed: 0,index,language,instruction,output
0,0,en,Evaluate the following integral using the Cauc...,"<think>\nAlright, I need to evaluate the integ..."
1,1,ar,تقييم التكامل التالي باستخدام صيغة Cauchy Inte...,<think>حسنًا ، أحتاج إلى تقييم التكامل\n\n\ [\...
2,23,en,Ryan works in an office that has an even numbe...,"<think>\nAlright, let's tackle this problem st..."
3,24,ar,يعمل ريان في مكتب لديه عدد متساو من الرجال وال...,<think>حسنًا ، دعنا نتعامل مع هذه المشكلة خطوة...
4,46,en,If a card is drawn from a well shuffled pack o...,"<think>\nAlright, let's tackle this probabilit..."
5,47,ar,إذا تم سحب بطاقة من حزمة من البطاقات بشكل جيد ...,<think>حسنًا ، دعنا نتعامل مع مشكلة الاحتمال خ...
6,69,en,"In trapezoid $ABCD$, leg $\overline{BC}$ is pe...","<think>\nAlright, let's tackle this geometry p..."
7,70,ar,في منحرف $ ABCD $ ، الساق $ \ overline {bc} $ ...,<think>حسنًا ، دعنا نتعامل مع مشكلة الهندسة هذ...
8,92,en,Cars emerging from a motorway arrive at a junc...,"<think>\nAlright, let's tackle this problem st..."
9,93,ar,تصل السيارات الناشئة من طريق سريع إلى تقاطع يق...,<think>حسنًا ، دعنا نتناول هذه المشكلة خطوة بخ...


In [10]:
nlp = spacy.load("en_core_web_sm")

def tokenizeText(text):
    doc = nlp(text)
    tokens = []
    for token in doc:
        if token.is_punct or token.is_space:
            continue
        tokens.append(token.text)
    return tokens


In [11]:

def yieldTokens(data, tokenizer):
    for text in data:
        yield tokenizer(text)


def buildVocab(data, tokenizer):
    vocab = build_vocab_from_iterator(
        yieldTokens(data, tokenizer),
        specials=['<unk>', '<pad>', '<sos>', '<eos>'],
        min_freq=2
    )
    vocab.set_default_index(vocab['<unk>'])
    return vocab



In [12]:
inputs = dsData["instruction"]
outputs = dsData["output"]


In [13]:
vocabInput = buildVocab(inputs, tokenizeText)
vocabOutput = buildVocab(outputs,tokenizeText)


In [14]:
def dataProcess(inputs, outputs, seqLen=None):
    data = []
    for rawInput, rawOutput in zip(inputs, outputs):
        tokensText = [vocabInput['<sos>']] + [vocabInput[token] for token in tokenizeText(rawInput)] + [vocabInput['<eos>']]
        tokensCode = [vocabOutput['<sos>']] + [vocabOutput[token] for token in tokenizeText(rawOutput)] + [vocabOutput['<eos>']]
        
        if seqLen is not None:
            if len(tokensText) > seqLen:
                tokensText = tokensText[:seqLen]
            else:
                tokensText += [vocabInput['<pad>']] * (seqLen - len(tokensText))
            
            if len(tokensCode) > seqLen:
                tokensCode = tokensCode[:seqLen]
            else:
                tokensCode += [vocabOutput['<pad>']] * (seqLen - len(tokensCode))
        
        data.append((torch.tensor(tokensText, dtype=torch.long),torch.tensor(tokensCode, dtype=torch.long)))
    return data


In [15]:
trainData = dataProcess(inputs,outputs,500)

device = torch.device('cuda' if torch.cuda.is_available() else  'cpu')
batchSize = 32
trainLoader = DataLoader(
    trainData, 
    batch_size=batchSize,
    shuffle=True
    )


In [24]:
class TransformerDecoderModel(nn.Module):
    def __init__(self, inputVocabSize, outputVocabSize, sequenceLen, embedSize, hiddenSize, numLayers, numHeads, dropout=0.1):
        super(TransformerDecoderModel, self).__init__()

        self.srcEmbedding = nn.Embedding(inputVocabSize, embedSize)
        
        self.tgtEmbedding = nn.Embedding(outputVocabSize, embedSize)
        self.positionalEncoding = nn.Parameter(torch.zeros(1, sequenceLen, embedSize))
        decoder_layer = nn.TransformerDecoderLayer(
            d_model=embedSize, nhead=numHeads, dim_feedforward=hiddenSize, dropout=dropout
        )
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=numLayers)
        self.fc_out = nn.Linear(embedSize, outputVocabSize)
        
    def forward(self, tgt, src, tgtMask=None):
        
        srcEmb = self.srcEmbedding(src) + self.positionalEncoding[:, :src.size(1), :]
        srcEmb = srcEmb.permute(1, 0, 2)  # (seq_len, batch, embed_size)
        
        tgtEmb = self.tgtEmbedding(tgt) + self.positionalEncoding[:, :tgt.size(1), :]
        tgtEmb = tgtEmb.permute(1, 0, 2)  # (seq_len, batch, embed_size)
        
        output = self.decoder(tgtEmb, srcEmb, tgt_mask=tgtMask)
        output = output.permute(1, 0, 2)  # (batch, seq_len, embed_size)
        return self.fc_out(output)

inputVocabSize = len(vocabInput)
outputVocabSize = len(vocabOutput)
embedSize = 512
hiddenSize = 1024
numLayers = 6
numHeads = 8
dropout = 0.4



In [25]:
decoderModel = TransformerDecoderModel(inputVocabSize,outputVocabSize,700, embedSize, hiddenSize, numLayers, numHeads, dropout)
print(decoderModel)


TransformerDecoderModel(
  (srcEmbedding): Embedding(404, 512)
  (tgtEmbedding): Embedding(2251, 512)
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (multihead_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=1024, bias=True)
        (dropout): Dropout(p=0.4, inplace=False)
        (linear2): Linear(in_features=1024, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm3): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.4, inplace=False)
        (dropout2): Dropout(p=0.

In [26]:
totalParams = sum(p.numel() for p in decoderModel.parameters())
totalParams


21799115

In [19]:
criterion = nn.CrossEntropyLoss(ignore_index=vocabOutput['<pad>'])
optimizer = torch.optim.Adam(decoderModel.parameters(), lr=0.0001)


patience = 10
bestValidLoss = float('inf')
patienceCounter = 0
bestModelWeights = None

decoderModel = decoderModel.to(device)

numEpochs = 300
for epoch in range(numEpochs):
    decoderModel.train()
    totalLoss = 0
    
    for batch_idx, (srcSeq, tgtSeq) in enumerate(trainLoader):
        srcSeq = srcSeq.to(device)
        tgtSeq = tgtSeq.to(device)
        
        tgtInput = tgtSeq[:, :-1]
        tgtOutput = tgtSeq[:, 1:]

        tgtMask = nn.Transformer.generate_square_subsequent_mask(tgtInput.size(1)).to(device)
        
        outputs = decoderModel(tgt=tgtInput, src=srcSeq, tgtMask=tgtMask)
        
        loss = criterion(outputs.view(-1, outputs.size(-1)), tgtOutput.contiguous().view(-1))
        
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(decoderModel.parameters(), max_norm=1)
        optimizer.step()
        
        totalLoss += loss.item()
        
    
    avgTrainLoss = totalLoss / len(trainLoader)
    if epoch + 1 % 50 == 0:
        print(f'Epoch [{epoch+1}/{numEpochs}] Train Loss: {avgTrainLoss:.4f}')

    if avgTrainLoss < bestValidLoss:
        bestValidLoss = avgTrainLoss
        patienceCounter = 0
        bestModelWeights = decoderModel.state_dict().copy()
        torch.save(decoderModel.state_dict(),"model.pth")
        print(f"Validation loss improved Saving best model")
    else:
        patienceCounter += 1
        print(f"Validation loss didn't improve. Patience: {patienceCounter}/{patience}")
        
    if patienceCounter >= patience:
        print(f"Early stopping  after {epoch+1} epochs")
        break


Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation loss improved Saving best model
Validation 

In [45]:
decoderModel.load_state_dict(torch.load("model.pth"))
decoderModel.to(device).eval()

def generate_response(input_text, max_len=100):

    src_tokens = [vocabInput['<sos>']] + [vocabInput[token] for token in tokenizeText(input_text)] + [vocabInput['<eos>']]

    src = torch.tensor(src_tokens, dtype=torch.long).unsqueeze(0).to(device)

    tgt = torch.tensor([vocabOutput['<sos>']], dtype=torch.long).unsqueeze(0).to(device)

    with torch.no_grad():

        for _ in range(max_len):

            tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(device)

            output = decoderModel(tgt=tgt, src=src, tgtMask=tgt_mask)

            next_token_logits = output[:, -1, :]

            next_token = next_token_logits.argmax(dim=-1).unsqueeze(0)

            tgt = torch.cat([tgt, next_token], dim=1)

            if next_token.item() == vocabOutput['<eos>']:
                break



    generated_ids = tgt.squeeze().tolist()
    output_tokens = [vocabOutput.get_itos()[token] for token in generated_ids]
    
    if output_tokens[0] == '<sos>':
        output_tokens = output_tokens[1:]
    if output_tokens[-1] == '<eos>':
        output_tokens = output_tokens[:-1]
    

    response = " ".join(output_tokens)
    return response



In [46]:
input_text = "السلام عليكم"
response = generate_response(input_text)
print("Generated response:", response)


Generated response: < think > حسنًا أحتاج إلى العثور على الحد الأدنى الهندسة فهم العملية الاحتمال الأدنى الهندسة فهم معينة أولاً أحتاج فهم معينة أولاً أحتاج فهم كسور فهم كسور أولاً أحتاج فهم كسور أولاً أحتاج فهم كسور أولاً أحتاج فهم كسور أولاً أحتاج فهم كسور أولاً أحتاج فهم كسور أولاً أحتاج فهم كسور أولاً أحتاج فهم كسور أولاً تذكر الجزء الأصلي نفهم أولاً أحتاج فهم كسور أولاً أحتاج أكثر وضوحًا chess مشروع تم اختياره أولاً أحتاج فهم كسور أولاً أحتاج أكثر وضوحًا chess مشروع تم اختياره أولاً أحتاج أكثر وضوحًا chess مشروع تم اختياره أولاً أحتاج أكثر وضوحًا chess مشروع تم
