In [1]:
import pandas as pd

# Load dataset
file_path = r"/kaggle/input/codedataset/spoc-train-train.tsv"  # Change to your dataset's path
df = pd.read_csv(file_path, sep="\t")  # Specify tab separator

# Display dataset information
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246086 entries, 0 to 246085
Data columns (total 7 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   text      181862 non-null  object
 1   code      246086 non-null  object
 2   workerid  246086 non-null  int64 
 3   probid    246086 non-null  object
 4   subid     246086 non-null  int64 
 5   line      246086 non-null  int64 
 6   indent    246086 non-null  int64 
dtypes: int64(4), object(3)
memory usage: 13.1+ MB
None


In [2]:
import nltk
from nltk.tokenize import word_tokenize
import nltk

# Download the 'punkt' tokenizer data
nltk.download('punkt')

# If 'punkt_tab' is still missing, try:
nltk.download('punkt_tab')


# Download punkt tokenizer
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /usr/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df["text"] = df["text"].astype(str)
df["text"] = df["text"].astype(str).fillna("").replace("nan", "")

df["text_tokens"] = df["text"].apply(word_tokenize)
print(df["text_tokens"])

0                                                  []
1                                 [create, string, s]
2         [create, integers, x1, ,, y1, ,, x2, ,, y2]
3                                           [read, s]
4                    [set, x1, to, s, [, 0, ], -, 96]
                             ...                     
246081                                             []
246082                                             []
246083             [print, ``, YES, '', and, newline]
246084                                             []
246085                                             []
Name: text_tokens, Length: 246086, dtype: object


In [4]:
df["code"] = df["code"].astype(str).fillna("").replace("nan", "")
df["code_tokens"] = df["code"].apply(word_tokenize)
print(df["code_tokens"])

0                             [int, main, (, ), {]
1                                   [string, s, ;]
2                [int, x1, ,, y1, ,, x2, ,, y2, ;]
3                                [cin, >, >, s, ;]
4                    [x1, =, s, [, 0, ], -, 96, ;]
                            ...                   
246081                                         [}]
246082                                         [}]
246083    [cout, <, <, ``, YES, '', <, <, endl, ;]
246084                              [return, 0, ;]
246085                                         [}]
Name: code_tokens, Length: 246086, dtype: object


In [5]:
# Print samples from index 1-4
print("Samples from index 1-4:")
for i in range(1, 4):
    print(f"Index {i}:")
    print("Tokenized Pseudocode:", df["text_tokens"].iloc[i])
    print("Tokenized C++ Code:", df["code_tokens"].iloc[i])
    print("-" * 50)


Samples from index 1-4:
Index 1:
Tokenized Pseudocode: ['create', 'string', 's']
Tokenized C++ Code: ['string', 's', ';']
--------------------------------------------------
Index 2:
Tokenized Pseudocode: ['create', 'integers', 'x1', ',', 'y1', ',', 'x2', ',', 'y2']
Tokenized C++ Code: ['int', 'x1', ',', 'y1', ',', 'x2', ',', 'y2', ';']
--------------------------------------------------
Index 3:
Tokenized Pseudocode: ['read', 's']
Tokenized C++ Code: ['cin', '>', '>', 's', ';']
--------------------------------------------------


In [6]:
# Save tokenized pseudocode and C++ code to CSV
output_file = "/kaggle/working/tokenized_spoc.csv"
df[["text_tokens", "code_tokens"]].to_csv(output_file, index=False)

print(f"Tokenized data saved to {output_file}")

Tokenized data saved to /kaggle/working/tokenized_spoc.csv


In [7]:
# Add start and end tokens to tokenized C++ code
df["code_tokens"] = df["code_tokens"].apply(lambda tokens: ["<start>"] + tokens + ["<end>"])
# Save updated tokenized data to CSV
output_file = "tokenized_spoc_with_tokens.csv"
df[["text_tokens", "code_tokens"]].to_csv(output_file, index=False)

print(f"Updated tokenized data saved to {output_file}")

Updated tokenized data saved to tokenized_spoc_with_tokens.csv


In [8]:
for i in range(len(df)):  # Iterate over all rows
    text_tokens = df["text_tokens"].iloc[i]  # Get text tokens
    code_tokens = df["code_tokens"].iloc[i]  # Get code tokens
    
    text_len = len(text_tokens)
    code_len = len(code_tokens)
    
    max_len = max(text_len, code_len)  # Find the bigger length
    
    # Pad the smaller list to match max_len
    if text_len < max_len:
        df.at[i, "text_tokens"] = text_tokens + ["<pad>"] * (max_len - text_len)
    
    if code_len < max_len:
        df.at[i, "code_tokens"] = code_tokens + ["<pad>"] * (max_len - code_len)

# Save padded tokenized data to CSV
output_file = "tokenized-padded.csv"
df[["text_tokens", "code_tokens"]].to_csv(output_file, index=False)

print(f"Padded tokenized data saved to {output_file}")




Padded tokenized data saved to tokenized-padded.csv


In [9]:
import json

# Define special tokens with fixed indices
vocab = {
    "<unk>": 0,
    "<pad>": 1,
    "<start>": 2,
    "<end>": 3
}

# Assign indices to other tokens
for column in ["text_tokens", "code_tokens"]:
    for tokens in df[column]:
        for token in tokens:
            if token not in vocab:
                vocab[token] = len(vocab)

# Save vocabulary to JSON
vocab_file = "vocabulary.json"
with open(vocab_file, "w") as f:
    json.dump(vocab, f, indent=4)

print(f"Vocabulary saved to {vocab_file}")

Vocabulary saved to vocabulary.json


In [10]:
# Load vocabulary
with open("/kaggle/working/vocabulary.json", "r") as f:
    vocab = json.load(f)

# Load tokenized data
df = pd.read_csv("/kaggle/working/tokenized-padded.csv")

# Convert string tokens to lists
df["text_tokens"] = df["text_tokens"].apply(eval)
df["code_tokens"] = df["code_tokens"].apply(eval)

# Convert tokens to sequences using vocabulary
df["text_sequences"] = df["text_tokens"].apply(lambda tokens: [vocab.get(token, vocab["<unk>"]) for token in tokens])
df["code_sequences"] = df["code_tokens"].apply(lambda tokens: [vocab.get(token, vocab["<unk>"]) for token in tokens])

# Save sequences to CSV
output_file = "tokenized_sequences.csv"
df[["text_sequences", "code_sequences"]].to_csv(output_file, index=False)

print(f"Tokenized sequences saved to {output_file}")

Tokenized sequences saved to tokenized_sequences.csv


In [11]:
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import torch
import ast
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

class DataLoad(Dataset):
    def __init__(self, file_path):
        df = pd.read_csv(file_path)
        self.inputs = [ast.literal_eval(x) for x in df['code_sequences']]
        self.outputs = [ast.literal_eval(x) for x in df['text_sequences']]

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_tensor = torch.tensor(self.inputs[idx], dtype=torch.int64)
        output_tensor = torch.tensor(self.outputs[idx], dtype=torch.int64)
        return input_tensor, output_tensor

def Add_Pad(batch):
    inputs, outputs = zip(*batch)
    inputs = pad_sequence(inputs, batch_first=True, padding_value=0)
    outputs = pad_sequence(outputs, batch_first=True, padding_value=0)
    return inputs, outputs

# Load dataset and dataloader
dataset = DataLoad('/kaggle/working/tokenized_sequences.csv')
dataloader = DataLoader(dataset, batch_size=64, shuffle=True, collate_fn=Add_Pad)

# Iterate with progress bar
data_iter = iter(dataloader)
for batch in tqdm(dataloader, desc="Loading Batches"):
    features, labels = batch  # Get a batch of data
    break  # Remove this if you want to iterate over all batches

print("Batch loaded successfully!")

Loading Batches:   0%|          | 0/3846 [00:00<?, ?it/s]

Batch loaded successfully!





In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import math

# Transformer Hyperparameters
class Config:
    vocab_size =12388  # Adjust based on vocabulary.json
    max_length = 100  # Adjust based on sequence length
    embed_dim = 256
    num_heads = 8
    num_layers =2
    feedforward_dim = 512
    dropout = 0.1
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = Config()

# Positional Encoding
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=100):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, embed_dim)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * (-math.log(10000.0) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)  # Shape: (1, max_len, embed_dim)

    def forward(self, x):
        return x + self.pe[:, :x.size(1)].to(x.device)

# Transformer Model
class PseudoCodeTransformer(nn.Module):
    def __init__(self, config):
        super(PseudoCodeTransformer, self).__init__()
        self.embedding = nn.Embedding(config.vocab_size, config.embed_dim)
        self.positional_encoding = PositionalEncoding(config.embed_dim, config.max_length)

        self.transformer = nn.Transformer(
            d_model=config.embed_dim,
            nhead=config.num_heads,
            num_encoder_layers=config.num_layers,
            num_decoder_layers=config.num_layers,
            dim_feedforward=config.feedforward_dim,
            dropout=config.dropout
        )

        self.fc_out = nn.Linear(config.embed_dim, config.vocab_size)
        self.dropout = nn.Dropout(config.dropout)

    def generate_square_subsequent_mask(self, sz):
        return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1).to(config.device)

    def forward(self, src, tgt):
        src_emb = self.embedding(src) * math.sqrt(config.embed_dim)
        tgt_emb = self.embedding(tgt) * math.sqrt(config.embed_dim)

        src_emb = self.positional_encoding(src_emb)
        tgt_emb = self.positional_encoding(tgt_emb)

        src_mask = self.generate_square_subsequent_mask(src.size(1))
        tgt_mask = self.generate_square_subsequent_mask(tgt.size(1))

        out = self.transformer(src_emb.permute(1, 0, 2), tgt_emb.permute(1, 0, 2),
                               src_mask=src_mask, tgt_mask=tgt_mask)

        out = self.fc_out(out.permute(1, 0, 2))  # Convert back to batch-first
        return out

# Initialize Model
model = PseudoCodeTransformer(config).to(config.device)
print("Model initialized successfully!")



Model initialized successfully!


In [13]:
def translate(model, code_tokens, vocab, device, max_length=50):
    model.eval()

    # Convert code tokens to numerical indices
    input_ids = [vocab.get(token, vocab["<unk>"]) for token in code_tokens]
    input_tensor = torch.tensor(input_ids, dtype=torch.long, device=device).unsqueeze(0)  # Add batch dimension

    # Start token for generation
    output_ids = [vocab["<start>"]]

    for _ in range(max_length):
        output_tensor = torch.tensor(output_ids, dtype=torch.long, device=device).unsqueeze(0)

        # Get model predictions
        with torch.no_grad():
            predictions = model(input_tensor, output_tensor)

        # Select the most probable token
        next_token_id = predictions[:, -1, :].argmax(dim=-1).item()
        output_ids.append(next_token_id)

        # Stop if end token is generated
        if next_token_id == vocab["<end>"]:
            break

    # Convert token indices back to words
    id_to_token = {idx: token for token, idx in vocab.items()}
    translated_pseudocode = " ".join(id_to_token.get(idx, "<unk>") for idx in output_ids[1:])  # Exclude <start> token

    return translated_pseudocode


In [14]:
import json

# Load vocabulary
with open("vocabulary.json", "r") as f:
    vocab = json.load(f)

# Ensure vocab is a dictionary
print(f"✅ Vocabulary loaded with {len(vocab)} tokens")

✅ Vocabulary loaded with 12388 tokens


In [15]:
from torch.utils.data import DataLoader
import torch.nn.functional as F
from tqdm import tqdm
import os

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🔹 Using device: {device}")

# Move model to device
model.to(device)

# Loss Function & Optimizer
criterion = nn.CrossEntropyLoss(ignore_index=1)  # Ignore padding token
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-3)

# Create directory to save models
os.makedirs("check", exist_ok=True)

# Training Loop
num_epochs = 8
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0

    progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{num_epochs}")
    for batch in progress_bar:
        src, tgt = batch
        src, tgt = src.to(device), tgt.to(device)  # Move batch to GPU

        tgt_input = tgt[:, :-1]  # Remove <end> token
        tgt_output = tgt[:, 1:]  # Shifted version

        optimizer.zero_grad()
        output = model(src, tgt_input)

        loss = criterion(output.view(-1, config.vocab_size), tgt_output.contiguous().view(-1))
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    avg_loss = epoch_loss / len(dataloader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Save Model Checkpoint
    torch.save(model.state_dict(), f"check/transformer_epoch_{epoch+1}.pth")
    print(f"✅ Model saved: check/transformer_epoch_{epoch+1}.pth")

    # Print Example Prediction
    model.eval()
    example_pseudocode = ["create", "integer", "x"]
    translated_code = translate(model, example_pseudocode, vocab, device)
    print(f"🔹 Example Prediction (Pseudocode → C++): {translated_code}\n")

🔹 Using device: cuda


Epoch 1/8: 100%|██████████| 3846/3846 [02:37<00:00, 24.41it/s, loss=0.908] 


Epoch [1/8], Loss: 0.3588
✅ Model saved: check/transformer_epoch_1.pth
🔹 Example Prediction (Pseudocode → C++): x / x = x = x / x = x / x = x / x / x = x / x / x = x / x = x = x / x / x / x = x / x = x / x = x =



Epoch 2/8: 100%|██████████| 3846/3846 [02:47<00:00, 22.96it/s, loss=0.158] 


Epoch [2/8], Loss: 0.1823
✅ Model saved: check/transformer_epoch_2.pth
🔹 Example Prediction (Pseudocode → C++): x is odd x , power / x is not x power power power power power function power function power function power function power function power function power function power function power function power function power function power function power function power function power function power function power function power



Epoch 3/8: 100%|██████████| 3846/3846 [02:47<00:00, 22.96it/s, loss=0.25]  


Epoch [3/8], Loss: 0.1471
✅ Model saved: check/transformer_epoch_3.pth
🔹 Example Prediction (Pseudocode → C++): of x print x print power and x and x / x , set power to x <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>



Epoch 4/8: 100%|██████████| 3846/3846 [02:47<00:00, 22.98it/s, loss=0.183] 


Epoch [4/8], Loss: 0.1294
✅ Model saved: check/transformer_epoch_4.pth
🔹 Example Prediction (Pseudocode → C++): of x , set power to x / y to bitwise , set power if x , set power , set power to x to bitwise , set power , set power , set power , set power to x to x to x to x ^ x ^ x



Epoch 5/8: 100%|██████████| 3846/3846 [02:47<00:00, 22.99it/s, loss=0.267] 


Epoch [5/8], Loss: 0.1185
✅ Model saved: check/transformer_epoch_5.pth
🔹 Example Prediction (Pseudocode → C++): set of x bitwise , set myset to num2 bitwise power - fin of x bitwise , set of x bitwise , set of x bitwise , set of x bitwise , set of x bitwise , set of x bitwise , set of x bitwise , set of ones



Epoch 6/8: 100%|██████████| 3846/3846 [02:47<00:00, 22.95it/s, loss=0.176] 


Epoch [6/8], Loss: 0.1098
✅ Model saved: check/transformer_epoch_6.pth
🔹 Example Prediction (Pseudocode → C++): set to itself x bitshift x , set cntone to the bitwise , set the bitwise , set cntone to the bitwise , set the bitwise , set the bitwise , set the bitwise , set the bitwise , set cntone to the bitwise , set the bitwise , set



Epoch 7/8: 100%|██████████| 3846/3846 [02:47<00:00, 22.99it/s, loss=0.61]  


Epoch [7/8], Loss: 0.1042
✅ Model saved: check/transformer_epoch_7.pth
🔹 Example Prediction (Pseudocode → C++): , set cout as output of x , set cout as output cntone , set output as output as output of x , set output as output as output as cntone , set output as output as output as cntone , set output as output as of x , set



Epoch 8/8: 100%|██████████| 3846/3846 [02:47<00:00, 22.93it/s, loss=0.179] 


Epoch [8/8], Loss: 0.0992
✅ Model saved: check/transformer_epoch_8.pth
🔹 Example Prediction (Pseudocode → C++): , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone , cntone



🔹 Using device: cuda


  model.load_state_dict(torch.load("/kaggle/working/check/transformer_epoch_1.pth", map_location=device))


C++ Code:  for ( int i = 0 ; i < n ; i ++ ) { cout << arr [ i ] ; }
Pseudocode: [ i ] = 0 to n exclusive , increment arr [ i ] [ i ] on arr [ i ] [ i ] [ i ] [ i ] on each loop iteration [ i ] on each loop iteration [ i ] on each loop iteration [

