In [36]:
import pandas as pd
import os

# Define file path
uploaded_tsv_path = "/kaggle/input/spoc-data/spoc-train.tsv"

# Check if file exists
if os.path.exists(uploaded_tsv_path):
    print("TSV file found:", uploaded_tsv_path)
else:
    raise FileNotFoundError("TSV file not found!")

# Load TSV file
df = pd.read_csv(uploaded_tsv_path, sep="\t")

# Rename columns
df.rename(columns={"text": "pseudocode", "code": "code"}, inplace=True)

# Define CSV file path
csv_path = "/kaggle/working/spoc-train.csv"

# Save selected columns to CSV
df[["pseudocode", "code"]].to_csv(csv_path, index=False)

# Load CSV file for further operations
df_csv = pd.read_csv(csv_path)

# Print first five rows of the CSV-loaded DataFrame
print(df_csv.head(5))

TSV file found: /kaggle/input/spoc-data/spoc-train.tsv
                                        pseudocode  \
0                in the function gcd(a,b=integers)   
1  if b=1 return a, else call function gcd(b, a%b)   
2                                              NaN   
3                                              NaN   
4               n , nn, ans = integers with ans =0   

                             code  
0         int gcd(int a, int b) {  
1  return !b ? a : gcd(b, a % b);  
2                               }  
3                    int main() {  
4             int n, nn, ans = 0;  


In [37]:
def clean_text(text):
    return "" if pd.isna(text) else str(text)

df_csv["pseudocode"] = df_csv["pseudocode"].apply(clean_text)
df_csv["code"] = df_csv["code"].apply(clean_text)
print(df_csv.head(5))

                                        pseudocode  \
0                in the function gcd(a,b=integers)   
1  if b=1 return a, else call function gcd(b, a%b)   
2                                                    
3                                                    
4               n , nn, ans = integers with ans =0   

                             code  
0         int gcd(int a, int b) {  
1  return !b ? a : gcd(b, a % b);  
2                               }  
3                    int main() {  
4             int n, nn, ans = 0;  


In [38]:
import pandas as pd
import numpy as np

# Assuming df_csv is already loaded and cleaned
# e.g., df_csv = pd.read_csv("/kaggle/working/spoc-train.csv")
# df_csv["pseudocode"] = df_csv["pseudocode"].apply(lambda x: "" if pd.isna(x) else str(x))
# df_csv["code"] = df_csv["code"].apply(lambda x: "" if pd.isna(x) else str(x))

def reassign_code(df_csv):
    # Create a working copy of the DataFrame
    df_csv = df_csv.copy()
    
    # Iterate through the DataFrame rows
    i = 0
    while i < len(df_csv):
        # Check if current row has missing pseudocode
        if df_csv.loc[i, "pseudocode"] == "":
            # Skip if it's the first row (no previous row to assign to)
            if i == 0:
                i += 1
                continue
                
            # Check for consecutive missing pseudocode
            if i + 1 < len(df_csv) and df_csv.loc[i + 1, "pseudocode"] == "":
                # Handle consecutive case
                # 1. Move current row's code to previous row (append)
                if df_csv.loc[i, "code"] != "":
                    if df_csv.loc[i - 1, "code"] != "":
                        df_csv.loc[i - 1, "code"] += "\n" + df_csv.loc[i, "code"]
                    else:
                        df_csv.loc[i - 1, "code"] = df_csv.loc[i, "code"]
                    df_csv.loc[i, "code"] = ""  # Clear moved code
                
                # 2. Move next row's code to the next valid pseudocode row (prepend)
                if i + 1 < len(df_csv) and df_csv.loc[i + 1, "code"] != "":
                    for j in range(i + 1, len(df_csv)):
                        if df_csv.loc[j, "pseudocode"] != "":
                            if df_csv.loc[j, "code"] != "":
                                # Prepend the second code to the existing code
                                df_csv.loc[j, "code"] = df_csv.loc[i + 1, "code"] + "\n" + df_csv.loc[j, "code"]
                            else:
                                df_csv.loc[j, "code"] = df_csv.loc[i + 1, "code"]
                            df_csv.loc[i + 1, "code"] = ""  # Clear moved code
                            break
                    i += 2  # Skip the next row since we processed it
                else:
                    i += 1
            else:
                # Single missing pseudocode: move code to previous row (append)
                if df_csv.loc[i, "code"] != "":
                    if df_csv.loc[i - 1, "code"] != "":
                        df_csv.loc[i - 1, "code"] += "\n" + df_csv.loc[i, "code"]
                    else:
                        df_csv.loc[i - 1, "code"] = df_csv.loc[i, "code"]
                    df_csv.loc[i, "code"] = ""  # Clear moved code
                i += 1
        else:
            i += 1
    
    return df_csv

# Apply the function to df_csv
df_csv = reassign_code(df_csv)

# Display the result
print(df_csv.head(10))  # Adjust to see more rows if needed

                                          pseudocode  \
0                  in the function gcd(a,b=integers)   
1    if b=1 return a, else call function gcd(b, a%b)   
2                                                      
3                                                      
4                 n , nn, ans = integers with ans =0   
5                                             Read n   
6                             for i=2 to n-1 execute   
7                                        set nn to n   
8  while nn is not equal to 0, set ans to ans + n...   
9                                                      

                                    code  
0                int gcd(int a, int b) {  
1      return !b ? a : gcd(b, a % b);\n}  
2                                         
3                                         
4      int main() {\nint n, nn, ans = 0;  
5                              cin >> n;  
6     for (int i = 2; i <= n - 1; ++i) {  
7                                nn = n

In [39]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import DataLoader
import pandas as pd
import re
from collections import Counter
import math
from torch.nn.utils.rnn import pad_sequence


# ---
# ### 1. Custom Tokenizer
# ---
class CustomTokenizer:
    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.vocab_size = 0
        self.special_tokens = ["<sos>", "<eos>", "<pad>"]
    
    def build_vocab(self, texts):
        """Build vocabulary from a list of texts."""
        all_words = Counter()
        for text in texts:
            words = self.tokenize(text)
            all_words.update(words)
        
        # Add special tokens first
        for token in self.special_tokens:
            self.word2idx[token] = self.vocab_size
            self.idx2word[self.vocab_size] = token
            self.vocab_size += 1
        
        # Add other words
        for word, _ in all_words.most_common():
            if word not in self.word2idx:
                self.word2idx[word] = self.vocab_size
                self.idx2word[self.vocab_size] = word
                self.vocab_size += 1
    
    def tokenize(self, text):
        """Tokenize text into words and punctuation."""
        return re.findall(r'\w+|[^\w\s]', text.lower())
    
    def encode(self, text):
        """Convert text to token IDs."""
        tokens = self.tokenize(text)
        return [self.word2idx.get(token, self.word2idx["<pad>"]) for token in tokens]
    
    def decode(self, token_ids):
        """Convert token IDs back to text."""
        tokens = [self.idx2word.get(idx, "<unk>") for idx in token_ids]
        return " ".join(tokens)

# Build vocabulary from the dataset
all_texts = df_csv["pseudocode"].tolist() + df_csv["code"].tolist()
tokenizer = CustomTokenizer()
tokenizer.build_vocab(all_texts)

# Special token IDs
SOS_TOKEN_ID = tokenizer.word2idx["<sos>"]
EOS_TOKEN_ID = tokenizer.word2idx["<eos>"]
PAD_TOKEN_ID = tokenizer.word2idx["<pad>"]

# Preprocess data with <sos> and <eos>
def preprocess_data(row):
    pseudo = row["pseudocode"]
    code = row["code"]
    source = [SOS_TOKEN_ID] + tokenizer.encode(pseudo) + [EOS_TOKEN_ID] if pseudo else [SOS_TOKEN_ID, EOS_TOKEN_ID]
    target = [SOS_TOKEN_ID] + tokenizer.encode(code) + [EOS_TOKEN_ID] if code else [SOS_TOKEN_ID, EOS_TOKEN_ID]
    return {"source": source, "target": target}

train_data = df_csv.apply(preprocess_data, axis=1).tolist()

# ---
# ### 2. Dataset and DataLoader
# ---
class TranslationDataset(data.Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return torch.tensor(self.data[idx]["source"]), torch.tensor(self.data[idx]["target"])

def collate_fn(batch):
    """Pad sequences in a batch to the longest length."""
    sources, targets = zip(*batch)
    sources_padded = pad_sequence(sources, batch_first=True, padding_value=PAD_TOKEN_ID)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=PAD_TOKEN_ID)
    return sources_padded, targets_padded

dataset = TranslationDataset(train_data)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)

# ---
# ### 3. Transformer Model
# ---
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.pe = pe.unsqueeze(0)
    
    def forward(self, x):
        return x + self.pe[:, :x.size(1), :].to(x.device)

class Transformer(nn.Module):
    def __init__(self, num_layers, d_model, num_heads, dff, vocab_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding(d_model)
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=dff,
            batch_first=True
        )
        self.fc_out = nn.Linear(d_model, vocab_size)
    
    def forward(self, src, tgt):
        src_emb = self.pos_encoding(self.embedding(src))
        tgt_emb = self.pos_encoding(self.embedding(tgt))
        src_padding_mask = (src == PAD_TOKEN_ID)
        tgt_padding_mask = (tgt == PAD_TOKEN_ID)
        tgt_mask = nn.Transformer.generate_square_subsequent_mask(tgt.size(1)).to(tgt.device)
        out = self.transformer(
            src_emb, tgt_emb,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            tgt_mask=tgt_mask
        )
        return self.fc_out(out)

# ---
# ### 4. Training Setup
# ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(
    num_layers=4,
    d_model=256,
    num_heads=4,
    dff=1024,
    vocab_size=tokenizer.vocab_size
).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=PAD_TOKEN_ID)
optimizer = optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.98), eps=1e-9)

def train_epoch(model, dataloader, optimizer, criterion):
    model.train()
    total_loss = 0
    for src, tgt in dataloader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, tgt[:, :-1])
        loss = criterion(output.reshape(-1, output.shape[-1]), tgt[:, 1:].reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)

# Train for 10 epochs
for epoch in range(10):
    loss = train_epoch(model, dataloader, optimizer, criterion)
    print(f"Epoch {epoch+1}/10, Loss: {loss:.4f}")

# Save the model
torch.save(model.state_dict(), "transformer_model.pth")



Epoch 1/10, Loss: 0.7686
Epoch 2/10, Loss: 0.4402
Epoch 3/10, Loss: 0.3818
Epoch 4/10, Loss: 0.3648
Epoch 5/10, Loss: 0.3533
Epoch 6/10, Loss: 0.3478
Epoch 7/10, Loss: 0.3456
Epoch 8/10, Loss: 0.3448
Epoch 9/10, Loss: 0.3452
Epoch 10/10, Loss: 0.3457


In [47]:
def generate_code(model, pseudocode, max_len=100):
    model.eval()
    with torch.no_grad():
        # Split pseudocode into individual lines
        pseudocode_lines = pseudocode.strip().split('\n')
        generated_code_lines = []

        for line in pseudocode_lines:
            # Tokenize the current line
            src_tokens = [SOS_TOKEN_ID] + tokenizer.encode(line) + [EOS_TOKEN_ID]
            src = torch.tensor([src_tokens]).to(device)
            tgt = torch.tensor([[SOS_TOKEN_ID]]).to(device)
            
            # Generate code for the current line
            for _ in range(max_len):
                output = model(src, tgt)
                next_token = output[:, -1, :].argmax(dim=-1).item()
                if next_token == EOS_TOKEN_ID:
                    break
                tgt = torch.cat([tgt, torch.tensor([[next_token]]).to(device)], dim=1)
            
            # Decode the generated tokens and add to results
            generated_code_lines.append(tokenizer.decode(tgt[0].tolist()))
        
        # Join all generated lines into a single string
        return "\n".join(generated_code_lines)

# Load and test the model
model.load_state_dict(torch.load("transformer_model.pth", map_location=device))

# Example multi-line pseudocode input
test_pseudo = """
    create integers ans1, ans2 with ans1 = 0, ans2= 0
create integers n, a, b, c
create constant integer maxn with maxn = 105
create boolean array visit with size maxn
create integer vector array adj with size maxn
create 2d integer array dist with size maxn by maxn


"""

print(f"Pseudocode:\n{test_pseudo}")
print(f"Generated Code:\n{generate_code(model, test_pseudo)}")

  model.load_state_dict(torch.load("transformer_model.pth", map_location=device))


Pseudocode:

    create integers ans1, ans2 with ans1 = 0, ans2= 0
create integers n, a, b, c
create constant integer maxn with maxn = 105
create boolean array visit with size maxn
create integer vector array adj with size maxn
create 2d integer array dist with size maxn by maxn



Generated Code:
<sos> int ans1 = 0 , ans2 = 0 ;
<sos> int main ( ) { int n , a , b , c ;
<sos> } const int maxn = 105 ;
<sos> bool visit [ maxn ] ;
<sos> vector < int > adj [ maxn ] ;
<sos> int dist [ maxn ] [ maxn ] ;


In [42]:
import pickle
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [45]:
import json

# Convert tokenizer object to a dictionary (assuming it has a vocabulary attribute)
tokenizer_dict = tokenizer.__dict__  # Extracts attributes as a dictionary

# Save as JSON
with open("tokenizer.json", "w") as f:
    json.dump(tokenizer_dict, f)
