# Setting up the environment

In [None]:
# Install the libraries this notebook uses; -q reduces pip's console output.
!pip install -q transformers datasets accelerate evaluate sacrebleu nltk torch sentencepiece
# Best-effort install of CodeBLEU straight from GitHub: `|| true` forces the
# shell command's exit status to success even when pip fails.
# NOTE(review): confirm this repository URL resolves -- a failed install here
# is easy to miss and would only surface when CodeBLEU is first used.
!pip install -q git+https://github.com/microsoft/CodeBLEU.git || true

In [None]:
import os
import random
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, default_data_collator)
from transformers import DataCollatorForLanguageModeling
import torch
from sklearn.model_selection import train_test_split
import evaluate
import sacrebleu


In [None]:
# Seed every random number generator this notebook imports so that repeated
# runs draw the same random numbers.
Seed = 42
random.seed(Seed)       # Python's built-in generator
np.random.seed(Seed)    # NumPy's global generator
# torch keeps its own generator, independent of `random` and NumPy, so it has
# to be seeded separately (per the torch docs this covers all devices).
torch.manual_seed(Seed)
# Hash randomization of the running interpreter is decided at startup, so this
# only takes effect in Python subprocesses that inherit the environment.
os.environ['PYTHONHASHSEED'] = str(Seed)


# Import the SPOC dataset

In [None]:
# Load the single dataset file provided on Kaggle and turn it into a DataFrame
# with one row per (pseudocode, code) pair.
path = '/kaggle/input/psuedocode-and-python/dataSet.txt'
# Raise explicitly instead of using `assert`: assertions are stripped when
# Python runs with -O, which would let a missing file go unchecked here.
if not os.path.exists(path):
    raise FileNotFoundError(f'Dataset not found at {path}')

# errors='ignore' drops any bytes that are not valid UTF-8 instead of failing.
with open(path, 'r', encoding='utf-8', errors='ignore') as f:
    text = f.read()

# The file contains alternating <|pseudocode|> and <|code|> blocks. Splitting on
# the pseudocode marker yields one segment per pair; inside a segment, everything
# before the first <|code|> marker is pseudocode and everything after it is code.
pairs = []
for part in text.split('<|pseudocode|>'):
    pseudo_part, marker, code_part = part.partition('<|code|>')
    if not marker:
        # Segment has no code marker (e.g. text before the first pair): skip it.
        continue
    pseudo = pseudo_part.strip()
    code = code_part.strip()
    # Keep only pairs where both sides are non-empty after trimming whitespace.
    if pseudo and code:
        pairs.append({'pseudo': pseudo, 'code': code})

# Name the columns explicitly so the frame still has 'pseudo' and 'code' columns
# when no pair was parsed.
df = pd.DataFrame(pairs, columns=['pseudo', 'code'])
print('Parsed pairs:', len(df))
print(df.head(5).to_dict(orient='records'))


# Preprocessing