## 1 Mounting the Drive

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so INPUT_DIR below is reachable.
drive.mount('/content/drive')

import os
# Folder on the mounted Drive that is scanned for the input .txt files.
INPUT_DIR = '/content/drive/MyDrive/datafolder'
# Destination of the combined, one-sentence-per-line text file
# (under /content, i.e. outside the mounted Drive).
OUTPUT_TXT = '/content/combined_papers.txt'


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## 2 Install important dependencies for making chunks according to the context window of the model

In [None]:
import nltk

# Download the Punkt tokenizer data packages ('punkt' and 'punkt_tab');
# the sentence splitting done later with nltk's sent_tokenize presumably
# relies on them being present locally.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import os
from nltk import sent_tokenize

# Read every .txt file in INPUT_DIR (sorted by file name for a reproducible
# order) and split it into sentences.
#
# Each document is tokenized on its own: the sentence tokenizer does not
# treat a newline as a sentence boundary, so tokenizing one big joined string
# can glue the last sentence of one paper onto the first sentence of the next
# whenever a file does not end with terminal punctuation.
all_text = []   # stripped text of every non-empty input file
sentences = []  # sentences of all files, in file order
for fn in sorted(os.listdir(INPUT_DIR)):
    if not fn.endswith('.txt'):
        continue
    path = os.path.join(INPUT_DIR, fn)
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read().strip()
    if not text:
        continue  # an empty file contributes no sentences
    all_text.append(text)
    sentences.extend(sent_tokenize(text))

# Kept for any later cell that wants the full corpus as a single string.
combined_text = "\n".join(all_text)

# Write one sentence per line; newlines inside a sentence are flattened to
# spaces so that "one line == one sentence" holds for the chunking step.
with open(OUTPUT_TXT, 'w', encoding='utf-8') as out:
    for s in sentences:
        out.write(s.replace('\n', ' ') + '\n')
print("✅ Combined and sentence-split into:", OUTPUT_TXT)
print(f"Total sentences: {len(sentences)}")


✅ Combined and sentence-split into: /content/combined_papers.txt
Total sentences: 44201


## 3 Creating the chunks: 1024 tokens per chunk (`max_tokens` below) is good enough for Kaggle/Colab pre-training; you can increase it (e.g. to 2048) if you have access to a high-end GPU

In [None]:
from transformers import AutoTokenizer

# Use the target model's tokenizer so chunk sizes are measured in the same
# tokens the model will see.
tokenizer = AutoTokenizer.from_pretrained('unsloth/llama-3-8b-bnb-4bit')
max_tokens = 1024  # token budget per chunk

# Greedy packing: keep appending sentences to the current chunk until the next
# sentence would push it over max_tokens, then start a new chunk.
chunks, current = [], []
current_len = 0
oversized = 0  # sentences that exceed max_tokens on their own

with open(OUTPUT_TXT, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        # NOTE(review): the count is taken from input_ids as returned, so it
        # includes any special tokens the tokenizer adds per call — confirm
        # whether add_special_tokens=False is wanted here.
        n_tokens = len(tokenizer(line).input_ids)
        if n_tokens > max_tokens:
            oversized += 1
        if current_len + n_tokens <= max_tokens:
            current.append(line)
            current_len += n_tokens
        else:
            # Flush only a non-empty buffer: when the very first sentence is
            # already over budget there is nothing to flush, and joining an
            # empty list would emit an empty-string chunk.
            if current:
                chunks.append(" ".join(current))
            current = [line]
            current_len = n_tokens

# Flush the trailing, partially filled chunk.
if current:
    chunks.append(" ".join(current))

print(f"✅ Created {len(chunks)} text chunks (up to {max_tokens} tokens each)")
if oversized:
    # A single sentence is never split, so such a sentence yields a chunk
    # that is larger than the budget; surface that instead of hiding it.
    print(f"⚠️ {oversized} sentence(s) exceed {max_tokens} tokens on their own; "
          "the chunks holding them are over the limit")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

✅ Created 1533 text chunks (up to 1024 tokens each)


In [None]:
# Persist every chunk as its own zero-padded, numbered text file so the
# chunks can be inspected on disk.
os.makedirs('/content/chunks', exist_ok=True)
for idx, chunk_text in enumerate(chunks):
    chunk_path = f'/content/chunks/block_{idx:05d}.txt'
    with open(chunk_path, 'w', encoding='utf-8') as handle:
        handle.write(f"{chunk_text}\n")
print("✅ Saved text chunks to /content/chunks/")


✅ Saved text chunks to /content/chunks/


## 4 Hugging Face Upload of the dataset

#### Make sure you fill in your credentials and use an HF_TOKEN with write access

In [None]:
from datasets import Dataset, DatasetDict
from huggingface_hub import login

login()  # paste your HF token when prompted

# One row per chunk, stored in a single 'text' column.
ds = Dataset.from_dict({'text': chunks})
# Shuffle with a fixed seed so the uploaded row order is reproducible.
# (Selecting range(len(chunks)) afterwards would pick every row again, so it
# is not needed.)
data_dict = DatasetDict({'train': ds.shuffle(seed=42)})
repo = "your_dataset_name"
data_dict.push_to_hub(repo, private=False)
# Dataset repositories live under the /datasets/ prefix on the Hub; without
# it the link points at a (non-existent) model repository.
print(f"✅ Uploaded text dataset to: https://huggingface.co/datasets/{repo}")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

✅ Uploaded text dataset to: https://huggingface.co/Harshu0117/AKS_IISC_1024_processed


## 5 Check that the dataset loads and that all the entries are correct

In [None]:
from datasets import load_dataset
# Load the 'train' split of the dataset by name; "your_dataset_name" is the
# same placeholder used in the upload cell and must be replaced to match it.
# Note: this rebinds `ds`, which the upload cell used for the in-memory dataset.
ds = load_dataset("your_dataset_name", split="train")
