In [None]:
# Install required dependencies
!pip install datasets huggingface_hub

In [1]:
# Download jablonkagroup/chempile-education dataset from Hugging Face
from datasets import load_dataset
import pandas as pd

# Load the dataset (must specify a config name)
print("Downloading jablonkagroup/chempile-education dataset from Hugging Face...")
dataset = load_dataset("jablonkagroup/chempile-education", "LibreText_Chemistry-default")

# Display dataset information
print(f"Dataset keys: {list(dataset.keys())}")
print(f"Number of examples in train split: {len(dataset['train'])}")

# Convert to pandas DataFrame for easier exploration
df = dataset['train'].to_pandas()
print(f"DataFrame shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Display first few rows
print("\nFirst 3 rows of the dataset:")
df.head(3)

  from .autonotebook import tqdm as notebook_tqdm


Downloading jablonkagroup/chempile-education dataset from Hugging Face...
Dataset keys: ['train', 'test', 'val']
Number of examples in train split: 53051
DataFrame shape: (53051, 2)
Columns: ['url', 'text']

First 3 rows of the dataset:


Unnamed: 0,url,text
0,Bookshelves/Physical_and_Theoretical_Chemistry...,The laws of probability apply to events that a...
1,Courses/University_of_Georgia/CHEM_3212%3A_Phy...,One important consequence of Botlzmann’s propo...
2,Bookshelves/General_Chemistry/Chemistry_2e_(Op...,1. A burning match and a bonfire may have the ...


In [None]:
# env file
from dotenv import load_dotenv
import os
load_dotenv()

In [None]:
from huggingface_hub import HfApi

# Initialize the Hugging Face API
api = HfApi()
user_info = api.whoami(token=os.getenv("HF_TOKEN"))
print(user_info)

DATASET_ID = "iAli61/chempile-education-train"

In [8]:
!pip install -qqqU datasets rensa semhash

16050.07s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


In [None]:
# remove duplicate text

# 🧼 AutoDedup

# Automatically deduplicate your datasets using [Rensa](https://github.com/beowolx/rensa), a high-performance MinHash implementation in Rust, made by [@beowolx](https://x.com/beowolx).

# 🤗 Dataset parameters

# `COLUMN` is the name of the column to deduplicate in your dataset.
COLUMN = "text" 

#  `SPLIT` is the split of the dataset to load (e.g., "all", "train", "test", etc.)
SPLIT = "train" # @param {type:"string"}

#  `ALGORITHM` is the MinHash implementation used by Rensa (see details [here](https://github.com/beowolx/rensa?tab=readme-ov-file#r-minhash-original-rensa-variant)).
ALGORITHM = "CMinHash" 


import pandas as pd
from datasets import load_dataset
from rensa import CMinHash, RMinHash
from tqdm import tqdm

def get_minhash(text, minhash_class, num_perm, seed=0):
    """Function to generate MinHash"""
    m = minhash_class(num_perm=num_perm, seed=seed)
    if type(text) == str:
        m.update(text.split())
        return m
    elif type(text) == list and type(text[0]) == dict:
        for t in text:
            key = "content" if "content" in t else "value"
            m.update(t[key].split())
        return m
    else:
        raise ValueError(f"Column {COLUMN} must contain a string or a list of dicts")

def deduplicate_dataset(dataset, text_column, minhash_class, num_perm=128):
    hash_to_index = {}  # Maps hash to the first occurrence index
    deduplicated_indices = []
    info = []
    minhash_class = CMinHash if minhash_class == "CMinHash" else RMinHash

    for idx, example in tqdm(enumerate(dataset), total=len(dataset), desc="MinHash deduplication"):
        minhash_obj = get_minhash(example[text_column], minhash_class, num_perm)
        hash_tuple = tuple(minhash_obj.digest())

        if hash_tuple not in hash_to_index:
            # First occurrence of this hash (keep it)
            hash_to_index[hash_tuple] = idx
            deduplicated_indices.append(idx)
        else:
            # Duplicate found (record which sample it's similar to)
            original_idx = hash_to_index[hash_tuple]
            info.append({
                'removed_index': idx,
                'similar_to_index': original_idx,
                'hash': hash_tuple
            })

    return deduplicated_indices, info

def get_removed(dataset, info, text_column):
    """Create a simple DataFrame with removed samples"""

    if not info:
        return pd.DataFrame()

    removed_data = []

    for info in tqdm(info, desc="Collecting removed samples"):
        removed_idx = info['removed_index']
        similar_idx = info['similar_to_index']

        # Get the text content from both samples
        removed_text = dataset[removed_idx][text_column]
        similar_text = dataset[similar_idx][text_column]

        # Create a record
        record = {
            'removed_index': removed_idx,
            'similar_to_index': similar_idx,
            'removed_text': removed_text,
            'similar_text': similar_text,
        }

        removed_data.append(record)

    return pd.DataFrame(removed_data)

# Load dataset
dataset = load_dataset("jablonkagroup/chempile-education", "LibreText_Chemistry-default", split=SPLIT)

# Deduplicate dataset
dedup_indices, info = deduplicate_dataset(
    dataset,
    text_column=COLUMN,
    minhash_class=ALGORITHM,
)

# Create the deduplicated dataset
dedup_dataset = dataset.select(dedup_indices)

# Create simple DataFrame with only the 4 required columns
removed_df = get_removed(dataset, info, COLUMN)

print(f"\nOriginal dataset size: {len(dataset)} samples")
print(f"Deduplicated dataset size: {len(dedup_dataset)} samples")
print(f"{len(info)} samples were removed from the original dataset\n")

# Push to HF
dedup_dataset.push_to_hub(f"{DATASET_ID}-dedup")

removed_df

MinHash deduplication: 100%|██████████| 53051/53051 [00:27<00:00, 1931.57it/s]
Collecting removed samples: 100%|██████████| 24387/24387 [00:05<00:00, 4756.40it/s]



Original dataset size: 53051 samples
Deduplicated dataset size: 28664 samples
24387 samples were removed from the original dataset



NameError: name 'DATASET_ID' is not defined

In [12]:
removed_df

Unnamed: 0,removed_index,similar_to_index,removed_text,similar_text
0,91,49,Consider the following system under equilibriu...,Consider the following system under equilibriu...
1,92,53,Welcome to the Chemistry Library. This Living ...,Welcome to the Chemistry Library. This Living ...
2,97,53,Welcome to the Chemistry Library. This Living ...,Welcome to the Chemistry Library. This Living ...
3,124,53,Welcome to the Chemistry Library. This Living ...,Welcome to the Chemistry Library. This Living ...
4,135,67,This text is disseminated via the Open Educati...,This text is disseminated via the Open Educati...
...,...,...,...,...
24382,53045,18128,Learning Objectives Explain the bonding nature...,Learning Objectives Explain the bonding nature...
24383,53046,736,Learning Objectives Distinguish between adhesi...,Learning Objectives Distinguish between adhesi...
24384,53047,266,Learning Objectives To assign a Lewis dot symb...,Learning Objectives To assign a Lewis dot symb...
24385,53048,6481,Coordination Chemistry Now let us direct our f...,Coordination Chemistry Now let us direct our f...


In [13]:
dedup_dataset

Dataset({
    features: ['url', 'text'],
    num_rows: 28664
})

In [3]:
dataset

NameError: name 'dataset' is not defined

In [17]:
from importlib.metadata import version

print(f"tiktoken version: {version('tiktoken')}")
print(f"rensa version: {version('rensa')}")
print(f"datasets version: {version('datasets')}")
print(f"pandas version: {version('pandas')}")
print(f"torch version: {version('torch')}")

tiktoken version: 0.9.0
rensa version: 0.2.7
datasets version: 3.6.0
pandas version: 2.3.0
torch version: 2.7.1


In [1]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
tokenizer.encode("Hello, world!")

[15496, 11, 995, 0]

In [2]:
tokenizer.n_vocab

50257

In [None]:
import tiktoken
import torch
from torch.utils.data import DataLoader, Dataset
COLUMN = "text"  # Column to be used for deduplication

class CHEMPILE_DS_V1(Dataset):
    def __init__(self, dataset, tokenizer, max_length=256, stride=256):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.max_length = max_length
        # add "<|endoftext|>" to the end of each text
        self.texts = " ".join([text + "<|endoftext|>" for text in dataset[COLUMN]][:10])
        self.token_ids = tokenizer.encode(self.texts, allowed_special={"<|endoftext|>"})
        self.input_ids = []
        self.target_ids = []

        # using a sliding window approach to create input-target pairs
        for i in range(0, len(self.token_ids) - self.max_length, stride):
            input_ids = self.token_ids[i:i + self.max_length]
            target_ids = self.token_ids[i + 1:i + self.max_length + 1]
            self.input_ids.append(torch.tensor(input_ids))
            self.target_ids.append(torch.tensor(target_ids))

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader(dataset, 
                      tokenizer, 
                      batch_size=8, 
                      max_length=256, 
                      stride=256,
                      drop_last=True,
                      shuffle=True
                      ):

    chempile_ds = CHEMPILE_DS_V1(dataset, tokenizer, max_length=max_length, stride=stride)

    return DataLoader(chempile_ds, 
                      batch_size=batch_size, 
                      drop_last=drop_last,
                      shuffle=shuffle,
                      )


# Example usage
tokenizer = tiktoken.get_encoding("cl100k_base")
dataloader = create_dataloader(dataset, tokenizer, batch_size=1, max_length=8, stride=8)


dataiter = iter(dataloader)
input_ids, target_ids = next(dataiter)
print(f"Input IDs: {input_ids}")
print(f"Target IDs: {target_ids}")
print(f"Input IDs shape: {input_ids.shape}")
print(f"Target IDs shape: {target_ids.shape}")


Input IDs: tensor([[  449,   279, 20081, 51180,   477, 17055,    87,   579]])
Target IDs: tensor([[  279, 20081, 51180,   477, 17055,    87,   579,    13]])
Input IDs shape: torch.Size([1, 8])
Target IDs shape: torch.Size([1, 8])
