In [1]:
import torch
from torch.utils.data import Dataset

In [2]:
import numpy as np



In [3]:
# Disabled earlier draft of LanguageDataset, kept only as a bare string
# expression: nothing in it is executed, the cell just evaluates to (and
# echoes) the text below.
# The docstring's own triple quotes end the outer literal early, so this is
# really three adjacent string literals that Python concatenates implicitly;
# that is why the echoed value has lost the docstring delimiters and contains
# a stray '"' before `super().__init__()`.
# NOTE(review): this draft appends sequences of different lengths (3 * n) to
# `data` and passes them straight to torch.tensor without padding --
# presumably the reason it was abandoned; confirm before reviving it.
"""
class LanguageDataset(Dataset):
    def __init__(self, max_length, num_samples, p, device='cpu'):
        """"""Generate a language dataset.

        Args:
            max_length: Maximum length of the sequence
            num_samples: Number of samples to generate
            p: Probability of a sample being from the language
            device: Device to store the data ('cpu' or 'cuda')
        """""""
        super().__init__()
        self.max_length = max_length
        self.num_samples = num_samples
        self.p = p
        self.device = device
        self.generate_data()

    def generate_data(self):
        data = []
        labels = []
        
        while len(data) < self.num_samples:
            n = np.random.randint(1, min((self.max_length + 2) // 3, 8))
            total_length = 3 * n
            if total_length > self.max_length:
                continue

            if np.random.rand() < self.p:
                sample = 'a' * n + 'b' * n + 'c' * n
                data.append([ord(c) for c in sample])  # Convert chars to ASCII values
                labels.append(1)
            else:
                sample = ''.join(np.random.choice(['a', 'b', 'c'], total_length))
                data.append([ord(c) for c in sample])  # Convert chars to ASCII values
                labels.append(0)

        self.data = torch.tensor(data, dtype=torch.float32).to(self.device)
        self.labels = torch.tensor(labels, dtype=torch.long).to(self.device)

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        data_point = self.data[idx]
        data_label = self.labels[idx]
        return data_point, data_label

    def toNumpy(self):
        return self.data.cpu().numpy(), self.labels.cpu().numpy()"""

'\nclass LanguageDataset(Dataset):\n    def __init__(self, max_length, num_samples, p, device=\'cpu\'):\n        Generate a language dataset.\n\n        Args:\n            max_length: Maximum length of the sequence\n            num_samples: Number of samples to generate\n            p: Probability of a sample being from the language\n            device: Device to store the data (\'cpu\' or \'cuda\')\n        "\n        super().__init__()\n        self.max_length = max_length\n        self.num_samples = num_samples\n        self.p = p\n        self.device = device\n        self.generate_data()\n\n    def generate_data(self):\n        data = []\n        labels = []\n        \n        while len(data) < self.num_samples:\n            n = np.random.randint(1, min((self.max_length + 2) // 3, 8))\n            total_length = 3 * n\n            if total_length > self.max_length:\n                continue\n\n            if np.random.rand() < self.p:\n                sample = \'a\' * n + \'b\' * 

# Second attempt: zero-pad every sequence to `max_length` so the variable-length samples can be stacked into a single tensor

In [4]:

class LanguageDataset(Dataset):
    """Synthetic binary-classification dataset for the language a^n b^n c^n.

    Each sample is a string over {'a', 'b', 'c'} of length 3 * n, encoded as
    the ASCII codes of its characters and right-padded with zeros up to
    ``max_length``. The label is 1 when the string is exactly
    ``'a' * n + 'b' * n + 'c' * n`` and 0 otherwise.

    Attributes:
        data: float32 tensor of shape (num_samples, max_length) on ``device``.
        labels: int64 tensor of shape (num_samples,) on ``device``.
    """

    # Largest block size ``n`` that is ever sampled, independent of max_length.
    MAX_BLOCK_SIZE = 7

    def __init__(self, max_length, num_samples, p, device='cpu', seed=None):
        """Generate a language dataset.

        Args:
            max_length: Maximum length of the sequence; also the padded width
                of every row in ``data``.
            num_samples: Number of samples to generate
            p: Probability of a sample being from the language
            device: Device to store the data ('cpu' or 'cuda')
            seed: Optional seed for a private random generator. When ``None``
                the global ``np.random`` state is used.

        Raises:
            ValueError: If samples are requested but ``max_length`` is smaller
                than 3, the length of the shortest string in the language.
        """
        super().__init__()
        self.max_length = max_length
        self.num_samples = num_samples
        self.p = p
        self.device = device
        self.seed = seed
        self.generate_data()

    def generate_data(self):
        """Sample the sequences and labels and store them as tensors.

        Populates ``self.data`` and ``self.labels``; called once from
        ``__init__``.
        """
        # The module-level np.random functions and RandomState expose the same
        # randint / rand / choice API, so either can serve as the generator.
        rng = np.random if self.seed is None else np.random.RandomState(self.seed)

        # Largest n whose string (3 * n characters) still fits in max_length.
        max_n = min(self.max_length // 3, self.MAX_BLOCK_SIZE)
        if self.num_samples > 0 and max_n < 1:
            raise ValueError(
                "max_length must be at least 3 to fit the shortest string 'abc', "
                f"got {self.max_length}"
            )

        data = []
        labels = []

        while len(data) < self.num_samples:
            # randint's upper bound is exclusive, hence the + 1.
            n = rng.randint(1, max_n + 1)
            in_language = 'a' * n + 'b' * n + 'c' * n

            if rng.rand() < self.p:
                sample = in_language
                label = 1
            else:
                # A uniformly random string can coincide with a^n b^n c^n
                # (1 in 27 for n = 1); redraw so label 0 is never wrong.
                sample = in_language
                while sample == in_language:
                    sample = ''.join(rng.choice(['a', 'b', 'c'], 3 * n))
                label = 0

            data.append([ord(c) for c in sample])  # Convert chars to ASCII values
            labels.append(label)

        # Pad sequences to ensure consistent dimensions
        padded_data = self.pad_sequences(data, self.max_length)

        self.data = torch.tensor(padded_data, dtype=torch.float32).to(self.device)
        self.labels = torch.tensor(labels, dtype=torch.long).to(self.device)

    def pad_sequences(self, sequences, max_length):
        """Right-pad each sequence with zeros to ``max_length``.

        Args:
            sequences: List of numeric sequences, each at most ``max_length`` long.
            max_length: Width of the returned array.

        Returns:
            NumPy array of shape (len(sequences), max_length) holding each
            sequence at the start of its row, followed by zeros.
        """
        padded_sequences = np.zeros((len(sequences), max_length))
        for i, seq in enumerate(sequences):
            padded_sequences[i, :len(seq)] = seq
        return padded_sequences

    def __len__(self):
        """Return the number of generated samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(padded_sequence, label)`` pair stored at ``idx``."""
        data_point = self.data[idx]
        data_label = self.labels[idx]
        return data_point, data_label

    def toNumpy(self):
        """Return ``(data, labels)`` as NumPy arrays copied to host memory."""
        return self.data.cpu().numpy(), self.labels.cpu().numpy()




In [6]:
# Demo: build a small CPU-resident dataset, then show its size followed by
# the first (sequence, label) pair, one per line.
dataset = LanguageDataset(max_length=20, num_samples=100, p=0.5, device='cpu')
print(len(dataset), dataset[0], sep='\n')