In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
import numpy as np
import torch

In [2]:
# Function to generate data
def generate_data(max_length, num_samples, p):
    data = []
    while len(data) < num_samples:
        n = np.random.randint(1, min((max_length + 2) // 3, 8))  # Limit n to ensure total length <= 20
        total_length = 3 * n
        if total_length > max_length:
            continue

        # Determine if the sample should be from the language or not based on probability p
        if np.random.rand() < p:
            sample = 'a' * n + 'b' * n + 'c' * n
            data.append((sample, 1))  # Label 1 for samples in the language
        else:
            sample = ''
            for _ in range(total_length):
                char = np.random.choice(['a', 'b', 'c'])
                sample += char
            data.append((sample, 0))  # Label 0 for non-language samples

    return data

#print(generate_data(20, 20,0.5))

In [3]:
class LanguageDataset(Dataset):
    def __init__(self, max_length, p, num_samples):
        self.max_length = max_length
        self.p = p
        self.num_samples = num_samples
        
        self.samples = self.generate_samples()
    
    def generate_samples(self):
        samples = []
        for _ in range(self.num_samples):
            length = np.random.randint(1, self.max_length + 1)
            sequence = self.generate_sequence(length)
            samples.append((sequence, int(self.is_language(sequence))))
        return samples
    
    def generate_sequence(self, length):
        sequence = ""
        for _ in range(length):
            choice = np.random.choice(['a', 'b', 'c'], p=self.p)
            sequence += choice
        return sequence
    
    def is_language(self, sequence):
        # Check if the sequence belongs to the language a^n b^n c^n
        counts = {'a': 0, 'b': 0, 'c': 0}
        for char in sequence:
            counts[char] += 1
        return counts['a'] == counts['b'] == counts['c']
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        return self.samples[idx]

def generate_data(max_length, p, num_samples, batch_size=32):
    dataset = LanguageDataset(max_length, p, num_samples)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return data_loader

# Example usage:
max_length = 20
p = [0.3, 0.3, 0.4]  # Probability distribution for choosing characters
num_samples = 1000
batch_size = 32

data_loader = generate_data(max_length, p, num_samples, batch_size)

# Printing first batch as an example
for batch in data_loader:
    print(batch)
    break


[('c', 'baaab', 'abaaacccccbbbc', 'cacccccbbcaccaaabb', 'bbbacbbbc', 'cacacacbaaa', 'babc', 'cc', 'cab', 'c', 'aaacaccabacaacabaa', 'bc', 'abaaaac', 'ccbccccbacaccaabacba', 'caabcbccccbac', 'abaaccabbacbccccb', 'acabacbc', 'ac', 'bccabcbababcac', 'cbcbcacccbca', 'c', 'bbcacbbb', 'cbbabacbbbcbcba', 'abcccbaaac', 'cbbacccbca', 'acaabbcbabbcacabcabc', 'cbbababaabaacbbabca', 'bcbaccaca', 'cacbbccaaababcbcbc', 'bcbbaa', 'cccaa', 'aacbabccccbcaab'), tensor([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])]


In [7]:
class RNNCell(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(RNNCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, hidden_size)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = torch.tanh(self.i2h(combined))
        output = self.h2o(hidden)
        return output, hidden

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, n_layers=1):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.RNN_layers = nn.ModuleList([RNNCell(input_size, hidden_size) for _ in range(n_layers)])
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden=None):
        if hidden is None:
            hidden = self.init_hidden(input.size(0))
        outputs = []
        for layer_idx in range(self.n_layers):
            layer_outputs = []
            for input_idx in range(input.size(1)):
                x = input[:, input_idx, :]
                output, hidden[layer_idx] = self.RNN_layers[layer_idx](x, hidden[layer_idx])
                layer_outputs.append(output)
            outputs.append(torch.stack(layer_outputs, dim=1))
            input = outputs[-1]
        output = self.fc(outputs[-1][:, -1, :])
        return output

    def init_hidden(self, batch_size):
        return [torch.zeros(batch_size, self.hidden_size) for _ in range(self.n_layers)]


In [8]:
class LanguageDataset(Dataset):
    def __init__(self, max_length, p, num_samples):
        self.max_length = max_length
        self.p = p
        self.num_samples = num_samples
        self.samples = self.generate_samples()
    
    def generate_samples(self):
        samples = []
        for _ in range(self.num_samples):
            length = np.random.randint(1, self.max_length + 1)
            if np.random.rand() < self.p:
                n = length // 3
                sequence = 'a' * n + 'b' * n + 'c' * n
            else:
                sequence = ''.join(np.random.choice(['a', 'b', 'c'], size=length))
            samples.append((sequence, int(self.is_language(sequence))))
        return samples
    
    def is_language(self, sequence):
        counts = {'a': 0, 'b': 0, 'c': 0}
        for char in sequence:
            counts[char] += 1
        return counts['a'] == counts['b'] == counts['c']
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        sequence, label = self.samples[idx]
        one_hot_sequence = torch.nn.functional.one_hot(torch.tensor([ord(c) - ord('a') for c in sequence]), num_classes=3).float()
        return one_hot_sequence, torch.tensor(label)

def generate_data(max_length, p, num_samples, batch_size=32):
    dataset = LanguageDataset(max_length, p, num_samples)
    train_size = int(0.7 * len(dataset))
    val_size = int(0.15 * len(dataset))
    test_size = len(dataset) - train_size - val_size
    train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    return train_loader, val_loader, test_loader


In [9]:
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    best_val_acc = 0
    best_model = None
    for epoch in range(num_epochs):
        model.train()
        for sequences, labels in train_loader:
            optimizer.zero_grad()
            hidden = model.init_hidden(sequences.size(0))
            outputs = model(sequences, hidden)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
        
        # Validation
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for sequences, labels in val_loader:
                hidden = model.init_hidden(sequences.size(0))
                outputs = model(sequences, hidden)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        val_acc = correct / total
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = model
    return best_model, best_val_acc

# Hyper-parameter search
input_size = 3  # 'a', 'b', 'c'
output_size = 2  # 0 or 1
hidden_sizes = [16, 32, 64]
learning_rates = [0.001, 0.01, 0.1]
best_val_acc = 0
best_model = None

train_loader, val_loader, test_loader = generate_data(20, 0.5, 1000, 32)

for hidden_size in hidden_sizes:
    for lr in learning_rates:
        model = RNN(input_size, hidden_size, output_size)
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=lr)
        
        model, val_acc = train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10)
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            best_model = model

print(f'Best validation accuracy: {best_val_acc}')


RuntimeError: one_hot is only applicable to index tensor.