In [40]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

In [42]:
# Function to generate data
def generate_data(max_length, num_samples, p):
    data = []
    while len(data) < num_samples:
        n = np.random.randint(1, min((max_length + 2) // 3, 8))  # Limit n to ensure total length <= 20
        total_length = 3 * n
        if total_length > max_length:
            continue

        # Determine if the sample should be from the language or not based on probability p
        if np.random.rand() < p:
            sample = 'a' * n + 'b' * n + 'c' * n
            data.append((sample, 1))  # Label 1 for samples in the language
        else:
            sample = ''
            for _ in range(total_length):
                char = np.random.choice(['a', 'b', 'c'])
                sample += char
            data.append((sample, 0))  # Label 0 for non-language samples

    return data

In [43]:
generate_data(20, 20,0.5)

[('aabbbb', 0),
 ('aaaaaabbbbbbcccccc', 1),
 ('accbccabcbbaaaa', 0),
 ('abcccccbabccabbbbc', 0),
 ('aaabbbccc', 1),
 ('abbbcbcacbac', 0),
 ('ccbaabaabbba', 0),
 ('aabaabccc', 0),
 ('ccb', 0),
 ('aaaaaabbbbbbcccccc', 1),
 ('cbcbcbacacbacbb', 0),
 ('cacbbbacbcca', 0),
 ('bcccabbacbbaaaa', 0),
 ('bacbbb', 0),
 ('aabccbabaacc', 0),
 ('accbba', 0),
 ('aaaabbbbcccc', 1),
 ('cac', 0),
 ('bbbcaccbaacccca', 0),
 ('caacaccccccb', 0)]

In [44]:
# Function to convert data to tensors
def data_to_tensor(data):
    X = []
    y = []
    for sample, label in data:
        sample_tensor = torch.zeros(len(sample), dtype=torch.long)
        for i, char in enumerate(sample):
            if char == 'a':
                sample_tensor[i] = 0
            elif char == 'b':
                sample_tensor[i] = 1
            else:
                sample_tensor[i] = 2
        X.append(sample_tensor)
        y.append(label)

    return torch.stack(X), torch.tensor(y, dtype=torch.float)

In [45]:
# Define the RNN model
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, hidden = self.rnn(x)
        output = self.fc(hidden.squeeze(0))
        return output

In [46]:
# Function to train the model
def train_model(model, X_train, y_train, X_val, y_val, num_epochs=100, lr=0.001):
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()

        output = model(X_train)
        loss = criterion(output.squeeze(), y_train)
        loss.backward()
        optimizer.step()

        # Validation loss
        model.eval()
        with torch.no_grad():
            val_output = model(X_val)
            val_loss = criterion(val_output.squeeze(), y_val)

        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

In [47]:
# Generate data
data = generate_data(max_length=20, num_samples=1000, p=0.8)  # 80% of the samples are from the language
X, y = data_to_tensor(data)

# Split data into training and validation sets
split_idx = int(0.8 * len(data))
X_train, X_val = X[:split_idx], X[split_idx:]
y_train, y_val = y[:split_idx], y[split_idx:]

# Initialize and train the RNN model
input_size = 3  # one-hot encoding for characters a, b, c
hidden_size = 16
output_size = 1
rnn_model = RNNModel(input_size, hidden_size, output_size)
train_model(rnn_model, X_train.unsqueeze(0), y_train, X_val.unsqueeze(0), y_val)

RuntimeError: stack expects each tensor to be equal size, but got [3] at entry 0 and [18] at entry 1

In [59]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np

class LanguageDataset(Dataset):
    def __init__(self, max_length, p, num_samples):
        self.max_length = max_length
        self.p = p
        self.num_samples = num_samples
        
        self.samples = self.generate_samples()
    
    def generate_samples(self):
        samples = []
        for _ in range(self.num_samples):
            length = np.random.randint(1, self.max_length + 1)
            sequence = self.generate_sequence(length)
            samples.append((sequence, int(self.is_language(sequence))))
        return samples
    
    def generate_sequence(self, length):
        sequence = ""
        for _ in range(length):
            choice = np.random.choice(['a', 'b', 'c'], p=self.p)
            sequence += choice
        return sequence
    
    def is_language(self, sequence):
        # Check if the sequence belongs to the language a^n b^n c^n
        counts = {'a': 0, 'b': 0, 'c': 0}
        for char in sequence:
            counts[char] += 1
        return counts['a'] == counts['b'] == counts['c']
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        return self.samples[idx]

def generate_data(max_length, p, num_samples, batch_size=32):
    dataset = LanguageDataset(max_length, p, num_samples)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return data_loader

# Example usage:
max_length = 20
p = [0.3, 0.3, 0.4]  # Probability distribution for choosing characters
num_samples = 1000
batch_size = 32

data_loader = generate_data(max_length, p, num_samples, batch_size)

# Printing first batch as an example
for batch in data_loader:
    print(batch)
    break


[('babc', 'bc', 'abca', 'acbbccababaccbcbaa', 'aaacbaabac', 'aabacaaaaa', 'cbacaacb', 'ababb', 'aaacbbaabcacaaacaacb', 'aaaacac', 'baaaaba', 'baabaaaa', 'bcacacaaaaababaccb', 'bacbbcbcbbbaccccbaca', 'bcbaca', 'bcaa', 'bcca', 'aaaa', 'baca', 'acbcbbbabbcbcbbabba', 'aaa', 'b', 'cabaaabcacaaaaaa', 'bacacaa', 'a', 'cbbbcccabcabcacbbb', 'acaabaaba', 'cabbaacabbabbbcaa', 'ccacacc', 'acccacc', 'caaa', 'bacbaccbcb'), tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])]


In [57]:
# print the first batch of data
for batch in data_loader:
    print(batch)
    break

[('ccac', 'bccacacbccbaaabbaac', 'cabcbcabbacbabacab', 'bcabaaaaaca', 'cbababbbc', 'cbabacbbacaccca', 'cccacacbbcc', 'bbccbcacbacabbaa', 'abb', 'abbccbaacac', 'cccaccbacbaaabcc', 'abaac', 'bcbcbccbcca', 'cbbcaacbcaaaaa', 'bcacaccac', 'acbc', 'acccabacab', 'bbccacbacbaa', 'acbaabcbbbbbbb', 'bcacaaaca', 'bbccbaa', 'aaccaaaacb', 'baacbabbaaaaaabbaaa', 'cbacbbbbaaa', 'cbbacbccabbbacb', 'bcacaacccacccbc', 'a', 'cbcbcbac', 'abcaaccabccbaa', 'bcbabcaaaccaa', 'accabcaac', 'bcababacb'), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])]
