## How to load the different datasets in ML4AlgComb

This notebook shows how to load the datasets and how to create dataloaders that you can use for model training. 

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (e.g. load_datasets, dataloaders) are reloaded before each cell executes.
%load_ext autoreload
%autoreload 2

In [3]:
from load_datasets import get_dataset
from dataloaders import CombDataModule, OneHotDataModule

`FOLDER` is the path to the folder that contains the datasets; it is passed to `get_dataset` in every loading cell below.

In [4]:
# Relative path of the dataset folder; passed as `folder=` to each get_dataset call.
FOLDER = "data/"

## Grassmannian cluster algebras

In [None]:
# Grassmannian cluster algebras: only one value of N is supported.
dataset_name = "grassmannian_cluster_algebras"
N = 12
# get_dataset hands back the train/test splits followed by three size values.
(
    X_train,
    y_train,
    X_test,
    y_test,
    input_size,
    output_size,
    num_tokens,
) = get_dataset(data=dataset_name, n=N, folder=FOLDER)

In [None]:
# Number of examples labelled 1 across the train and test splits combined.
ones_in_train = sum(1 for label in y_train if label == 1)
ones_in_test = sum(1 for label in y_test if label == 1)
ones_in_train + ones_in_test

In [None]:
# Number of examples labelled 0 across the train and test splits combined.
zeros_in_train = sum(1 for label in y_train if label == 0)
zeros_in_test = sum(1 for label in y_test if label == 0)
zeros_in_train + zeros_in_test

In [None]:
# Wrap the loaded splits in a data module and prepare it for iteration.
batch_size_choice = 32
data_module = CombDataModule(
    X_train,
    y_train,
    X_test,
    y_test,
    batch_size=batch_size_choice,
)
data_module.setup()

In [None]:
# Show only the first training batch (nothing is printed if the loader is empty).
first_batch = next(iter(data_module.train_dataloader()), None)
if first_batch is not None:
    seq, labs = first_batch
    print(seq)
    print(labs)

## KL polynomial coefficients

In [None]:
# KL polynomial coefficients: N = 5, 6, or 7 supported.
dataset_name = "kl_polynomial"
N = 7
# get_dataset hands back the train/test splits followed by three size values.
(
    X_train,
    y_train,
    X_test,
    y_test,
    input_size,
    output_size,
    num_tokens,
) = get_dataset(data=dataset_name, n=N, folder=FOLDER)

In [110]:
# Wrap the loaded splits in a data module and prepare it for iteration.
batch_size_choice = 32
data_module = CombDataModule(
    X_train,
    y_train,
    X_test,
    y_test,
    batch_size=batch_size_choice,
)
data_module.setup()

In [None]:
# Show only the first training batch (nothing is printed if the loader is empty).
first_batch = next(iter(data_module.train_dataloader()), None)
if first_batch is not None:
    seq, labs = first_batch
    print(seq)
    print(labs)

## Lattice path posets

In [None]:
# Lattice path posets: N = 10, 11, 12, 13 supported.
dataset_name = "lattice_path"
N = 13
# get_dataset hands back the train/test splits followed by three size values.
(
    X_train,
    y_train,
    X_test,
    y_test,
    input_size,
    output_size,
    num_tokens,
) = get_dataset(data=dataset_name, n=N, folder=FOLDER)

In [None]:
# This is not a balanced dataset: compute the fraction of all examples
# (train + test) whose label is 0.
zero_count = sum(1 for label in y_train if label == 0) + sum(1 for label in y_test if label == 0)
zero_count / (len(y_train) + len(y_test))

In [107]:
# Wrap the loaded splits in a data module and prepare it for iteration.
batch_size_choice = 32
data_module = CombDataModule(
    X_train,
    y_train,
    X_test,
    y_test,
    batch_size=batch_size_choice,
)
data_module.setup()

In [None]:
# Show only the first training batch (nothing is printed if the loader is empty).
first_batch = next(iter(data_module.train_dataloader()), None)
if first_batch is not None:
    seq, labs = first_batch
    print(seq)
    print(labs)

## mHeight

In [None]:
# mHeight: N = 8, 9, or 10 are supported.
# The cell previously used N = 11, which is outside the supported range
# stated here; use the largest supported value instead.
dataset_name = "mheight"
N = 10
# get_dataset hands back the train/test splits followed by three size values.
(
    X_train,
    y_train,
    X_test,
    y_test,
    input_size,
    output_size,
    num_tokens,
) = get_dataset(data=dataset_name, n=N, folder=FOLDER)

In [27]:
# Wrap the loaded splits in a data module and prepare it for iteration.
batch_size_choice = 32
data_module = CombDataModule(
    X_train,
    y_train,
    X_test,
    y_test,
    batch_size=batch_size_choice,
)
data_module.setup()

In [None]:
# Show only the first training batch (nothing is printed if the loader is empty).
first_batch = next(iter(data_module.train_dataloader()), None)
if first_batch is not None:
    seq, labs = first_batch
    print(seq)
    print(labs)

## Quiver mutation equivalence

In [None]:
# Quiver mutation equivalence: this is the only value of N supported.
dataset_name = "quiver"
N = 11
# get_dataset hands back the train/test splits followed by three size values.
(
    X_train,
    y_train,
    X_test,
    y_test,
    input_size,
    output_size,
    num_tokens,
) = get_dataset(data=dataset_name, n=N, folder=FOLDER)

In [None]:
# This is not a balanced dataset: for each label value 0..6, print the
# fraction of all examples (train + test) carrying that label.
for class_label in range(7):
    hits_train = sum(1 for label in y_train if label == class_label)
    hits_test = sum(1 for label in y_test if label == class_label)
    print((hits_train + hits_test) / (len(y_train) + len(y_test)))

In [104]:
# Wrap the loaded splits in a data module and prepare it for iteration.
batch_size_choice = 32
data_module = CombDataModule(
    X_train,
    y_train,
    X_test,
    y_test,
    batch_size=batch_size_choice,
)
data_module.setup()

In [None]:
# Show only the first training batch (nothing is printed if the loader is empty).
first_batch = next(iter(data_module.train_dataloader()), None)
if first_batch is not None:
    seq, labs = first_batch
    print(seq)
    print(labs)

## RSK

In [None]:
# RSK dataset, loaded here with N = 10.
dataset_name = "rsk"
N = 10
# get_dataset hands back the train/test splits followed by three size values.
(
    X_train,
    y_train,
    X_test,
    y_test,
    input_size,
    output_size,
    num_tokens,
) = get_dataset(data=dataset_name, n=N, folder=FOLDER)

In [None]:
# Total number of examples across the train and test splits.
sum(len(split) for split in (X_train, X_test))

In [100]:
# Wrap the loaded splits in a data module and prepare it for iteration.
batch_size_choice = 32
data_module = CombDataModule(
    X_train,
    y_train,
    X_test,
    y_test,
    batch_size=batch_size_choice,
)
data_module.setup()

In [None]:
# Show only the first training batch (nothing is printed if the loader is empty).
first_batch = next(iter(data_module.train_dataloader()), None)
if first_batch is not None:
    seq, labs = first_batch
    print(seq)
    print(labs)

## Schubert polynomials

In [None]:
# Schubert polynomials: N = 4, 5, 6 are supported.
dataset_name = "schubert"
N = 5
# get_dataset hands back the train/test splits followed by three size values.
(
    X_train,
    y_train,
    X_test,
    y_test,
    input_size,
    output_size,
    num_tokens,
) = get_dataset(data=dataset_name, n=N, folder=FOLDER)

In [33]:
# Wrap the loaded splits in a data module and prepare it for iteration.
batch_size_choice = 32
data_module = CombDataModule(
    X_train,
    y_train,
    X_test,
    y_test,
    batch_size=batch_size_choice,
)
data_module.setup()

In [None]:
# Show only the first training batch (nothing is printed if the loader is empty).
first_batch = next(iter(data_module.train_dataloader()), None)
if first_batch is not None:
    seq, labs = first_batch
    print(seq)
    print(labs)

## Symmetric group character

In [None]:
# Symmetric group characters: N = 18, 20, 22 supported.
dataset_name = "symmetric_group_char"
N = 18
# get_dataset hands back the train/test splits followed by three size values.
(
    X_train,
    y_train,
    X_test,
    y_test,
    input_size,
    output_size,
    num_tokens,
) = get_dataset(data=dataset_name, n=N, folder=FOLDER)

In [95]:
# Wrap the loaded splits in a data module and prepare it for iteration.
batch_size_choice = 32
data_module = CombDataModule(
    X_train,
    y_train,
    X_test,
    y_test,
    batch_size=batch_size_choice,
)
data_module.setup()

In [None]:
# Show only the first training batch (nothing is printed if the loader is empty).
first_batch = next(iter(data_module.train_dataloader()), None)
if first_batch is not None:
    seq, labs = first_batch
    print(seq)
    print(labs)

In [97]:
# Can also one-hot encode the data: OneHotDataModule additionally takes
# num_tokens (as returned by get_dataset).
data_module = OneHotDataModule(
    X_train,
    y_train,
    X_test,
    y_test,
    num_tokens,
    batch_size=batch_size_choice,
)
data_module.setup()
# Input size to use with the one-hot encoded data. Stored under its own name
# instead of overwriting `input_size`, so re-running this cell does not
# multiply by num_tokens a second time.
one_hot_input_size = input_size * num_tokens

In [None]:
# Show only the first training batch (nothing is printed if the loader is empty).
first_batch = next(iter(data_module.train_dataloader()), None)
if first_batch is not None:
    seq, labs = first_batch
    print(seq)
    print(labs)

## Weaving patterns

In [None]:
# Weaving patterns: N = 6 or 7 supported.
dataset_name = "weaving"
N = 6
# get_dataset hands back the train/test splits followed by three size values.
(
    X_train,
    y_train,
    X_test,
    y_test,
    input_size,
    output_size,
    num_tokens,
) = get_dataset(data=dataset_name, n=N, folder=FOLDER)

In [143]:
# Wrap the loaded splits in a data module and prepare it for iteration.
batch_size_choice = 32
data_module = CombDataModule(
    X_train,
    y_train,
    X_test,
    y_test,
    batch_size=batch_size_choice,
)
data_module.setup()

In [None]:
# Show only the first training batch (nothing is printed if the loader is empty).
first_batch = next(iter(data_module.train_dataloader()), None)
if first_batch is not None:
    seq, labs = first_batch
    print(seq)
    print(labs)