In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from discopy.grammar import Word
from discopy.rigid import Cup, Id, Ty
import torch
import random
from sentence_transformers import SentenceTransformer
from lambeq import LossFunction, PennyLaneModel, PytorchTrainer, QuantumTrainer, SPSAOptimizer, NumpyModel, MSELoss, Dataset, AtomicType, IQPAnsatz, Sim14Ansatz, Sim15Ansatz, StronglyEntanglingAnsatz, BobcatParser
from lambeq.pregroups import remove_cups
import jax as jnp


from jax import numpy as jnp
import os
import joblib

In [None]:
def read_and_preprocess_data(input_path='Data/LargerSadrKartTransative.txt',
                             output_path='Data/AveragedLargerSadrKartTransative.txt'):
    """Load the annotated sentence-pair file and average the scores per pair.

    The space-separated file at ``input_path`` is read with its first line
    taken as a header, which is then replaced by the eight column names
    below. Rows are grouped by the six word columns; each group gets the
    mean of its scores (column ``score``) and their standard deviation
    (column ``range``). ``score`` is then divided by its maximum value.

    NOTE: rescaling ``score`` to [-1, 1] with ``MinMaxScaler`` was tried
    earlier and stopped training from converging, so only the division by
    the maximum is applied.

    Args:
        input_path: Path of the raw annotation file.
        output_path: Path the averaged table is written to as CSV
            (including the index column).

    Returns:
        tuple: ``(grouped_data, unique_word_list)`` where ``grouped_data`` is
        the averaged DataFrame and ``unique_word_list`` is the alphabetically
        sorted list of every distinct word in the six word columns.
    """
    word_columns = ['subject1', 'verb1', 'object1', 'subject2', 'verb2', 'object2']

    df = pd.read_csv(input_path, sep=' ')
    # assign column names to the dataframe
    df.columns = ['annotator'] + word_columns + ['score']

    # String aliases select the same pandas mean / sample-std reductions as
    # passing np.mean / np.std did, without the pandas 2.x deprecation warning.
    grouped_data = df.groupby(word_columns).agg({'score': ['mean', 'std']}).reset_index()
    # flatten the multi-level column names of the grouped data
    grouped_data.columns = [' '.join(col).strip() for col in grouped_data.columns.values]
    # rename the mean and std columns to 'score' and 'range' respectively
    grouped_data = grouped_data.rename(columns={'score mean': 'score', 'score std': 'range'})
    grouped_data['score'] = grouped_data['score'] / grouped_data['score'].max()

    # Sorted so the vocabulary order is identical on every run; a bare
    # list(set(...)) depends on string hashing and changes between kernels.
    unique_word_list = sorted(set(grouped_data[word_columns].to_numpy().ravel()))

    grouped_data.to_csv(output_path)
    return grouped_data, unique_word_list
# Module-level averaged dataset and vocabulary; `dataset` is read as a global by get_datasets().
dataset, unique_word_list = read_and_preprocess_data()

In [None]:
def retrive_nth_rows_sentences(data, row1, row2=None):
    """Build the two space-joined sentences of a sentence pair.

    Args:
        data: Mapping/DataFrame indexable as ``data[column][row]`` with the
            columns ``subject1, verb1, object1, subject2, verb2, object2``.
        row1: Row used for the first sentence.
        row2: Row used for the second sentence; defaults to ``row1``.

    Returns:
        tuple: ``(sentence1, sentence2)`` as "subject verb object" strings.
    """
    # Compare against None explicitly: row index 0 is falsy, and `not row2`
    # silently replaced an explicit row2=0 with row1.
    if row2 is None:
        row2 = row1
    sentence1 = " ".join(data[part + "1"][row1] for part in ("subject", "verb", "object"))
    sentence2 = " ".join(data[part + "2"][row2] for part in ("subject", "verb", "object"))
    return sentence1, sentence2

def make_sentence_a_state(sentence):
    """Turn a space-separated sentence into a diagram with its cups removed.

    The sentence is split on single spaces, handed to ``diagram_to_sentence``
    and the resulting diagram is passed through lambeq's ``remove_cups``.
    """
    tokens = sentence.split(" ")
    return remove_cups(diagram_to_sentence(tokens))

def make_diagram_a_circuit(diagram, ansatz, dagger=False):
    """Apply ``ansatz`` to ``diagram`` and optionally take the dagger.

    Args:
        diagram: Diagram passed to the ansatz.
        ansatz: Callable mapping a diagram to a circuit.
        dagger: When truthy, return ``circuit.dagger()`` instead of the
            circuit itself.

    Returns:
        The circuit produced by the ansatz, or its dagger.
    """
    circuit = ansatz(diagram)
    return circuit.dagger() if dagger else circuit

def concat_circuits_into_inner_product(circuit1, circuit2):
    """Compose ``circuit1`` followed by ``circuit2`` using the ``>>`` operator.

    When ``circuit2`` is the dagger of a sentence circuit, the composite
    is the overlap circuit of the two sentences.
    """
    return circuit1 >> circuit2

def make_diagrams(data, sentence1, sentence2=None):
    """Build the two cup-free diagrams of a sentence pair.

    If ``sentence1`` is exactly an ``int`` it is treated as a row number:
    both arguments are forwarded to ``retrive_nth_rows_sentences`` to look
    the sentences up in ``data``. Otherwise both arguments are used as
    sentence strings directly, so ``sentence2`` must then be given.

    Returns:
        tuple: ``(diagram1, diagram2)``.
    """
    # Exact type check: bool or numpy integer values are not treated as rows.
    if type(sentence1) == int:
        sentence1, sentence2 = retrive_nth_rows_sentences(data, sentence1, sentence2)
    return make_sentence_a_state(sentence1), make_sentence_a_state(sentence2)

def diagram_to_sentence(word_list):
    """Build the pregroup diagram of a subject-verb-object sentence.

    Only the first three entries of ``word_list`` are used: entries 0 and 2
    become words of type ``n`` and entry 1 a word of type ``n.r @ s @ n.l``.
    The words are tensored together and composed with the cups that contract
    the two nouns into the verb, leaving a single ``s`` wire.

    NOTE(review): despite its name, this maps a word list *to* a diagram.
    """
    n, s = Ty('n'), Ty('s')
    verb_type = n.r @ s @ n.l
    subject = Word(word_list[0], n)
    verb = Word(word_list[1], verb_type)
    obj = Word(word_list[2], n)
    words = [subject, verb, obj]

    cups = Cup(n, n.r) @ Id(s) @ Cup(n.l, n)

    # Sanity checks that the variadic tensor agrees with chained `@`.
    assert Id().tensor(*words) == words[0] @ words[1] @ words[2]
    assert Ty().tensor(*[n.r, s, n.l]) == n.r @ s @ n.l

    return Id().tensor(*words) >> cups

def get_word_dims_from_ansatz(ANSATZ):
    """Return a pair of parameter counts derived from an ansatz' ``ob_map``.

    ``noun`` and ``sent`` are the integers that ``ANSATZ.ob_map`` assigns to
    the types ``n`` and ``s``. The second count is computed from
    ``noun + noun + sent``.

    NOTE(review): presumably these are the trainable-parameter counts of a
    noun word and of a transitive verb (type ``n.r @ s @ n.l``) for one
    layer -- confirm against the lambeq ansatz definitions.

    Returns:
        tuple: ``(noun_parameters, subject_parameters)`` for ``IQPAnsatz``,
        ``Sim14Ansatz`` and ``Sim15Ansatz``. For ``StronglyEntanglingAnsatz``
        a message is printed and ``None`` is returned; any other ansatz
        type also yields ``None``.
    """
    noun = ANSATZ.ob_map[Ty('n')]
    sent = ANSATZ.ob_map[Ty('s')]
    wide_word = noun + noun + sent

    if isinstance(ANSATZ, IQPAnsatz):
        return (3 if noun == 1 else (noun - 1)), wide_word - 1

    if isinstance(ANSATZ, Sim14Ansatz):
        return (3 if noun == 1 else noun * 4), 4 * wide_word

    if isinstance(ANSATZ, Sim15Ansatz):
        return (3 if noun == 1 else noun * 2), 2 * wide_word

    if isinstance(ANSATZ, StronglyEntanglingAnsatz):
        print("ERROR NOT IMPLEMENTED YET")

def make_circuit_from_diagrams(diagram1, diagram2, ansatz, drawing=False):
    """Build the overlap circuit of two diagrams.

    ``diagram1`` is turned into a circuit with ``ansatz``; ``diagram2`` is
    turned into a circuit and daggered; the two are then composed.

    Args:
        diagram1: Diagram of the first sentence.
        diagram2: Diagram of the second sentence (its circuit is daggered).
        ansatz: Callable mapping a diagram to a circuit.
        drawing: When truthy, draw both parts and the composite circuit.

    Returns:
        tuple: ``(pennylane_circuit, discopy_circuit)`` where the first item
        is ``discopy_circuit.to_pennylane()``.
    """
    state_circuit = make_diagram_a_circuit(diagram1, ansatz)
    effect_circuit = make_diagram_a_circuit(diagram2, ansatz, dagger=True)
    overlap_circuit = concat_circuits_into_inner_product(state_circuit, effect_circuit)

    if drawing:
        figures = (
            (state_circuit, (5, 5)),
            (effect_circuit, (5, 5)),
            (overlap_circuit, (5, 10)),
        )
        for circuit, size in figures:
            circuit.draw(figsize=size)

    return overlap_circuit.to_pennylane(), overlap_circuit

def make_circuit_from_df_row(data, row_number, ansatz):
    """Build the overlap circuit for the sentence pair in one row of ``data``.

    Returns:
        tuple: ``(qml_circuit, discopy_circuit)`` as produced by
        ``make_circuit_from_diagrams`` with drawing disabled.
    """
    first_diagram, second_diagram = make_diagrams(data, row_number)
    pair = make_circuit_from_diagrams(first_diagram, second_diagram, ansatz, False)
    qml_circuit, discopy_circuit = pair
    return qml_circuit, discopy_circuit

In [None]:
def get_datasets(ansatz, seed, batch_size):
    """Build the training and validation lambeq ``Dataset`` objects.

    Seeds torch, ``random`` and numpy, reads the split files
    ``Data/TrainingData.txt`` and ``Data/TestData.txt``, converts every row
    of each into its overlap circuit and pairs it with a label taken from
    the ``score`` column of the module-level ``dataset`` frame.

    Labels are selected by the values of each split's ``'Unnamed: 0'``
    column (presumably the row index of the averaged dataset written by
    ``to_csv`` -- confirm against how the split files were produced).

    Returns:
        tuple: ``(diagrams, train_dataset, val_dataset)`` where ``diagrams``
        is the list of training circuits followed by validation circuits.
    """
    for seeder in (torch.manual_seed, random.seed, np.random.seed):
        seeder(seed)

    def _circuits_of(frame):
        # Index [1] keeps the discopy circuit and drops the PennyLane one.
        return [make_circuit_from_df_row(frame, idx, ansatz)[1] for idx in range(len(frame))]

    scores = dataset['score']

    train_frame = pd.read_csv("Data/TrainingData.txt")
    test_frame = pd.read_csv("Data/TestData.txt")

    train_circuits = _circuits_of(train_frame)
    train_scores = scores[train_frame['Unnamed: 0'].values]
    val_circuits = _circuits_of(test_frame)
    val_scores = scores[test_frame['Unnamed: 0'].values]

    all_circuits = train_circuits + val_circuits

    return (
        all_circuits,
        Dataset(train_circuits, train_scores, batch_size=batch_size),
        Dataset(val_circuits, val_scores, batch_size=batch_size),
    )

class EncodedNumpyModel(NumpyModel):
    """``NumpyModel`` whose weights come from a custom initialiser.

    NOTE(review): ``param_initialise_method`` is not defined in the visible
    code; it has to be attached to the class or instance before
    ``initialise_weights`` is called -- confirm where it is set.
    """

    def initialise_weights(self) -> None:
        """Set ``self.weights`` to ``self.param_initialise_method(self.symbols)``.

        Raises:
            ValueError: If ``self.symbols`` is empty or unset.
        """
        if self.symbols:
            self.weights = self.param_initialise_method(self.symbols)
            return
        raise ValueError('Symbols not initialised. Instantiate through '
                         '`from_diagrams()`.')

In [None]:
def check_model_exists_and_load(model_folder_path):
    """Load a previously saved training run, if one is fully present.

    A run counts as present only when all three files written by
    ``save_model_training`` exist inside ``model_folder_path``. A folder
    that is missing, empty or only partly written (e.g. an interrupted
    save) is reported as "not found" instead of raising, so the caller can
    simply train again.

    Args:
        model_folder_path: Folder holding ``model_params.joblib``,
            ``training_losses.npy`` and ``validation_losses.npy``.

    Returns:
        tuple: ``(model_params, training_losses, validation_losses, True)``
        when the run was loaded, otherwise ``(None, None, None, False)``.
    """
    model_params_filepath = os.path.join(model_folder_path, "model_params.joblib")
    training_losses_filepath = os.path.join(model_folder_path, "training_losses.npy")
    validation_losses_filepath = os.path.join(model_folder_path, "validation_losses.npy")

    required_files = (model_params_filepath, training_losses_filepath, validation_losses_filepath)
    if not all(os.path.isfile(path) for path in required_files):
        return None, None, None, False

    # joblib.load unpickles the file: only point this at folders this
    # notebook wrote itself, never at untrusted data.
    model_params = joblib.load(model_params_filepath)
    training_losses = np.load(training_losses_filepath)
    validation_losses = np.load(validation_losses_filepath)

    return model_params, training_losses, validation_losses, True

def save_model_training(model_folder_path, model_params, training_losses, validation_losses):
    """Persist the result of a training run into ``model_folder_path``.

    The folder is created if needed. The parameters are written with
    ``joblib.dump`` to ``model_params.joblib``; the two loss sequences are
    written with ``np.save`` to ``training_losses.npy`` and
    ``validation_losses.npy``.
    """
    os.makedirs(model_folder_path, exist_ok=True)

    def _inside(filename):
        return os.path.join(model_folder_path, filename)

    params_target = _inside("model_params.joblib")
    train_target = _inside("training_losses.npy")
    val_target = _inside("validation_losses.npy")

    joblib.dump(model_params, params_target)
    np.save(train_target, training_losses)
    np.save(val_target, validation_losses)

def run_quantum_trainer(model_type, ansatz, loss_function, optimizer, optim_hyperparams, num_epochs, batch_size, seed, text='text'):
    """Train one configuration with lambeq's ``QuantumTrainer``, or load it from cache.

    A folder name under ``models/`` is derived from every argument that
    defines the run. If that folder already holds a saved run, it is loaded
    and returned without training. Otherwise the datasets are built, a model
    is trained and the result is saved to that folder.

    Args:
        model_type: Only ``'random'`` is supported (a ``NumpyModel`` built
            with ``from_diagrams``).
        ansatz: Ansatz instance; its ``ob_map`` and ``n_layers`` go into the
            folder name.
        loss_function: Loss instance passed to the trainer.
        optimizer: Optimizer class passed to the trainer.
        optim_hyperparams: Dict of optimizer hyper-parameters.
        num_epochs: Number of training epochs.
        batch_size: Batch size of the training and validation datasets.
        seed: Seed for torch, ``random``, numpy and the trainer.
        text: Forwarded to the trainer as ``verbose``.

    Returns:
        tuple: ``(model_params, training_losses, validation_losses)``.

    Raises:
        ValueError: If ``model_type`` is not supported.
    """
    ### Preparation ###
    ansatz_hyperparams = {
        'n': ansatz.ob_map[Ty('n')],
        's': ansatz.ob_map[Ty('s')],
        'layers': ansatz.n_layers,
    }

    ansatz_name = ansatz.__class__.__name__.lower()
    loss_name = loss_function.__class__.__name__.lower()
    optimizer_name = optimizer.__name__.lower()
    optimizer_hyperparams_str = '_'.join(f"{key}{val}" for key, val in optim_hyperparams.items())
    ansatz_hyperparams_str = '_'.join(f"{key}{val}" for key, val in ansatz_hyperparams.items())

    # The folder name is the cache key; its format must stay stable so that
    # runs saved earlier are still found.
    model_folder = (
        f"{model_type}_{ansatz_name}_{ansatz_hyperparams_str}_{loss_name}_{optimizer_name}"
        f"_{optimizer_hyperparams_str}_epochs{num_epochs}_batch{batch_size}_seed{seed}"
    )
    model_folder_path = os.path.join("models", model_folder)

    model_params, training_losses, validation_losses, already_trained = check_model_exists_and_load(model_folder_path)
    if already_trained is True:
        return model_params, training_losses, validation_losses
    print(ansatz_name, ansatz_hyperparams_str, ansatz.n_layers, loss_name, optimizer_name, optimizer_hyperparams_str, num_epochs, batch_size, seed)

    # Reject unsupported model types before the (expensive) circuit building.
    if model_type != 'random':
        raise ValueError(f"Invalid model type: {model_type}")

    ### Data Produced
    torch.manual_seed(seed)
    random.seed(seed)
    np.random.seed(seed)
    diagrams, train_dataset, val_dataset = get_datasets(ansatz, seed, batch_size)

    ### Model Assignment according to function inputs
    model = NumpyModel.from_diagrams(diagrams, use_jit=True)

    trainer = QuantumTrainer(
        model,
        loss_function=loss_function,
        epochs=num_epochs,
        optimizer=optimizer,
        optim_hyperparams=optim_hyperparams,
        evaluate_on_train=True,
        verbose=text,
        seed=seed
    )
    trainer.fit(train_dataset, val_dataset, logging_step=500)

    # Return what was just trained. The three names still held the None
    # placeholders from the cache lookup, so a fresh run used to return
    # (None, None, None). Losses are converted to arrays so a fresh run and
    # a cached run (np.load) hand back the same types.
    model_params = model.weights
    training_losses = np.asarray(trainer.train_epoch_costs)
    validation_losses = np.asarray(trainer.val_costs)

    save_model_training(model_folder_path, model_params, training_losses, validation_losses)

    return model_params, training_losses, validation_losses

In [None]:
# Number of training epochs per run; also sets the SPSA 'A' hyper-parameter below.
EPOCHS = 500

# Integer assigned to the noun / sentence atomic types for each ansatz,
# keyed "<noun>_<sentence>" in the order 1_1, 2_1, 1_2, 2_2.
ansatz_param_config = {
    f'{noun_size}_{sentence_size}': {AtomicType.NOUN: noun_size, AtomicType.SENTENCE: sentence_size}
    for sentence_size in (1, 2)
    for noun_size in (1, 2)
}

# SPSA hyper-parameter grid: every (a, c) pair from {0.01, 0.1, 1.0}, with
# 'A' fixed at 1% of the epoch count. Keys read "<a>_<c>_<A>".
optimizer_param_config = {
    f'{a}_{c}_{0.01 * EPOCHS}': {'a': a, 'c': c, 'A': 0.01 * EPOCHS}
    for a in (0.01, 0.1, 1.0)
    for c in (0.01, 0.1, 1.0)
}

# Number of ansatz layers, keyed by its string form.
layer_config = {str(depth): depth for depth in (1, 2, 3)}

# Ansatz classes included in the sweep (IQPAnsatz is currently disabled).
ansatz_config = {
    #'IQP': IQPAnsatz,
    'Sim14': Sim14Ansatz,
    'Sim15': Sim15Ansatz,
}

loss_function_config = {'mse': MSELoss()}

optimizer_config = {'spsa': SPSAOptimizer}

BATCH_SIZE = 2
SEED = 42
# Full grid sweep over ansatz class x type sizes x layers x loss x optimizer x
# optimizer hyper-parameters. Each dict is walked in reverse insertion order
# (reversed() on dict.items() needs Python 3.8+).
# run_quantum_trainer returns early when a saved run for the same
# configuration already exists under models/, so re-running this cell
# continues an interrupted sweep instead of retraining finished runs.
for ansatz_name, ansatz in reversed(ansatz_config.items()):
    for ansatz_param_name, ansatz_param in reversed(ansatz_param_config.items()):
        for layer_name, layer in reversed(layer_config.items()):
            for loss_name, loss_function in reversed(loss_function_config.items()):
                for optimizer_name, optimizer in reversed(optimizer_config.items()):
                    for o_param_name, o_param in reversed(optimizer_param_config.items()):
                        # A fresh ansatz instance is built per run from the class, its type sizes and its layer count.
                        run_quantum_trainer('random', ansatz(ansatz_param, n_layers = layer), loss_function, optimizer, o_param, EPOCHS, BATCH_SIZE, SEED, "text")

sim15ansatz n2_s1_layers3 3 mseloss spsaoptimizer a0.01_c0.1_A5.0 500 2 42


No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
Epoch 1:    train/loss: 0.2113   valid/loss: 0.1445
Epoch 500:  train/loss: 0.2116   valid/loss: 0.1445

Training completed!


sim15ansatz n2_s1_layers3 3 mseloss spsaoptimizer a0.01_c0.01_A5.0 500 2 42


Epoch 1:    train/loss: 0.2114   valid/loss: 0.1445


: 