In [88]:
import torch
import jax
import jaxlib


# Display, as one tuple: the devices JAX can see, whether PyTorch detects CUDA,
# and the directory jaxlib was imported from.
# NOTE(review): the saved output of this cell contains an absolute local path;
# consider clearing the output before sharing the notebook.
jax.devices(), torch.cuda.is_available(), jaxlib.__path__

([CpuDevice(id=0)],
 False,
 ['c:\\Users\\henry\\Desktop\\MastersProject\\quantum_env\\lib\\site-packages\\jaxlib'])

In [81]:
# `os` is not imported by any visible cell, so import it here to keep this
# cell runnable on a fresh kernel.
import os

# NOTE(review): presumably this only takes effect if it is set before the CUDA
# runtime is first initialised by torch/jax -- confirm, and consider moving it
# above the framework imports.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Check GPU availability in PyTorch
torch_available = torch.cuda.is_available()
if torch_available:
    print("PyTorch GPU Available")
else:
    # Report the negative case too, so an empty output is never ambiguous.
    print("PyTorch GPU Not Available")

# List the devices JAX can see.
# NOTE(review): this reports any device, including the CPU fallback shown in
# the saved output, so it does not by itself prove that a GPU is usable.
jax_devices = jax.devices()
if jax_devices:
    print("JAX Devices:", jax_devices)
else:
    print("No JAX Devices Found")

JAX Devices: [CpuDevice(id=0)]


In [1]:
import pandas as pd
import numpy as np

from scipy.spatial.distance import cosine
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import Isomap
from sklearn.manifold import MDS
from sklearn.manifold import TSNE
from scipy.spatial.distance import pdist, squareform
from scipy.optimize import minimize

from sentence_transformers import SentenceTransformer
# Load the sentence-transformers model that is later used to embed each word.
embedder = SentenceTransformer('paraphrase-MiniLM-L6-v2')
import jax
# Display the devices visible to JAX as this cell's output.
jax.devices()

  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)001fa/.gitattributes: 100%|██████████| 690/690 [00:00<?, ?B/s] 
Downloading (…)_Pooling/config.json: 100%|██████████| 190/190 [00:00<?, ?B/s] 
Downloading (…)3bbb8001fa/README.md: 100%|██████████| 3.69k/3.69k [00:00<?, ?B/s]
Downloading (…)bb8001fa/config.json: 100%|██████████| 629/629 [00:00<00:00, 630kB/s]
Downloading (…)ce_transformers.json: 100%|██████████| 122/122 [00:00<00:00, 122kB/s]
Downloading pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:30<00:00, 3.01MB/s]
Downloading (…)nce_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 53.0kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 112/112 [00:00<?, ?B/s] 
Downloading (…)001fa/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.75MB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 314/314 [00:00<?, ?B/s] 
Downloading (…)3bbb8001fa/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.20MB/s]
Downloading (…)b8001fa/modules.jso

[CpuDevice(id=0)]

In [3]:
def read_and_preprocess_data(
    input_path='Data/LargerSadrKartTransative.txt',
    output_path="Data/AveragedLargerSadrKartTransative.txt",
):
    """Average annotator scores per sentence pair and collect the vocabulary.

    Parameters
    ----------
    input_path : str or path-like
        Space-separated file with eight columns: annotator, the three words
        of sentence 1, the three words of sentence 2, and the score.
    output_path : str or path-like
        Where the averaged table is written as CSV (row index included).

    Returns
    -------
    grouped_data : pandas.DataFrame
        One row per distinct sentence pair with the six word columns,
        ``score`` (mean score divided by the largest mean) and ``range``
        (standard deviation of the raw scores for that pair).
    unique_word_list : list
        Every distinct word in the six word columns, sorted so that the
        order is reproducible between runs.
    """
    word_columns = ['subject1', 'verb1', 'object1', 'subject2', 'verb2', 'object2']

    # NOTE(review): read_csv consumes the first line of the file as a header
    # before the names below replace it -- confirm the file really starts with
    # a header row, otherwise the first record is silently dropped.
    df = pd.read_csv(input_path, sep=' ')
    df.columns = ['annotator'] + word_columns + ['score']

    # String aggregator names avoid the pandas deprecation of passing
    # np.mean / np.std to agg, and pin the current behaviour.
    grouped_data = (
        df.groupby(word_columns)
        .agg(**{'score': ('score', 'mean'), 'range': ('score', 'std')})
        .reset_index()
    )

    # Rescale the mean scores so that the largest one becomes 1.
    grouped_data['score'] = grouped_data['score'] / grouped_data['score'].max()

    # Vectorised replacement for the row-by-row loop; sorting removes the
    # run-to-run ordering differences of a bare set.
    unique_word_list = sorted(set(grouped_data[word_columns].to_numpy().ravel()))

    grouped_data.to_csv(output_path)
    return grouped_data, unique_word_list
# Run the preprocessing once, keeping both the averaged table and the vocabulary.
dataset, unique_word_list = read_and_preprocess_data()

# Map each vocabulary word to a dict of its vector representations; the
# embedder's output is stored under the "SBERT" key.
embeddings = {}
for word in unique_word_list:
    embeddings[word] = {"SBERT": embedder.encode(word)}

# Generate Embeddings and Reduced Vectors

In [69]:
def reduce_by_pca(input_array, new_dims):
    """Project ``input_array`` onto ``new_dims`` principal components.

    The PCA model is fitted on ``input_array`` and then used to transform
    that same array; the transformed data is returned.
    """
    fitted_model = PCA(n_components=new_dims).fit(input_array)
    return fitted_model.transform(input_array)

def reduce_by_svd(input_array, new_dims):
    """Reduce ``input_array`` to ``new_dims`` columns with a truncated SVD.

    Parameters
    ----------
    input_array : array-like, shape (n_rows, n_cols)
        Data matrix; it is used as given (no centring is applied here).
    new_dims : int
        Number of leading singular directions to keep.

    Returns
    -------
    numpy.ndarray, shape (n_rows, new_dims)
        The leading left singular vectors scaled by their singular values.

    Raises
    ------
    ValueError
        If ``new_dims`` exceeds the number of singular values available.
    """
    # The thin SVD is enough: only the leading columns of U are used, so the
    # full square factors would be computed and thrown away.
    U, D, _ = np.linalg.svd(input_array, full_matrices=False)

    if new_dims > D.shape[0]:
        # Fail with a clear message instead of an opaque shape error or a
        # result that has fewer columns than requested.
        raise ValueError(
            f"new_dims={new_dims} exceeds the {D.shape[0]} singular values "
            "available for this input"
        )

    # Broadcasting scales column k of U by D[k], equivalent to U @ diag(D).
    return U[:, :new_dims] * D[:new_dims]

def reduce_by_lda(input_array, new_dims, labels):
    """Fit an LDA model with ``new_dims`` components and return the projection.

    ``labels`` is passed to the fit as the target for ``input_array``.
    """
    return LDA(n_components=new_dims).fit_transform(input_array, labels)

def reduce_by_mds(input_array, new_dims):
    """Embed ``input_array`` into ``new_dims`` dimensions with MDS.

    The previous stub returned ``None``, which broke any caller that indexes
    the result row by row.

    Parameters
    ----------
    input_array : array-like, shape (n_samples, n_features)
        Data to embed.
    new_dims : int
        Dimensionality of the embedding.

    Returns
    -------
    numpy.ndarray, shape (n_samples, new_dims)
        The embedded coordinates, one row per input row.
    """
    # NOTE(review): no random_state is set, so results vary between runs --
    # add a seed if reproducibility matters.
    mds = MDS(n_components=new_dims)
    return mds.fit_transform(input_array)

def reduce_by_isomap(input_array, new_dims, n_neighbors=5):
    """Embed ``input_array`` into ``new_dims`` dimensions with Isomap.

    Parameters
    ----------
    input_array : array-like, shape (n_samples, n_features)
        Feature vectors to embed.
    new_dims : int
        Dimensionality of the embedding.
    n_neighbors : int, default 5
        Number of neighbours used to build the neighbourhood graph.

    Returns
    -------
    numpy.ndarray, shape (n_samples, new_dims)
        The embedded coordinates, one row per input row.
    """
    # Pass the feature vectors directly. The previous version handed Isomap a
    # pairwise distance matrix without declaring it as precomputed, so each
    # row of distances was treated as a feature vector.
    isomap = Isomap(n_neighbors=n_neighbors, n_components=new_dims)
    return isomap.fit_transform(input_array)

def reduce_by_tsne(input_array, new_dims, perplexity=30, learning_rate=200):
    """Embed ``input_array`` into ``new_dims`` dimensions with t-SNE.

    ``perplexity`` and ``learning_rate`` are forwarded unchanged to the
    TSNE constructor; the fitted embedding is returned.
    """
    tsne_options = {
        "n_components": new_dims,
        "perplexity": perplexity,
        "learning_rate": learning_rate,
    }
    return TSNE(**tsne_options).fit_transform(input_array)

def sammon_mapping_loss(Y, X, delta):
    """Sammon stress between an embedding ``Y`` and target distances ``delta``.

    Computes ``sum((d_ij - delta_ij)**2 / delta_ij) / sum(delta_ij)`` over all
    point pairs, where ``d_ij`` are the pairwise distances of ``Y``.

    Parameters
    ----------
    Y : array-like
        Candidate embedding, either shaped ``(n, k)`` or flattened to length
        ``n * k`` (the form an optimiser such as ``scipy.optimize.minimize``
        passes to its objective).
    X : numpy.ndarray, shape (n, ...)
        Original data; only its number of rows is used, to restore the shape
        of ``Y``.
    delta : array-like
        Condensed pairwise distances of the original data (``pdist`` layout).

    Returns
    -------
    float
        The stress value; ``0`` when the embedding reproduces ``delta``.
    """
    n = X.shape[0]
    # Restore the (n, k) layout so pdist works on flattened optimiser input.
    embedding = np.asarray(Y, dtype=float).reshape(n, -1)
    delta = np.asarray(delta, dtype=float)

    # The stress divides by the ORIGINAL distances, so those are the ones that
    # need protecting against zeros (duplicate input points).
    safe_delta = np.where(delta == 0, 1e-10, delta)

    d_ij = pdist(embedding)
    return np.sum((d_ij - delta) ** 2 / safe_delta) / np.sum(safe_delta)

def reduce_by_sammon(input_array, new_dims):
    """Embed ``input_array`` into ``new_dims`` dimensions by minimising the
    Sammon mapping loss with L-BFGS-B from a random start.

    Parameters
    ----------
    input_array : numpy.ndarray, shape (n_samples, n_features)
        Data to embed.
    new_dims : int
        Dimensionality of the embedding.

    Returns
    -------
    numpy.ndarray, shape (n_samples, new_dims)
        The optimised coordinates, one row per input row.
    """
    n_points = input_array.shape[0]
    # Condensed pairwise distances of the original high-dimensional data.
    D = pdist(input_array)

    # NOTE(review): the start point is unseeded, so results differ per run.
    initial_guess = np.random.rand(n_points, new_dims)

    result = minimize(
        # The optimiser works on a flat vector; restore the 2-D layout before
        # evaluating the loss.
        lambda flat_Y: sammon_mapping_loss(
            flat_Y.reshape(n_points, new_dims), input_array, D
        ),
        initial_guess.ravel(),  # minimize expects a 1-D starting point
        method="L-BFGS-B",
    )
    # result.x is flat; return one row per sample so callers can index by row.
    return result.x.reshape(n_points, new_dims)

# Stack the SBERT vector of every word, one row per word, in the insertion
# order of `embeddings` (the same order used when writing results back below).
input_vector_data = np.array([vector_dict['SBERT'] for word, vector_dict in embeddings.items()])
sbert_vectors = [value['SBERT'].T for value in embeddings.values()]
input_matrix_data = np.vstack(sbert_vectors) #For Sammon

# Reduction methods to run, keyed by the prefix used for the result keys.
encode_methods = {
    "pca": reduce_by_pca,
    "svd": reduce_by_svd,
    'mds':reduce_by_mds,
    "isomap": reduce_by_isomap,
    "tsne": reduce_by_tsne,
    #"sammon": reduce_by_sammon
}

# Loop through the reduction methods and apply each one
for method, reduce_func in encode_methods.items():
    # Upper-cased name, used only for messages (e.g., "pca" becomes "PCA")
    method_name = method.upper()

    # Specify dimensions based on the reduction method
    if method == "tsne":
        dims_list = [2, 3]  # t-SNE is typically used with 2 or 3 dimensions
    else:
        dims_list = range(1,20)  # For other methods, you can use the dimensions you want

    # Loop through the desired dimensions for reduction
    for dim in dims_list:
        # Construct the key for the embeddings dictionary
        key = f"{method}_{dim}"

        # Apply the reduction method
        reduced_data = reduce_func(input_vector_data, new_dims=dim)

        # A reducer that returns None has nothing to index; skip the whole
        # method instead of failing with a TypeError on `reduced_data[i]`.
        if reduced_data is None:
            print(f"Skipping {method_name}: reducer returned no data")
            break

        # Update the embeddings dictionary with the reduced data
        for i, (word, vector_dict) in enumerate(embeddings.items()):
            vector_dict[key] = reduced_data[i]

# Show which representations are now stored for one example word.
embeddings['land'].keys()

In [None]:
class EncodedNumpyModel(NumpyModel):
    """NumpyModel whose initial weights are produced by a named encoding.

    The caller selects the encoding by assigning ``type_of_encoding`` on the
    instance before calling :meth:`initialise_weights`.
    """

    # Name of the encoding used by initialise_weights. Defaults to None so
    # that a missing assignment yields a clear ValueError rather than an
    # AttributeError.
    type_of_encoding = None

    def initialise_weights(self) -> None:
        """Initialise the weights of the model.

        Raises
        ------
        ValueError
            If `model.symbols` are not initialised, or if
            `type_of_encoding` is not one of the supported encodings.
        """
        if not self.symbols:
            raise ValueError('Symbols not initialised. Instantiate through '
                             '`from_diagrams()`.')

        if self.type_of_encoding == "pca":
            # NOTE(review): `free_symbols_to_rotations` is not defined in the
            # visible part of the notebook, while `pca_to_rotations` is --
            # confirm which one is intended here.
            self.weights = free_symbols_to_rotations(self.symbols)
        elif self.type_of_encoding == "normal_distribution":
            self.weights = normal_distribution_to_rotations(self.symbols)
        else:
            # Previously an unknown encoding left the weights silently unset.
            raise ValueError(
                f"Unsupported type_of_encoding: {self.type_of_encoding!r}; "
                "expected 'pca' or 'normal_distribution'."
            )


# Backward-compatible alias for the class's previous name.
QuantumEncodedNumpyModel = EncodedNumpyModel

In [None]:
def pca_to_rotations(symbols):
    """Build one weight per symbol from the stored PCA vectors.

    For each symbol, the word, its dimensionality and the component index are
    resolved via ``retrive_word_param_from_symbols``; the matching component
    of ``word_vector_dict[word]['pca_<dims>']`` is divided by ``2 * pi`` and
    stored at the symbol's position.

    Returns a float array with ``len(symbols)`` entries.
    """
    noun_parameters, subject_parameters = get_word_dims_from_ansatz(ANSATZ)
    full_turn = 2 * np.pi

    weights = np.zeros(shape=(len(symbols)))
    for position, symbol in enumerate(symbols):
        word_string, word_dims, word_index = retrive_word_param_from_symbols(
            symbol, noun_parameters, subject_parameters
        )
        reduced_vector = word_vector_dict[word_string]['pca_' + str(word_dims)]
        weights[position] = reduced_vector[word_index] / full_turn
    return weights

def normal_distribution_to_rotations(symbols):
    """Build one weight per symbol by sampling a normal distribution whose
    mean and standard deviation are taken from that word's stored vector.

    For each symbol the word is resolved via
    ``retrive_word_param_from_symbols``; the vector stored under key ``384``
    of ``word_vector_dict[word]`` supplies the mean and standard deviation.

    Returns a float array with ``len(symbols)`` entries.
    """
    noun_parameters, subject_parameters = get_word_dims_from_ansatz(ANSATZ)

    weights = np.zeros(shape=(len(symbols)))
    for i, word_symbol in enumerate(symbols):
        # Only the word itself is needed here; dims and index are unused.
        word_string, _word_dims, _word_index = retrive_word_param_from_symbols(
            word_symbol, noun_parameters, subject_parameters
        )

        # NOTE(review): presumably key 384 holds the full-size embedding --
        # confirm against how word_vector_dict is built.
        word_vector = word_vector_dict[word_string][384]

        # Draw a scalar. Requesting size=1 yields a 1-element array, and
        # assigning that into a single slot is deprecated in recent NumPy.
        weights[i] = np.random.normal(word_vector.mean(), word_vector.std())
    return weights

def uniform_to_rotations(symbols):
    """Placeholder for a uniform weight initialiser.

    Not implemented yet: ``symbols`` is ignored and ``None`` is returned.
    """
    return None

In [None]:
def make_and_train_model(diagrams, train_dataset, val_dataset, model_type, noun_count, ansatz, sentence_count, asnatz_hyperparams, ansatz_name, loss_name, optimizer_name, optimizer_hyperparams_str, num_epochs, batch_size, seed):
    """Build the model selected by ``model_type`` from ``diagrams``.

    Only ``diagrams`` and ``model_type`` are used by the code below; the
    remaining parameters are accepted but not read, and no training is
    performed here.

    Parameters
    ----------
    diagrams
        Passed to ``from_diagrams`` of the chosen model class.
    model_type : str
        ``'random'`` builds a plain ``NumpyModel`` and returns it as built.
        ``'pca'``, ``'normal'``, ``'uniform_zero'``, ``'uniform_half'``,
        ``'uniform_one'``, ``'svd'`` and ``'normal_around_half'`` build an
        ``EncodedNumpyModel`` and call its ``initialise_weights()``.

    Returns
    -------
    The constructed model.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the values listed above.
    """
    # Encoding name to assign for each encoded model type; None means no
    # encoding name is assigned before initialise_weights() is called.
    # NOTE(review): only 'pca' and 'normal_distribution' are handled by the
    # model class visible in this notebook -- confirm how the remaining model
    # types are meant to be initialised.
    encoding_by_model_type = {
        'pca': 'pca',
        'normal': 'normal_distribution',
        'uniform_zero': None,
        'uniform_half': None,
        'uniform_one': None,
        'svd': None,
        'normal_around_half': None,
    }

    if model_type == 'random':
        return NumpyModel.from_diagrams(diagrams, use_jit=True)

    if model_type not in encoding_by_model_type:
        # Report the value that was actually rejected; the old message named
        # the ansatz instead.
        raise ValueError(f"Invalid model_type: {model_type!r}")

    model = EncodedNumpyModel.from_diagrams(diagrams, use_jit=True)
    encoding = encoding_by_model_type[model_type]
    if encoding is not None:
        model.type_of_encoding = encoding
    model.initialise_weights()
    return model