In [1]:
import numpy as np
import pandas as pd
from itertools import combinations
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm
from multiprocess import Pool
import pickle

In [2]:
# Experiment grid: position q in each list describes one experiment
# (experiment id, number of features, number of dataset rows, upper bound of the values).
experiment_list = [25, 26, 27, 28, 29, 30, 31, 32]
n_features_list = [10, 10, 10, 12, 10, 12, 12, 12]
n_sample_list = [100, 100, 1000, 100, 1000, 100, 1000, 1000]
max_value_list = [100, 1000, 100, 100, 1000, 1000, 100, 1000]

# The four lists are indexed in parallel below, so they must have the same length.
assert len(experiment_list) == len(n_features_list) == len(n_sample_list) == len(max_value_list)

for q in tqdm(range(len(experiment_list))):
    # Re-seed at the start of every experiment so each synthetic dataset is
    # reproducible independently of the experiments run before it.
    np.random.seed(20230801)

    # Size of the synthetic dataset for this experiment
    num_samples = n_sample_list[q]
    num_features = n_features_list[q]

    # Range of the uniform random values
    low = 0
    high = max_value_list[q]

    # One column per feature, drawn in the order Feature1..FeatureN
    data = {}
    for feature_num in range(1, num_features + 1):
        data[f'Feature{feature_num}'] = np.random.uniform(low, high, num_samples)

    # Convert the dictionary to a DataFrame
    syn = pd.DataFrame(data)

    # From here on num_samples is the number of rows of each *query* frame:
    # compute_neighbors reads this name when it draws its random query.
    num_samples = 1

    # Every non-empty subset of the feature columns, ordered by subset size.
    # The subset size is bounded by the number of columns (features), not by
    # the number of rows, and starting at 1 skips the empty subset directly.
    combs = []
    for i in range(1, len(syn.columns) + 1):
        combs += [list(comb) for comb in combinations(syn.columns, i)]

    # Preprocessing: fit the StandardScaler on the synthetic dataset; the same
    # fitted scaler is applied to the queries later.
    scaler = StandardScaler()
    scaled_syn = pd.DataFrame(scaler.fit_transform(syn), columns=syn.columns)

    # Nearest-neighbour lookup of one random query against every feature subset
    def compute_neighbors(seed):
        """Draw a random query from `seed` and find its nearest dataset row per feature subset.

        Reads `low`, `high`, `num_samples`, `num_features`, `scaler`, `combs`
        and `scaled_syn` from the enclosing notebook scope.

        Parameters
        ----------
        seed : int
            Value passed to ``np.random.seed`` before the query is drawn.

        Returns
        -------
        pandas.DataFrame
            One row per entry of `combs`, in the same order, with columns
            'n_features' (size of the subset), 'index' (position of the
            nearest row of `scaled_syn`) and 'distance' (distance to that row
            in the scaled space). Only the first query row is reported.
        """
        np.random.seed(seed)

        # Same column layout as the synthetic dataset, drawn feature by feature
        query = pd.DataFrame({
            f'Feature{k}': np.random.uniform(low, high, num_samples)
            for k in range(1, num_features + 1)
        })

        # Scale the query with the scaler that was fitted on the dataset
        scaled_query = pd.DataFrame(scaler.transform(query), columns=query.columns)

        subset_sizes, neighbor_ids, neighbor_dists = [], [], []

        for comb in combs:
            # NOTE(review): the model is refit on every call although
            # scaled_syn[comb] does not depend on the seed.
            model = NearestNeighbors(n_neighbors=1)
            model.fit(scaled_syn[comb])

            # Single nearest neighbour of the query restricted to this subset
            dists, idxs = model.kneighbors(scaled_query[comb])

            # Record the subset size and the neighbour's index and distance
            subset_sizes.append(len(comb))
            neighbor_ids.append(idxs[0][0])
            neighbor_dists.append(dists[0][0])

        return pd.DataFrame({
            'n_features': subset_sizes,
            'index': neighbor_ids,
            'distance': neighbor_dists
        })

    # Evaluate seeds 0..999 in parallel worker processes; imap yields the
    # results in seed order, so results[i] belongs to seed i.
    num_processors = 4  # Adjust this value based on your CPU core count
    with Pool(num_processors) as pool:
        seed_results = pool.imap(compute_neighbors, range(1000))
        results = list(tqdm(seed_results, total=1000))

    # Key each result frame as "df<seed>"
    d = {f"df{seed}": frame for seed, frame in enumerate(results)}

    # Persist the whole dictionary; the file name encodes the experiment settings
    out_name = (
        f'data_frames_experiment{experiment_list[q]}'
        f'_{n_features_list[q]}features'
        f'_{n_sample_list[q]}samples'
        f'_{max_value_list[q]}vals.obj'
    )
    with open(out_name, 'wb') as f:
        pickle.dump(d, f)

  0%|          | 0/8 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

Process ForkPoolWorker-2:
Process ForkPoolWorker-1:
Process ForkPoolWorker-3:
Process ForkPoolWorker-4:


KeyboardInterrupt: 