In [1]:
import numpy as np
import pickle
import csv
import os
import math
import random
import seaborn as sns
import pandas as pd
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath

# Libraries for multiprocessing
import multiprocessing as mp
from multiprocessing import Pool

In [2]:
def grab_random_words(word_vectors, num_words):
    """Return up to ``num_words`` distinct words picked at random from a vocabulary.

    Args:
        word_vectors: Object exposing a ``key_to_index`` mapping whose keys are
            the vocabulary words (``main`` passes ``model.wv``).
        num_words: Requested number of words. Capped at the vocabulary size;
            values below zero are treated as zero.

    Returns:
        list: The selected words, in random order and without repeats.
    """
    vocabulary = list(word_vectors.key_to_index)
    sample_size = max(0, min(num_words, len(vocabulary)))
    # random.sample draws without replacement, so no word appears twice
    # (random.choices draws with replacement and can repeat words).
    return random.sample(vocabulary, sample_size)

def calculate_nearest_neighbors_correlation(word_vectors1, word_vectors2, words, num_neighbors=10):
    """Average, over ``words``, how well two models agree on nearest-neighbor order.

    For every word found in both vocabularies, the ``num_neighbors`` most
    similar words are requested from each model. When at least two neighbors
    are shared, the Pearson correlation (``np.corrcoef``) between the shared
    neighbors' positions in the two result lists is recorded.

    Args:
        word_vectors1: First model. Either an object carrying its vectors in a
            ``wv`` attribute, or the vectors object itself (anything exposing
            ``key_to_index`` and ``most_similar``).
        word_vectors2: Second model; same accepted forms as ``word_vectors1``.
        words: Iterable of query words.
        num_neighbors: Value passed as ``topn`` to ``most_similar``.

    Returns:
        float: Mean of the per-word correlations, or ``np.nan`` when no word
        produced one (absent from a vocabulary, or fewer than two shared
        neighbors).
    """
    # Accept both a full model (vectors under ``.wv``) and bare keyed vectors,
    # matching grab_random_words, which is handed the vectors directly.
    vectors1 = getattr(word_vectors1, "wv", word_vectors1)
    vectors2 = getattr(word_vectors2, "wv", word_vectors2)

    correlations = []
    for word in words:
        # Skip words that either model does not know.
        if word not in vectors1.key_to_index or word not in vectors2.key_to_index:
            continue

        neighbors1 = [hit[0] for hit in vectors1.most_similar(word, topn=num_neighbors)]
        neighbors2 = [hit[0] for hit in vectors2.most_similar(word, topn=num_neighbors)]

        common_neighbors = set(neighbors1) & set(neighbors2)
        # A correlation needs at least two paired observations.
        if len(common_neighbors) < 2:
            continue

        # Both lists iterate the same set object, so positions stay paired.
        ranks1 = [neighbors1.index(neighbor) for neighbor in common_neighbors]
        ranks2 = [neighbors2.index(neighbor) for neighbor in common_neighbors]
        correlations.append(np.corrcoef(ranks1, ranks2)[0, 1])

    if not correlations:
        return np.nan
    return np.mean(correlations)


def main(corpus, policy):
    """Train Word2Vec model pairs over a hyperparameter grid and log neighbor agreement.

    Every combination of (window, dimension) for model 1 with (window,
    dimension) for model 2 is visited: 3 * 3 * 3 * 3 = 81 pairs. For each pair
    both models are trained from scratch on ``corpus``, 100 random words are
    drawn from model 1's vocabulary, and the average nearest-neighbor rank
    correlation between the two models is computed, printed and stored.

    Args:
        corpus: Training sentences, passed unchanged to ``Word2Vec``.
        policy: Prefix of the output file; results are written to
            ``policy + "ivalid.csv"`` in the current working directory.

    Returns:
        None. The results table is written to CSV as a side effect.
    """
    # Leave one core free, but never request fewer than one worker: on a
    # single-CPU machine ``cpu_count() - 1`` is 0, which would leave gensim
    # without any training thread.
    num_workers = max(1, mp.cpu_count() - 1)

    # Hyperparameters
    window_sizes_1 = [1, 6, 48]  # Example window sizes for model 1
    window_sizes_2 = [1, 6, 48]  # Example window sizes for model 2
    dimension_sizes_1 = [50, 300, 450]  # Example dimension sizes for model 1
    dimension_sizes_2 = [50, 300, 450]  # Example dimension sizes for model 2
    num_words = 100  # Number of random words to sample

    # Same iteration order as four nested loops: model-1 settings outermost.
    grid = [
        (window_size_1, dimension_size_1, window_size_2, dimension_size_2)
        for window_size_1 in window_sizes_1
        for dimension_size_1 in dimension_sizes_1
        for window_size_2 in window_sizes_2
        for dimension_size_2 in dimension_sizes_2
    ]

    # Data storage: one dict per grid point, turned into a DataFrame at the end.
    data = []

    for window_size_1, dimension_size_1, window_size_2, dimension_size_2 in grid:
        # Train Word2Vec models. Both are retrained for every grid point, so
        # each row compares two independently trained models.
        model1 = Word2Vec(corpus, window=window_size_1, vector_size=dimension_size_1,
                          min_count=100, sg=1, hs=0, negative=5, workers=num_workers)
        model2 = Word2Vec(corpus, window=window_size_2, vector_size=dimension_size_2,
                          min_count=100, sg=1, hs=0, negative=5, workers=num_workers)

        # Grab random words from model 1's vocabulary
        words = grab_random_words(model1.wv, num_words)

        # Average correlation of nearest-neighbor ranks (NaN if none computed)
        avg_correlation = calculate_nearest_neighbors_correlation(model1, model2, words)

        # Store data
        data.append({'Window Size 1': window_size_1,
                     'Dimension Size 1': dimension_size_1,
                     'Window Size 2': window_size_2,
                     'Dimension Size 2': dimension_size_2,
                     'Average Correlation': avg_correlation})

        print(window_size_1, dimension_size_1, window_size_2, dimension_size_2, avg_correlation)

    # Create DataFrame
    df = pd.DataFrame(data, columns=['Window Size 1', 'Dimension Size 1', 'Window Size 2', 'Dimension Size 2', 'Average Correlation'])

    # Export DataFrame to CSV
    df.to_csv(policy + "ivalid.csv", index=False)


In [None]:
# Load the pickled healthcare corpus. Note: pickle.load executes whatever the
# file encodes, so only use it on files produced by a trusted pipeline.
with open('intermediate/healthcare_corpus.pickle', 'rb') as f:
    list_of_lists = pickle.load(f)

# Remove one level of nesting: concatenate the items of every sub-list.
flatten = []
for sublist in list_of_lists:
    flatten.extend(sublist)

# The training corpus is a separate list holding the same items.
corpus = list(flatten)

if __name__ == "__main__":
    main(corpus, "healthcare")

1 50 1 50 0.6419017741753815
