In [1]:
import pickle
# Load the dataset embeddings and the index -> dataset lookup from local pickles.
# Context managers make sure both file handles are closed once loading finishes.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('dataset_embeddings.pkl', 'rb') as f:
    dataset_embeddings = pickle.load(f)
with open('index_to_dataset.pkl', 'rb') as f:
    index_to_dataset = pickle.load(f)

In [2]:
# Re-key the dataset embeddings by the index used in index_to_dataset.
dataset_index_embeddings = {
    index: dataset_embeddings[dataset]
    for index, dataset in index_to_dataset.items()
}

In [3]:
import pickle
import gzip

# The author embeddings are stored as a gzip-compressed pickle; gzip.open
# decompresses on the fly while pickle reads from the stream.
with gzip.open('author_embeddings.pkl.gz', mode='rb') as compressed_file:
    author_embeddings = pickle.load(compressed_file)
len(author_embeddings)

44674

In [4]:
# Fold the dataset embeddings into the author embeddings; if a key exists in
# both, the dataset entry replaces the author entry.
author_embeddings = {
    key: vector
    for mapping in (author_embeddings, dataset_index_embeddings)
    for key, vector in mapping.items()
}

In [5]:
import pandas as pd
# Table of authors and datasets (one row per id) used to label the embeddings.
authors = pd.read_csv(
    filepath_or_buffer='/data/jx4237data/projects/web_page/cm4ai_tkg/work/data/author_dataset.csv',
)

In [34]:
# Ids that appear in the table AND have an embedding.
author_id_set = set(authors['id'].to_list()).intersection(author_embeddings.keys())

In [36]:
# Keep only the rows whose id has an embedding.
has_embedding = authors['id'].isin(author_id_set)
authors = authors.loc[has_embedding]

In [53]:
# Renumber the surviving rows 0..n-1 and expose that position as the 'Index' column.
authors = (
    authors
    .reset_index(drop=True)
    .assign(Index=lambda frame: frame.index)
)
authors

Unnamed: 0,id,FullName,BeginYear,PaperNum,CM4AI,RecentYear,Index,is_author
0,225,Baba Inusa,2005,102,,2024,0,1
1,878,Rutger Schutten,2017,5,,2022,1,1
2,1098,Vardit Ravitsky (CM4AI),2002,192,1.0,2023,2,1
3,1352,Ursula Grohmann,1988,158,,2023,3,1
4,1592,Sharon R Lewin,1993,373,,2023,4,1
...,...,...,...,...,...,...,...,...
29526,123000001136,wFleaBase,2020,100,0.0,0,29526,0
29527,123000001137,yMGV - Yeast microarray global viewer,2020,100,1.0,0,29527,0
29528,123000001138,Protein-Protein Interaction Dataset (The Kroga...,2020,100,1.0,0,29528,0
29529,123000001139,Genetic Perturbations Dataset (Mali Lab),2020,100,1.0,0,29529,0


In [54]:
# Two-way lookup between the dense row position ('Index') and the raw id.
# 'Index' is copied from a freshly reset index, so every value is unique and the
# mapping can be read straight off the two columns; no per-group Python
# callback is needed.
index2aid = dict(zip(authors['Index'].tolist(), authors['id'].tolist()))
aid2index = {aid: index for index, aid in index2aid.items()}
# Keep only the embeddings whose id is present in the table; keys are
# normalised to plain int so they can be used to look up aid2index later.
author_embeddings_single = {
    int(aid): vector
    for aid, vector in author_embeddings.items()
    if aid in aid2index
}

In [55]:
from tqdm import tqdm
fname = "/data/jx4237data/projects/web_page/talentKnowledgeGraph/work/data/tkg_28kAuthor_Dataset.w2v"
# word2vec text format: a "<count> <dim>" header line followed by one
# "<key> <v1> ... <vN>" line per vector. The key written here is the dense
# index from aid2index, not the raw id.
tab = ' '
nl = '\n'
# Derive the dimensionality from the vectors themselves so the header always
# matches what is written; a mismatch would make the file unreadable.
vector_sizes = {len(embedding) for embedding in author_embeddings_single.values()}
if len(vector_sizes) > 1:
    raise ValueError(f"inconsistent embedding sizes: {sorted(vector_sizes)}")
# Fall back to the previously hard-coded size when there is nothing to write.
vector_size = vector_sizes.pop() if vector_sizes else 768
with open(fname, 'wt') as f:
    f.write(f"{len(author_embeddings_single)}{tab}{vector_size}{nl}")
    for key, embedding in tqdm(author_embeddings_single.items()):
        f.write(f"{aid2index[key]}{tab}{tab.join(map(str, embedding))}{nl}")
fname

100%|██████████| 29531/29531 [00:08<00:00, 3312.11it/s]


'/data/jx4237data/projects/web_page/talentKnowledgeGraph/work/data/tkg_28kAuthor_Dataset.w2v'

In [56]:
import numpy as np
import matplotlib.pyplot as plt
import math
from gensim.models import KeyedVectors
import pandas as pd
import emblaze
from emblaze.utils import Field, ProjectionTechnique
from collections import defaultdict, namedtuple

In [57]:
# Read the vectors back from the text-format word2vec file written above.
wv = KeyedVectors.load_word2vec_format(fname, binary=False)

In [58]:
# Reload the author/dataset table and keep only the ids that have an embedding.
all_author_pd = (
    pd.read_csv('/data/jx4237data/projects/web_page/cm4ai_tkg/work/data/author_dataset.csv')
    .loc[lambda frame: frame['id'].isin(set(author_embeddings_single))]
)

In [74]:
# Persist the filtered author/dataset table in the same directory as the source CSV.
# NOTE(review): this cell's execution count (74) is later than that of the cell
# which adds the 'index' column to all_author_pd (59), so the file written in the
# recorded session may contain that extra column, while a top-to-bottom run
# would not - confirm which layout downstream consumers expect.
all_author_pd.to_csv('/data/jx4237data/projects/web_page/cm4ai_tkg/work/data/author_dataset_filtered.csv', index=False)

In [59]:
# Renumber the rows 0..n-1, store that position in an 'index' column, and
# snapshot every row as a namedtuple so rows can be addressed by position.
all_author_pd = all_author_pd.reset_index(drop=True)
all_author_pd['index'] = all_author_pd.index
all_authors = list(all_author_pd.itertuples(index=False))
print(f"author count: {len(all_authors)}; wv len: {len(wv.index_to_key)}")

author count: 29531; wv len: 29531


In [60]:
def ix_to_ix(ix):
    """Translate a row position in ``wv`` into the integer stored as that row's key."""
    key = wv.index_to_key[ix]
    return int(key)

def author_by_ix(ix):
    """Return the record in ``all_authors`` for row position ``ix`` of ``wv``.

    The row position is first translated with ``ix_to_ix``; the result is used
    as the position in ``all_authors``.

    Raises:
        LookupError: if the translated index is not present in ``all_authors``.
            The offending index is printed before the error is re-raised.
    """
    ix = ix_to_ix(ix)
    try:
        # Single lookup; only lookup failures are reported, so unrelated
        # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        return all_authors[ix]
    except LookupError:
        print(f"ix: {ix}")
        raise

In [61]:
# Sanity check: number of vectors that were loaded.
wv.vectors.shape[0]

29531

In [62]:
positions = wv.vectors
# Resolve every vector row to its table record once; the same record supplies
# the label, colour and radius, so there is no need to look it up per field.
row_records = [author_by_ix(index) for index in range(len(wv.vectors))]
names = [record.FullName for record in row_records]
# Colour uses BeginYear; values of 1980 or earlier are clamped to 1980.
colors = [record.BeginYear if int(record.BeginYear) > 1980 else 1980 for record in row_records]
# Radius scales with the square root of PaperNum.
sizes = [math.sqrt(record.PaperNum) / 6. for record in row_records]

emb = emblaze.Embedding(
    {Field.POSITION: positions, Field.NAME: names, Field.COLOR: colors, Field.RADIUS: sizes},
    n_neighbors=20,
)
# neighbors id are just from 0 to n-1, need wv.index_to_key to get the real index
emb.compute_neighbors(metric='cosine')

In [63]:
import json
import math

# Project the embedding with t-SNE.
# NOTE(review): no random seed is passed here, so the layout may differ between
# runs - confirm whether that is acceptable.
tsne_settings = dict(perplexity=40, learning_rate='auto', init='pca', early_exaggeration=1.3)
projection = emb.project(method=ProjectionTechnique.TSNE, **tsne_settings)

# Uncompressed JSON payload of the projection, without neighbor lists.
serialized = projection.to_json(compressed=False, save_neighbors=False)

In [64]:
# Attach the raw id of every point, in the same order as the vectors in wv.
ids_in_vector_order = []
for position in range(len(wv.index_to_key)):
    ids_in_vector_order.append(author_by_ix(position).id)
serialized['ids'] = ids_in_vector_order

In [65]:
# Serialise the neighbors computed on `emb` as uncompressed JSON.
computed_neighbors = emb.get_neighbors()
neighbors_json = computed_neighbors.to_json(compressed=False)

In [66]:
# Author -> recommended-author pairs, stored as a gzip-compressed CSV.
df_rec_authors = pd.read_csv(
    '/data/jx4237data/projects/web_page/talentKnowledgeGraph/notebooks/talentKnowledgeGraph/author_recommendations.csv.gz',
    compression='gzip',
)

In [67]:
# Translate raw ids into row positions of `wv`:
# raw id -> dense index (aid2index) -> vector row (wv.key_to_index, keyed by str).
# Ids without a vector are simply absent from this mapping, so a missing id
# shows up as NaN after .map() instead of relying on str(None) not being a key.
aid_to_wv_row = {
    aid: wv.key_to_index[str(dense_index)]
    for aid, dense_index in aid2index.items()
    if str(dense_index) in wv.key_to_index
}
df_rec_authors = (
    df_rec_authors
    .assign(
        AID_index=lambda frame: frame['AID'].map(aid_to_wv_row),
        recommended_author_id_index=lambda frame: frame['recommended_author_id'].map(aid_to_wv_row),
    )
    # Drop pairs where either side has no vector.
    .dropna(subset=['AID_index', 'recommended_author_id_index'])
    # Unmatched rows may have forced float columns; restore integer positions.
    .astype({'AID_index': int, 'recommended_author_id_index': int})
)
# Source row position -> list of recommended row positions.
result_dict = df_rec_authors.groupby('AID_index')['recommended_author_id_index'].apply(list).to_dict()

In [75]:
# Dataset -> recommended-author pairs, stored as a gzip-compressed CSV.
df_rec_authors_dataset = pd.read_csv('/data/jx4237data/projects/web_page/talentKnowledgeGraph/notebooks/talentKnowledgeGraph/dataset_author_recommendations.csv.gz', compression='gzip')

# Translate raw ids into row positions of `wv`:
# raw id -> dense index (aid2index) -> vector row (wv.key_to_index, keyed by str).
# Ids without a vector are simply absent from this mapping, so a missing id
# shows up as NaN after .map() instead of relying on str(None) not being a key.
aid_to_wv_row = {
    aid: wv.key_to_index[str(dense_index)]
    for aid, dense_index in aid2index.items()
    if str(dense_index) in wv.key_to_index
}
df_rec_authors_dataset = (
    df_rec_authors_dataset
    .assign(
        DatasetID_index=lambda frame: frame['DatasetID'].map(aid_to_wv_row),
        recommended_author_id_index=lambda frame: frame['recommended_author_id'].map(aid_to_wv_row),
    )
    # Drop pairs where either side has no vector.
    .dropna(subset=['DatasetID_index', 'recommended_author_id_index'])
    # Unmatched rows may have forced float columns; restore integer positions.
    .astype({'DatasetID_index': int, 'recommended_author_id_index': int})
)
# Dataset row position -> list of recommended author row positions.
result_dict_dataset = df_rec_authors_dataset.groupby('DatasetID_index')['recommended_author_id_index'].apply(list).to_dict()

In [77]:
# Combine the author-keyed and dataset-keyed recommendation lists; if a key
# appears in both, the dataset entry replaces the author entry.
result_dict = {
    source_row: recommended_rows
    for mapping in (result_dict, result_dict_dataset)
    for source_row, recommended_rows in mapping.items()
}

In [78]:
# Store the recommendation lists under the 'neighbors' key of the serialised
# neighbor payload.
# NOTE(review): presumably this replaces the cosine neighbors produced by
# emb.get_neighbors().to_json(); confirm that result_dict (row -> list of rows)
# matches the layout the consumer of this JSON expects.
neighbors_json['neighbors'] = result_dict

In [79]:
# Embed the neighbor payload in the projection JSON and write it to disk.
serialized['neighbors'] = neighbors_json
with open('/data/jx4237data/projects/web_page/cm4ai_tkg/work/data/tkg_ebd_34k_dataset.json', 'wt') as json_file:
    payload = json.dumps(serialized)
    json_file.write(payload)