In [1]:
import pandas as pd
import pickle
import gzip

# Load the pickled collaborator lookup; later cells index it by author ID and
# test membership in the returned value.
# NOTE(review): pickle.load can execute arbitrary code -- only use on trusted files.
with gzip.open("author_collaborators.pkl.gz", "rb") as collab_file:
    author_collab = pickle.load(collab_file)

In [3]:
# Load the pickled embedding lookup; later cells iterate it as
# (author ID, embedding vector) pairs.
# NOTE(review): pickle.load can execute arbitrary code -- only use on trusted files.
with gzip.open("author_embeddings.pkl.gz", "rb") as embedding_file:
    author_ebd = pickle.load(embedding_file)

In [5]:
# Paper/author table; the 'PubYear' and 'AID' columns are read from it later.
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
paper_author_2m = pd.read_csv(
    '/data/jx4237data/TKG/TKG_JCDL/Bridge2AI_2m/paper_author.csv.gz'
)

In [10]:
# Set of AID values from rows whose PubYear is strictly greater than 2018.
aids_with_recent_pubs = set(
    paper_author_2m.loc[paper_author_2m['PubYear'] > 2018, 'AID']
)

In [17]:
import numpy as np

def get_top_n_similar_authors(author_ebd, author_id, N=5, candidate_ids=None, collaborators=None):
    """Return the IDs of the authors most cosine-similar to ``author_id``.

    Every other key of ``author_ebd`` is a candidate if it is contained in
    ``candidate_ids`` and is not listed among the target's collaborators.

    Parameters
    ----------
    author_ebd : mapping
        Author ID -> embedding vector (anything ``np.dot`` and
        ``np.linalg.norm`` accept).
    author_id : hashable
        The target author; must be a key of ``author_ebd``.
    N : int, default 5
        Maximum number of IDs to return.
    candidate_ids : container, optional
        Author IDs eligible for recommendation. Defaults to the
        notebook-level ``aids_with_recent_pubs``.
    collaborators : mapping, optional
        Author ID -> container of that author's collaborator IDs. Defaults to
        the notebook-level ``author_collab``.

    Returns
    -------
    list
        Up to ``N`` author IDs, most similar first. Empty when the target
        embedding has zero norm (cosine similarity is undefined).

    Raises
    ------
    ValueError
        If ``author_id`` is not a key of ``author_ebd``.
    """
    if author_id not in author_ebd:
        raise ValueError(f"Author ID {author_id} not found in the author embeddings.")

    # Fall back to the notebook globals so existing three-argument calls keep working.
    if candidate_ids is None:
        candidate_ids = aids_with_recent_pubs
    if collaborators is None:
        collaborators = author_collab

    # An author without a collaborator entry simply has nobody to exclude.
    known_collaborators = collaborators.get(author_id, ())

    target_embedding = author_ebd[author_id]
    # Loop-invariant: compute the target norm once instead of per candidate.
    target_norm = np.linalg.norm(target_embedding)
    if target_norm == 0:
        return []

    similarities = []
    for other_id, other_embedding in author_ebd.items():
        if (
            other_id == author_id
            or other_id not in candidate_ids
            or other_id in known_collaborators
        ):
            continue
        other_norm = np.linalg.norm(other_embedding)
        if other_norm == 0:
            # Would divide by zero and yield NaN, which makes the sort order undefined.
            continue
        cosine_similarity = np.dot(target_embedding, other_embedding) / (target_norm * other_norm)
        similarities.append((other_id, cosine_similarity))

    # Stable descending sort: ties keep the iteration order of author_ebd.
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    return [other_id for other_id, _ in similarities[:N]]


In [None]:
import csv
import gzip
from tqdm import tqdm

import numpy as np

def get_top_n_similar_authors(author_ebd, author_id, N=5, candidate_ids=None, collaborators=None):
    """Return the IDs of the authors most cosine-similar to ``author_id``.

    Every other key of ``author_ebd`` is a candidate if it is contained in
    ``candidate_ids`` and is not listed among the target's collaborators.

    Parameters
    ----------
    author_ebd : mapping
        Author ID -> embedding vector (anything ``np.dot`` and
        ``np.linalg.norm`` accept).
    author_id : hashable
        The target author; must be a key of ``author_ebd``.
    N : int, default 5
        Maximum number of IDs to return.
    candidate_ids : container, optional
        Author IDs eligible for recommendation. Defaults to the
        notebook-level ``aids_with_recent_pubs``.
    collaborators : mapping, optional
        Author ID -> container of that author's collaborator IDs. Defaults to
        the notebook-level ``author_collab``.

    Returns
    -------
    list
        Up to ``N`` author IDs, most similar first. Empty when the target
        embedding has zero norm (cosine similarity is undefined).

    Raises
    ------
    ValueError
        If ``author_id`` is not a key of ``author_ebd``.
    """
    if author_id not in author_ebd:
        raise ValueError(f"Author ID {author_id} not found in the author embeddings.")

    # Fall back to the notebook globals so existing three-argument calls keep working.
    if candidate_ids is None:
        candidate_ids = aids_with_recent_pubs
    if collaborators is None:
        collaborators = author_collab

    # An author without a collaborator entry simply has nobody to exclude.
    known_collaborators = collaborators.get(author_id, ())

    target_embedding = author_ebd[author_id]
    # Loop-invariant: compute the target norm once instead of per candidate.
    target_norm = np.linalg.norm(target_embedding)
    if target_norm == 0:
        return []

    similarities = []
    for other_id, other_embedding in author_ebd.items():
        if (
            other_id == author_id
            or other_id not in candidate_ids
            or other_id in known_collaborators
        ):
            continue
        other_norm = np.linalg.norm(other_embedding)
        if other_norm == 0:
            # Would divide by zero and yield NaN, which makes the sort order undefined.
            continue
        cosine_similarity = np.dot(target_embedding, other_embedding) / (target_norm * other_norm)
        similarities.append((other_id, cosine_similarity))

    # Stable descending sort: ties keep the iteration order of author_ebd.
    similarities.sort(key=lambda pair: pair[1], reverse=True)
    return [other_id for other_id, _ in similarities[:N]]

# Write one (AID, Recommended AID) row per recommendation to a gzip-compressed CSV.
with gzip.open('author_recommendations.csv.gz', 'wt', newline='') as gzfile:
    writer = csv.writer(gzfile)

    # Header row
    writer.writerow(['AID', 'Recommended AID'])

    authors_without_embedding = 0
    for author_id in tqdm(author_collab.keys()):
        # get_top_n_similar_authors raises ValueError for an author that has no
        # embedding; skip those instead of aborting with a truncated output file.
        if author_id not in author_ebd:
            authors_without_embedding += 1
            continue

        # Top 30 recommended authors for the current author_id
        recommended_authors = get_top_n_similar_authors(author_ebd, author_id, N=30)

        # One CSV row per (author_id, recommended_id) pair
        writer.writerows(
            (author_id, recommended_id) for recommended_id in recommended_authors
        )

# Report skipped authors so the omission is not silent.
if authors_without_embedding:
    print(f"Skipped {authors_without_embedding} authors without an embedding.")

In [24]:
# Reload the recommendations, rename the 'Recommended AID' column, and
# overwrite the same gzip-compressed CSV with the renamed header.
df = (
    pd.read_csv('author_recommendations.csv.gz', compression='gzip')
    .rename(columns={'Recommended AID': 'recommended_author_id'})
)
df.to_csv('author_recommendations.csv.gz', index=False, compression='gzip')