# todo list:
Lets first get the embeddings done, then work on a bit metadata.
- update projected_embedding_ggvec_top40_4d_order2.json (author embeddings).
    - consider neiborhoods. neibors are thoese who have ever collaborated.
    - author embeddings are aggreated from paper embeddings.
    - we can do 30K authors.
    
- update processed-metadata.csv, row id is author id, with author info.
    - (Author info should be enriched, now we can have Number of pubmed papers.)


In [20]:
import pandas as pd
paper_author = pd.read_csv('/data/jx4237data/TKG/TKG_JCDL/Bridge2AI_2m/paper_author.csv.gz')

In [38]:
paper_author40k = pd.read_csv('/data/jx4237data/TKG/TKG_JCDL/Bridge2AI_10k/paper_author.csv.gz')

In [31]:
# filter paper_author by only keep the rows that have AID in paper_author40k
paper_author = paper_author[paper_author['AID'].isin(paper_author40k['AID'])]
paper_author.nunique()

Unnamed: 0,PMID,Au_Order,AID,AuthorNum,PubYear
0,946486,1,6494255,3,1976
1,946486,2,100000086,3,1976
2,946486,3,7066054,3,1976
3,1077385,1,1644286,2,1976
4,1077385,2,100000021,2,1976
...,...,...,...,...,...
95417,38088622,4,26198780,8,2023
95418,38088622,5,26198781,8,2023
95419,38088622,6,11949134,8,2023
95420,38088622,7,26198782,8,2023


In [None]:
# read /data/jx4237data/Graph-CoT/Pipeline/updated_data/tkg_embeddings_all.npz, this is a npz file
import numpy as np
# read the .npz file
tkg_embeddings_all = np.load('/data/jx4237data/Graph-CoT/Pipeline/updated_data/tkg_embeddings_all.npz')

# access the data from the .npz file
embeddings = tkg_embeddings_all['embeddings']
ids = tkg_embeddings_all['ids']

In [None]:
# check how many PMIDs in paper_author['PMID'] (int) are in ids (str), convert both into str
# list pmids in paper_author['PMID'] but not in ids
pmids = paper_author['PMID'].astype(str)
ids = [str(i) for i in ids]
pmids_not_in_ids = list(set(pmids) - set(ids))

In [None]:
import pandas as pd
from tqdm import tqdm
import pandas as pd
from tqdm import tqdm

# read /data/jx4237data/projects/get_methods/pubmed24_title_abstract.csv.gz all cols are strings \
# only keep the rows with PMID in pmids_not_in_ids, use tqdm to show progress
# Define the file path
file_path = '/data/jx4237data/projects/get_methods/pubmed24_title_abstract.csv.gz'

# Define the chunk size
chunk_size = 100000

# Create an empty list to store the filtered chunks
filtered_chunks = []

# Iterate over the file in chunks
for chunk in tqdm(pd.read_csv(file_path, chunksize=chunk_size)):
    # Filter the chunk based on PMID
    filtered_chunk = chunk[chunk['PMID'].astype(str).isin(pmids_not_in_ids)]
    
    # Append the filtered chunk to the list
    filtered_chunks.append(filtered_chunk)
    
# Concatenate the filtered chunks into a single dataframe
filtered_df = pd.concat(filtered_chunks)

# Free memory by deleting the chunks
del filtered_chunks

# Print the filtered dataframe
filtered_df

In [None]:
import os

# Set the TOKENIZERS_PARALLELISM environment variable to false to avoid warnings and potential deadlocks
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set visible GPUs before importing PyTorch
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from transformers import AutoTokenizer
from adapters import AutoAdapterModel
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('allenai/specter2_aug2023refresh_base')
model = AutoAdapterModel.from_pretrained('allenai/specter2_aug2023refresh_base')

# Load the adapter(s) as per the required task, provide an identifier for the adapter in load_as argument and activate it
model.load_adapter("allenai/specter2_aug2023refresh", source="hf", load_as="proximity", set_active=True)

# Prepare new documents and IDs
new_docs = []
new_ids = []
for index, row in filtered_df.iterrows():
    title = row['Title']
    abstract = row['Abstract']
    pmid = row['PMID']
    new_docs.append(str(title) + tokenizer.sep_token + str(abstract))
    new_ids.append(pmid)

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

batch_size = 100  # Define an appropriate batch size
all_embeddings = []
all_ids = []

class DocumentDataset(Dataset):
    def __init__(self, docs, ids):
        self.docs = docs
        self.ids = ids

    def __len__(self):
        return len(self.docs)

    def __getitem__(self, idx):
        return self.docs[idx], self.ids[idx]

dataset = DocumentDataset(new_docs, new_ids)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# Function to process a single batch
def process_batch(batch):
    docs, ids = batch
    inputs = tokenizer(list(docs), padding=True, truncation=True,
                       return_tensors="pt", return_token_type_ids=False, max_length=512)
    inputs = inputs.to(device)  # Move inputs to GPU
    with torch.no_grad():  # Disable gradient calculations
        output = model(**inputs)
    batch_embeddings = output.last_hidden_state[:, 0, :].cpu()  # Move embeddings back to CPU
    return batch_embeddings, ids

# Process the dataset in batches
for batch in tqdm(dataloader):
    batch_embeddings, batch_ids = process_batch(batch)
    
    all_embeddings.append(batch_embeddings)
    all_ids.extend(batch_ids)

new_embeddings = torch.cat(all_embeddings, dim=0).numpy()
new_ids = all_ids

print("Embeddings have been generated")

In [None]:
new_ids_str = [str(tensor.tolist()) for tensor in new_ids]

In [None]:
len(new_ids_str)

In [None]:
new_embeddings_dict = {id_: embedding for id_, embedding in zip(new_ids_str, new_embeddings)}
# store this dict in a comressed file
import numpy as np
np.savez_compressed('/data/jx4237data/TKG/TKG_JCDL/Bridge2AI_10k/tkg_updated_360k_embeddings_all.npz', embeddings=new_embeddings, ids=new_ids)

In [None]:
# we have new_embeddings, new_ids, ids, embeddings, we need to concatenate them. 
# and get a dict with key as PMID and value as embedding (for both embeddings and new_embeddings, get one embedding for each PMID)
# Create a dictionary with PMID as key and embedding as value
embeddings_dict = {id_: embedding for id_, embedding in zip(ids, embeddings)}
# new_embeddings_dict = {str(id_): embedding for id_, embedding in zip(new_ids, new_embeddings)}
combined_embeddings_dict = {**embeddings_dict, **new_embeddings_dict}

In [None]:
# now based on the combined_embeddings_dict, (it represent each paper's embedding),
# we need to get the embedding for each author, by a weighted sum of the embeddings of the papers that the author has written
# paper_author has PMID, AID, Au_Order, AuthorNum
# our weighted strategy is like this:
# for author aggregation:
# author rank - weights
# 1st - 1
# last - 1
# 2nd - 1/2
# 3rd - 1/3
# 4th - 1/4
# ... no less than 1/10
# Write the code:
import numpy as np
import pandas as pd
from tqdm import tqdm

def get_author_embeddings(combined_embeddings_dict, paper_author):
    author_embeddings = {}
    author_weights = {}

    for index, entry in tqdm(paper_author.iterrows(), total=paper_author.shape[0], desc="Processing authors"):
        pmid = str(entry['PMID'])
        aid = entry['AID']
        au_order = entry['Au_Order']
        author_num = entry['AuthorNum']

        # Get the embedding for the paper
        paper_embedding = combined_embeddings_dict[pmid]

        # Calculate the weight based on the author's order
        if au_order == 1 or au_order == author_num:
            weight = 1
        else:
            weight = max(1 / au_order, 1 / 10)

        # Initialize the author's embedding and weight if not already done
        if aid not in author_embeddings:
            author_embeddings[aid] = np.zeros_like(paper_embedding)
            author_weights[aid] = 0

        # Update the author's cumulative embedding and weight
        author_embeddings[aid] += weight * paper_embedding
        author_weights[aid] += weight

    # Normalize the embeddings by the sum of weights
    for aid in author_embeddings:
        author_embeddings[aid] /= author_weights[aid]

    return author_embeddings

author_embeddings = get_author_embeddings(combined_embeddings_dict, paper_author)

In [None]:
import gzip
import pickle

# Save the dictionary to a .gz file
with gzip.open('author_embeddings.pkl.gz', 'wb') as f:
    pickle.dump(author_embeddings, f, protocol=pickle.HIGHEST_PROTOCOL)

# Load the dictionary from the .gz file
with gzip.open('author_embeddings.pkl.gz', 'rb') as f:
    author_embeddings = pickle.load(f)

In [None]:
# Load the dictionary from the .gz file
with gzip.open('author_embeddings.pkl.gz', 'rb') as f:
    author_embeddings = pickle.load(f)

len(author_embeddings)

In [None]:
author_embeddings

# now based on the paper_author, let find neibor authors for each author.

In [None]:
# paper_author is a dataframe, has PMID, AID, Au_Order, AuthorNum, 
# now based on the paper_author, let find neibor authors for each author.
# Define a function to get the neighboring authors for each author
import pandas as pd
from collections import defaultdict

def find_neighbors(paper_author):
    # Initialize a dictionary to store neighbors for each author
    author_neighbors = defaultdict(set)

    # Group the dataframe by PMID
    grouped = paper_author.groupby('PMID')

    # Iterate through each group (each paper)
    for pmid, group in grouped:
        # Get the list of authors for this paper
        authors = group['AID'].tolist()

        # For each author, add the other authors as neighbors
        for author in authors:
            neighbors = set(authors) - {author}
            author_neighbors[author].update(neighbors)

    # Convert sets to lists for the final output
    author_neighbors = {author: list(neighbors) for author, neighbors in author_neighbors.items()}

    return author_neighbors

author_neighbors = find_neighbors(paper_author)

In [None]:
new_author_neighbors = {}
for author,neighbors in author_neighbors.items():
    if 0 in neighbors:
        neighbors.remove(0)
    new_author_neighbors[author] = neighbors

In [None]:
# Save the dictionary to a .gz file
with gzip.open('author_collaborators.pkl.gz', 'wb') as f:
    pickle.dump(new_author_neighbors, f, protocol=pickle.HIGHEST_PROTOCOL)

# Load the dictionary from the .gz file
with gzip.open('author_collaborators.pkl.gz', 'rb') as f:
    author_neighbors = pickle.load(f)

# metadata

In [4]:
import pandas as pd
# read /data/jx4237data/TKG/TKG_JCDL/Bridge2AI_10k/Authors.csv.gz
authors = pd.read_csv('/data/jx4237data/TKG/TKG_JCDL/Bridge2AI_10k/Authors.csv')

In [6]:
authors = authors[['AID', 'FullName', 'BeginYear', 'PaperNum','CM4AI','RecentYear']]

In [16]:
authors = authors[(authors['PaperNum'] > 2) & (authors['RecentYear'] > 2020)]
authors

Unnamed: 0,AID,FullName,BeginYear,PaperNum,CM4AI,RecentYear
0,225,Baba Inusa,2005,102,,2024
1,878,Rutger Schutten,2017,5,,2022
3,1098,Vardit Ravitsky (CM4AI),2002,192,1.0,2023
4,1352,Ursula Grohmann,1988,158,,2023
5,1592,Sharon R Lewin,1993,373,,2023
...,...,...,...,...,...,...
44678,843771129831,Jillian Mohan (CM4AI),2012,37,1.0,2024
44679,83284832748327,Jiawei Xu (CM4AI),2021,5,1.0,2024
44680,873811147837,Swathi Thaker (CM4AI),2005,23,1.0,2024
44681,23553111988,Xiaoyu Zhao (CM4AI),2010,11,1.0,2024


In [17]:
# authors['BeginYear'] and authors['PaperNum'] are float64, convert them to int
authors['BeginYear'] = authors['BeginYear'].astype(int)
authors['PaperNum'] = authors['PaperNum'].astype(int)
authors

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  authors['BeginYear'] = authors['BeginYear'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  authors['PaperNum'] = authors['PaperNum'].astype(int)


Unnamed: 0,AID,FullName,BeginYear,PaperNum,CM4AI,RecentYear
0,225,Baba Inusa,2005,102,,2024
1,878,Rutger Schutten,2017,5,,2022
3,1098,Vardit Ravitsky (CM4AI),2002,192,1.0,2023
4,1352,Ursula Grohmann,1988,158,,2023
5,1592,Sharon R Lewin,1993,373,,2023
...,...,...,...,...,...,...
44678,843771129831,Jillian Mohan (CM4AI),2012,37,1.0,2024
44679,83284832748327,Jiawei Xu (CM4AI),2021,5,1.0,2024
44680,873811147837,Swathi Thaker (CM4AI),2005,23,1.0,2024
44681,23553111988,Xiaoyu Zhao (CM4AI),2010,11,1.0,2024


In [18]:
authors_reset = authors.reset_index(drop=True)


In [19]:
authors_reset.rename(columns={'AID': 'id'}, inplace=True)

In [20]:
authors_reset['Index'] = authors_reset.index

In [21]:
authors_reset.to_csv('/data/jx4237data/projects/web_page/talentKnowledgeGraph/work/data/processed_authors_more_than_single_recent_pub.csv', index=False)

In [5]:
authors_reset = pd.read_csv('/data/jx4237data/projects/web_page/talentKnowledgeGraph/work/data/processed_authors_more_than_single.csv')

In [2]:
anime = pd.read_csv('/data/jx4237data/projects/web_page/talentKnowledgeGraph/work/data/processed-metadata.csv')

In [3]:
import numpy as np
import pandas as pd

# Calculate the transformed rating count
transformed_rating_count = ((anime['rating_count']**0.72) / 9000 + 0.14)

# Describe the transformed rating count
description = transformed_rating_count.describe()

# Calculate additional percentiles
percentiles = transformed_rating_count.quantile([0.60, 0.80, 0.90, 0.95])

# Combine the description and percentiles
description = pd.concat([description, percentiles])

# Format the description to 2 decimal places
formatted_description = description.apply(lambda x: format(x, '.2f'))

formatted_description

count    9432.00
mean        0.23
std         0.17
min         0.14
25%         0.15
50%         0.16
75%         0.22
max         2.48
0.6         0.18
0.8         0.25
0.9         0.37
0.95        0.52
Name: rating_count, dtype: object

In [12]:
import numpy as np
import pandas as pd

# Calculate the transformed rating count
transformed_rating_count = ((authors_reset['PaperNum'])**.7 / 6 + 0.14)

# Describe the transformed rating count
description = transformed_rating_count.describe()

# Calculate additional percentiles
percentiles = transformed_rating_count.quantile([0.60, 0.80, 0.90, 0.95])

# Combine the description and percentiles
description = pd.concat([description, percentiles])

# Format the description to 2 decimal places
formatted_description = description.apply(lambda x: format(x, '.2f'))

formatted_description

count    34638.00
mean         3.43
std          3.29
min          0.41
25%          1.20
50%          2.31
75%          4.53
max         39.63
0.6          3.00
0.8          5.25
0.9          7.64
0.95         9.96
Name: PaperNum, dtype: object

In [7]:
import numpy as np
import pandas as pd

# Calculate the transformed rating count
transformed_rating_count = authors_reset['PaperNum']

# Describe the transformed rating count
description = transformed_rating_count.describe()

# Calculate additional percentiles
percentiles = transformed_rating_count.quantile([0.60, 0.80, 0.90, 0.95])

# Combine the description and percentiles
description = pd.concat([description, percentiles])

# Format the description to 2 decimal places
formatted_description = description.apply(lambda x: format(x, '.2f'))

formatted_description

count    34638.00
mean        88.91
std        135.61
min          2.00
25%         14.00
50%         39.00
75%        107.00
max       2468.00
0.6         58.00
0.8        133.00
0.9        230.00
0.95       338.00
Name: PaperNum, dtype: object

In [43]:
import numpy as np
import pandas as pd

# Calculate the transformed rating count
transformed_rating_count = (authors['BeginYear'])

# Describe the transformed rating count
description = transformed_rating_count.describe()

# Calculate additional percentiles
percentiles = transformed_rating_count.quantile([0.02, 0.05, .1,.15, .2 , 0.60, 0.80, 0.90, 0.95])

# Combine the description and percentiles
description = pd.concat([description, percentiles])

# Format the description to 2 decimal places
formatted_description = description.apply(lambda x: format(x, '.2f'))

formatted_description

count    44673.00
mean      2005.32
std         13.57
min       1804.00
25%       1997.00
50%       2007.00
75%       2016.00
max       2024.00
0.02      1974.00
0.05      1980.00
0.1       1986.00
0.15      1991.00
0.2       1994.00
0.6       2011.00
0.8       2019.00
0.9       2022.00
0.95      2023.00
Name: BeginYear, dtype: object