<a href="https://colab.research.google.com/github/lavou/IntroPythonForDS/blob/master/3_0_Word_Embeddings_per_Author.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# In this notebook we clean the Keywords values, assign keywords to each author weighted by authorship order, and create word embeddings for each author.



###### 0. Mount, set path & load packages

In [None]:
import os
import pandas as pd
import re
from bs4 import BeautifulSoup
from gensim.models import Word2Vec
import numpy as np

In [None]:
# Mount Google Drive at /content/drive; the data paths used below live under that mount point.
# The google.colab module is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Drive folder from which the input CSV is read below
dir_path = '/content/drive/MyDrive/Pubmed_project/Dataframes'

In [None]:
# Read the dataframe that carries the GPT-3 keywords
# (the column layout -- Paper ID, Authors, Abstract, Author's Order, Number of Authors,
# Keywords, Title, Author's Name -- is visible in the df1.head() output further down)
file_path = os.path.join(dir_path, 'df_keywords_GPT3_&Authors.csv')
df1 = pd.read_csv(file_path)

###### 1. Cleaning

In [None]:
# Count the distinct values in the "Author's Name" column
print(len(df1["Author's Name"].unique()))

51


In [None]:
# List the distinct author names themselves
print(df1["Author's Name"].unique())

['Doudna JA' 'Sternberg SH' 'Harrington LB' 'Cate JH' 'Al-Shayeb B'
 'Staahl BT' 'Wiedenheft B' 'Fraser CS' 'Szostak JW' 'Wright AV'
 'Kumar GR' 'Stahl EC' 'Cofsky JC' 'Kranzusch PJ' 'Batey RT'
 'Ehrenberg AJ' 'Oakes BL' 'Floor SN' 'Mortimer SA' 'Doherty EA'
 'Bhuiya A' 'Switz NA' 'Harris AR' 'Moehle EA' 'Fletcher DA' 'Hirsh A'
 'East-Seletsky A' 'Nuñez JK' 'Kidwell MA' 'Ke A' 'MacRae IJ' 'Khalid MM'
 'Escajeda AM' 'Mok A' 'Lareau LF' 'Tan MX' 'Hsu PD' 'Lew RJ' 'Pestal K'
 'Gildea HK' 'Urnov FD' 'Giannikopoulos P' 'Lobba MJ' 'Watters KE'
 'Kornfeld JE' 'Noland CL' 'Barendregt A' 'Haurwitz RE' 'Coyle SM'
 'Hershey JW' 'Taha TY']


In [None]:
# Distinct author names as a plain Python list
the51_author_names = df1["Author's Name"].unique().tolist()
# Write the names to a text file, one name per line.
# The encoding is fixed to UTF-8 because some names contain non-ASCII characters
# (e.g. 'Nuñez JK') and open()'s default encoding depends on the platform/locale.
with open('/content/drive/MyDrive/Pubmed_project/Lists/51_author_names.txt', 'w', encoding='utf-8') as f:
    for item in the51_author_names:
        f.write(f"{item}\n")

In [None]:
# Preview the first rows of the loaded table
df1.head()

In [None]:
# Check the pandas dtype of the Keywords column
print(df1['Keywords'].dtype)

object


In [None]:
# Show the Python type of each individual Keywords value
print(df1['Keywords'].apply(type))

In [None]:
# Standardize the keywords as they are a mixture of formats (single strings, lists of strings, bulleted lists etc)

def standardize_keywords(keyword):
    """Normalize one raw ``Keywords`` value into a comma-separated string.

    Handling depends on the type of ``keyword``:

    * ``list``: the elements are joined with ``', '``. Elements that are
      themselves lists/tuples are flattened one level; any other element
      (typically a string) is kept whole.
    * ``str``: leading numbering/bullets, square brackets and single quotes
      are removed, and newline-separated numbered/bulleted items are turned
      into ``', '``-separated items. If the cleaned string is the path of an
      existing file, that file is read and its text content (parsed with
      BeautifulSoup's ``html.parser``) is returned instead.
    * any other type: returned unchanged.

    Parameters
    ----------
    keyword : list, str or any other object
        One raw value from the ``Keywords`` column.

    Returns
    -------
    str
        For ``list`` and ``str`` input. Other inputs are returned as-is.
    """
    if isinstance(keyword, list):
        flat_items = []
        for entry in keyword:
            if isinstance(entry, (list, tuple)):
                # Nested sequence: flatten one level
                flat_items.extend(str(item) for item in entry)
            else:
                # A plain element must be kept whole; iterating over a string
                # here would break it into single characters.
                flat_items.append(str(entry))
        keyword = ', '.join(flat_items)
    elif isinstance(keyword, str):
        # Remove any leading numbering or bullet points and strip leading/trailing whitespaces
        keyword = re.sub(r'^[\d\.\-\s]+', '', keyword).strip()
        # Remove any square brackets and quotes
        keyword = re.sub(r'[\[\]\']', '', keyword)
        # Replace any remaining bullet points or numbering with a comma
        keyword = re.sub(r'(\n[\d\.\-\)\s]+|\n•\s)', ', ', keyword)
        # Remove any remaining bullet points or numbering
        keyword = re.sub(r'^[\d\.\-\)\s]+', '', keyword).strip()
        # If the cleaned text happens to be the path of an existing file, use the
        # file's text content instead.
        # NOTE(review): this reads an arbitrary local file named by the data and
        # uses the platform default encoding -- confirm this branch is still wanted.
        if os.path.isfile(keyword):
            # Read the contents of the file and parse with BeautifulSoup
            with open(keyword) as f:
                contents = f.read()
            keyword = BeautifulSoup(contents, 'html.parser').get_text()
    return keyword


In [None]:
# Clean every Keywords value; the column is overwritten with the standardized result
df1['Keywords'] = df1['Keywords'].apply(standardize_keywords)

In [None]:
# Preview the table after the Keywords column has been standardized
df1.head()

Unnamed: 0,Paper ID,Authors,Abstract,Author's Order,Number of Authors,Keywords,Title,Author's Name
0,36798416,"['Taha TY', 'Chen IP', 'Hayashi JM', 'Tabata T...",Although the SARS-CoV-2 Omicron variant (BA.1)...,18,19,"SARS-CoV-2, Omicron, pGLUE",Rapid assembly of SARS-CoV-2 genomes reveals a...,Doudna JA
1,36797405,"['Yoon PH', 'Adler BA', 'Doudna JA']",,3,3,"Cas12, TnpB, CRISPR",To TnpB or not TnpB? Cas12 is the answer.,Doudna JA
2,36690762,"['Colognori D', 'Trinidad M', 'Doudna JA']",Robust and precise transcript targeting in mam...,3,3,"CRISPR, RNA, mammalian cells",Precise transcript targeting by CRISPR-Csm com...,Doudna JA
3,36656942,"['Wang JY', 'Doudna JA']",The advent of clustered regularly interspaced ...,2,2,"CRISPR, genome editing, genetic diseases.",CRISPR technology: A decade of genome editing ...,Doudna JA
4,36652483,"['Li Z', 'Zhong Z', 'Wu Z', 'Pausch P', 'Al-Sh...",Clustered regularly interspaced short palindro...,7,8,"CRISPR-CasΦ, CasΦ variants, DNA methylation, o...",Genome editing in plants using the compact edi...,Doudna JA


In [None]:
# Confirm which Python types remain in the Keywords column after cleaning
print(df1['Keywords'].apply(type).unique())

[<class 'str'>]


In [None]:
# Print the keyword value of every row, one per line (the full column, not a sample)
keywords_list = df1['Keywords'].tolist()
for keywords in keywords_list:
    print(keywords)

In [None]:
# Group the rows of df1 by author name; iterating the result yields (name, sub-dataframe) pairs
grouped_df = df1.groupby('Author\'s Name')

###### 2. Assign keywords & create word embeddings

Notes:

1. Group by the author / Concatenate keywords for every paper / Assign weights based on order of authorship.

2. Weights do not need normalization, as they sum up to 1.






In [None]:
# Group the rows of df1 by author name (rebuilds the grouping already assigned to grouped_df in an earlier cell)
grouped_df = df1.groupby('Author\'s Name')

In [None]:
# Weighted keywords per author:
# {author name: [(keyword, weight), ...]} sorted by weight, highest first
author_keywords = {}

# Loop through each author group
for name, group in grouped_df:
    # Accumulated weight of every keyword across this author's rows
    author_dict = {}

    # Only the authorship position and the keyword string of each row are needed
    for order, raw_keywords in zip(group['Author\'s Order'], group['Keywords']):
        # Weight of the paper for this author: position 1 -> 1, position 2 -> 1/2, ...
        weight = 1 / order

        # Loop through each keyword in the paper
        for keyword in raw_keywords.split(', '):
            # Normalize so that variants of one keyword are merged: drop surrounding
            # whitespace, a trailing full stop (e.g. "genetic diseases.") and letter case
            keyword = keyword.strip().rstrip('.').strip().lower()

            # Fragments that are empty after cleaning are not keywords
            if not keyword:
                continue

            author_dict[keyword] = author_dict.get(keyword, 0) + weight

    # Sort the weighted keywords by weight in descending order
    sorted_keywords = sorted(author_dict.items(), key=lambda x: x[1], reverse=True)

    # Add the sorted weighted keywords to the dictionary for the author
    author_keywords[name] = sorted_keywords

# Keyword vectors per author: {author name: model.wv of that author's Word2Vec model}
author_embeddings = {}

# Loop through each author's keywords
for name, keywords in author_keywords.items():
    # Keywords in descending weight order; they form the single training
    # "sentence" of this author's model
    keyword_list = [kw for kw, _ in keywords]

    # Relative weights, normalized so they sum to 1 for this author.
    # NOTE(review): these weights are not passed to Word2Vec, so they only
    # influence the embeddings through the keyword order above -- confirm
    # whether a weighted combination of the vectors was intended.
    total_weight = sum(w for _, w in keywords)
    weights = [w / total_weight for _, w in keywords]

    # Word2Vec cannot build a vocabulary from an empty sentence
    if not keyword_list:
        continue

    # Create a Word2Vec model for the author's keywords
    model = Word2Vec([keyword_list], min_count=1, vector_size=100)

    # Keep only the keyword vectors of the trained model
    author_embeddings[name] = model.wv


In [None]:
# Normalize each author embedding using L2 normalization
# (each row of `vectors` is divided by its own Euclidean norm)
for author, embedding in author_embeddings.items():
    # Row-wise norms; keepdims=True keeps a column shape so the division broadcasts per row
    norm = np.linalg.norm(embedding.vectors, axis=1, keepdims=True)
    # In-place division: the array held by the object stored in author_embeddings is modified
    embedding.vectors /= norm


In [None]:
# Create a list of author names and embeddings
# (parallel lists: position i in both lists refers to the same author)
author_names = []
embeddings = []
for author, embedding in author_embeddings.items():
    author_names.append(author)
    # The whole per-author embedding object is stored, not a single vector
    embeddings.append(embedding)

# Create a dataframe with the author names and embeddings
# (one row per author; the 'Embedding' cells hold the objects collected above)
embedding_keywords_per_author_df = pd.DataFrame({'Author': author_names, 'Embedding': embeddings})

In [None]:
# Preview the first rows of the author/embedding table
embedding_keywords_per_author_df.head()

Unnamed: 0,Author,Embedding
0,Al-Shayeb B,"[[-0.009473834, 0.004190257, 0.090214565, 0.15..."
1,Barendregt A,"[[-0.009471502, 0.0041983225, 0.09022657, 0.15..."
2,Batey RT,"[[-0.009605538, 0.004423238, 0.09005795, 0.159..."
3,Bhuiya A,"[[-0.009704356, 0.004596475, 0.090265684, 0.15..."
4,Cate JH,"[[-0.009663561, 0.0043093963, 0.090165645, 0.1..."


In [None]:
# Number of rows (authors) in the author/embedding table
len(embedding_keywords_per_author_df)

51

In [None]:
# Number of authors that have a weighted keyword list
len(author_keywords)

51

In [None]:
# Number of distinct authors, i.e. the number of groups in the author-name grouping
num_authors = grouped_df.ngroups
print(num_authors)

51


In [None]:
# Write the author/embedding table to the project folder on Drive, without the index column.
# NOTE(review): the 'Embedding' cells hold the per-author objects taken from
# author_embeddings, so the CSV receives their text form -- confirm the file
# can be read back as numeric vectors.
save_dir = "/content/drive/MyDrive/Pubmed_project"
save_file_path = f"{save_dir}/1_Keyword_Embeddings.csv"
embedding_keywords_per_author_df.to_csv(save_file_path, index=False)

In [None]:
# Check the type of the object that is being saved
type(embedding_keywords_per_author_df)

pandas.core.frame.DataFrame

In [None]:
# Write the same table again under a second file name; sep=',' is also to_csv's default separator
embedding_keywords_per_author_df.to_csv('/content/drive/MyDrive/Pubmed_project/2_Keyword_Embeddings.csv', index=False, sep=',')

In [None]:
# Save a raw vector matrix as comma-separated numbers.
# NOTE(review): `embedding` is not assigned in this cell; it is whatever an earlier
# `for author, embedding in author_embeddings.items()` loop left bound, i.e. the last
# author iterated. Only that one author's vectors are written -- confirm this is intended.
embedding_array = embedding.vectors
np.savetxt('/content/drive/MyDrive/Pubmed_project/3_Keyword_Embeddings.csv', embedding_array, delimiter=',')

###### **NOTE:** Although Doudna JA (the central author for our analysis) has 346 co-authors in total, and although we have already selected the 51 most relevant to her based on the frequency of their shared publications, we end up with only 51 authors here. That is because some of these authors are not included in the data we collected from PubMed. Disregard this and continue the process with these 51 for now.
