<a href="https://colab.research.google.com/github/lavou/ResearchMate/blob/main/2_0_Keywords_Extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#In this notebook we process the previously created authors' dataframes and then use the GPT-3 API to extract the topic of each paper in the form of keywords.




######0. Mount, set path and load packages

In [None]:
import ast
import os
import re
import time

import openai
import pandas as pd

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the CSV files
# referenced by the paths below can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Directory on the mounted Drive that holds the project's CSV dataframes
dir_path = '/content/drive/MyDrive/Pubmed_project/Dataframes'

# Full path of the input CSV (os.path.join inserts the path separator)
file_path = os.path.join(dir_path, '100_Authors_df_with_Doudna.csv')

# Read the CSV into a DataFrame
df = pd.read_csv(file_path)

# Print the first five rows as a quick check of the columns and values
print(df.head())

   Unnamed: 0  Paper ID                                            Authors  \
0           0  36798416  ['Taha TY', 'Chen IP', 'Hayashi JM', 'Tabata T...   
1           1  36797405               ['Yoon PH', 'Adler BA', 'Doudna JA']   
2           2  36690762         ['Colognori D', 'Trinidad M', 'Doudna JA']   
3           3  36656942                           ['Wang JY', 'Doudna JA']   
4           4  36652483  ['Li Z', 'Zhong Z', 'Wu Z', 'Pausch P', 'Al-Sh...   

                                            Abstract  Author's Order  \
0  Although the SARS-CoV-2 Omicron variant (BA.1)...              18   
1                                                NaN               3   
2  Robust and precise transcript targeting in mam...               3   
3  The advent of clustered regularly interspaced ...               2   
4  Clustered regularly interspaced short palindro...               7   

   Number of Authors                                           Keywords  \
0                 19   

######1. Preprocessing and cleaning 
---




In [None]:
# (rows, columns) of the dataframe as loaded, before any cleaning
print(df.shape)

(3448, 8)


In [None]:
# Keep only the rows whose "Author's Order" is not -1.
# NOTE(review): -1 presumably marks papers where the author's position could
# not be determined - confirm against the notebook that builds this CSV.
# .copy() makes the filtered result an independent DataFrame, so later
# modifications of `df` do not operate on a slice of the loaded frame
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[df["Author's Order"] != -1].copy()

In [None]:
# Row count after removing the rows with "Author's Order" == -1
len(df)

2999

In [None]:
# Count the rows with a missing Title. The equivalent checks for Abstract and
# Keywords are kept below, commented out.
print(df['Title'].isna().sum())
#df['Abstract'].isna().sum()
#df['Keywords'].isna().sum()


4


In [None]:
# Select the rows with a missing Title and print them to see which other
# fields they lack
missing_title = df[df['Title'].isna()]
print(missing_title)

      Unnamed: 0  Paper ID                                            Authors  \
1512           2  36745728  ['Fulop NJ', 'Ramsay AIG', 'Vindrola-Padros C'...   
2563           2  28846217         ['Khalid MM', 'Galuska MA', 'Hamilton RJ']   
2565           4  30725648  ['Ershad M', 'Meredith A', 'Shah N', 'Khalid MM']   
2567           6  28613681                          ['Khalid MM', 'Waseem M']   

     Abstract  Author's Order  Number of Authors Keywords Title  
1512      NaN              15                 26      NaN   NaN  
2563      NaN               1                  3      NaN   NaN  
2565      NaN               4                  4      NaN   NaN  
2567      NaN               1                  2      NaN   NaN  


In [None]:
# Four papers have no title at all. A row without a title carries no
# information about the paper apart from its ID and author list, so drop them.
# Rebinding (instead of inplace=True) returns a new frame, which avoids
# modifying a filtered slice in place and the SettingWithCopyWarning that causes.
df = df.dropna(subset=['Title'])

In [None]:
# Row count after removing the papers without a title
len(df)

2995

In [None]:
# Preview the first five rows of the frame after the row filters
df.head()

In [None]:
# Inspect the column names of the dataframe
cols = df.columns
print(cols)

Index(['Unnamed: 0', 'Paper ID', 'Authors', 'Abstract', 'Author's Order',
       'Number of Authors', 'Keywords', 'Title'],
      dtype='object')


In [None]:
# Drop the 'Unnamed: 0' column.
# NOTE(review): presumably this is the row index written out by an earlier
# to_csv call - it holds no paper information.
# errors='ignore' keeps the cell re-runnable (no KeyError once the column is
# gone); rebinding replaces inplace=True so no filtered slice is mutated.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Print the cleaned frame (pandas abbreviates the output to the first and last rows)
print(df)

      Paper ID                                            Authors  \
0     36798416  ['Taha TY', 'Chen IP', 'Hayashi JM', 'Tabata T...   
1     36797405               ['Yoon PH', 'Adler BA', 'Doudna JA']   
2     36690762         ['Colognori D', 'Trinidad M', 'Doudna JA']   
3     36656942                           ['Wang JY', 'Doudna JA']   
4     36652483  ['Li Z', 'Zhong Z', 'Wu Z', 'Pausch P', 'Al-Sh...   
...        ...                                                ...   
3443  29682593  ['Hanigan TW', 'Danes JM', 'Taha TY', 'Frasor ...   
3444  29080240  ['Abdelkarim H', 'Neelarapu R', 'Madriaga A', ...   
3445  29045501  ['Hanigan TW', 'Taha TY', 'Aboukhatwa SM', 'Fr...   
3446  28943357  ['Hanigan TW', 'Aboukhatwa SM', 'Taha TY', 'Fr...   
3447  28835796  ['Taha TY', 'Aboukhatwa SM', 'Knopp RC', 'Ikeg...   

                                               Abstract  Author's Order  \
0     Although the SARS-CoV-2 Omicron variant (BA.1)...              18   
1                    

In [None]:
# Save the cleaned frame (keywords not yet generated) into the same Drive
# directory the input was read from; index=False keeps the row index out of the CSV.
output_file_path = os.path.join(dir_path, 'cleaned_df_no_keywords.csv')
df.to_csv(output_file_path, index=False)

######2. Keywords extraction - GPT
---





In [None]:
# Percentage of rows without any keywords:
# count the NaN entries in 'Keywords' and divide by the total number of rows
num_no_keywords = df['Keywords'].isna().sum()
num_total = len(df)
pct_no_keywords = num_no_keywords / num_total * 100

# Report the share, rounded to two decimals
print(f"{pct_no_keywords:.2f}% of rows do not have any keywords.")

77.20% of rows do not have any keywords.


*GPT-3 prompt* 

In [None]:
#os.environ['OPENAI_API_KEY'] = "your key"

In [None]:
#openai.api_key = "insert key here"

In [None]:
# Extract the most important keywords from a text (the abstract if available, otherwise the title)

def generate_keywords(text, num_keywords=3, engine="text-davinci-002", temperature=0.7):
    """Ask an OpenAI completion model for the main keywords of a biomedical text.

    Parameters
    ----------
    text : str
        The abstract or title to extract keywords from.
    num_keywords : int, default 3
        Number of keywords requested in the prompt and the maximum number returned.
    engine : str, default "text-davinci-002"
        Completion engine passed to ``openai.Completion.create``.
    temperature : float, default 0.7
        Sampling temperature passed to ``openai.Completion.create``.

    Returns
    -------
    list of str
        At most ``num_keywords`` cleaned keywords. Empty when ``text`` is
        missing/blank (no API call is made) or when the reply has no keywords.
    """
    # A missing abstract/title arrives as NaN (a float): do not spend an API
    # call asking the model to summarise the literal text "nan".
    if not isinstance(text, str) or not text.strip():
        return []

    prompt = f"Given the following biomedical/medical text, extract {num_keywords} important keywords capturing the main topics of the text:\n\n{text}\n\nKeywords:"
    response = openai.Completion.create(
        engine=engine,
        prompt=prompt,
        max_tokens=50,
        n=1,
        stop=None,
        temperature=temperature,
    )

    raw_reply = response.choices[0].text

    # The model does not always answer "a, b, c": it may put one keyword per
    # line, number them ("1. ...") or use bullets ("- ..."). Split on
    # comma+whitespace, semicolons and newlines, then strip list markers.
    # A comma NOT followed by whitespace is left alone (e.g. "1,25-dihydroxy...").
    keywords = []
    for piece in re.split(r",\s+|[;\n]+", raw_reply):
        cleaned = re.sub(r"^\s*(?:[-*\u2022]|\d+[.)])\s+", "", piece)
        cleaned = cleaned.strip().strip(",").strip()
        if cleaned:
            keywords.append(cleaned)
    return keywords[:num_keywords]


In [None]:
# Iterate over the df and generate keywords for the rows that have none

def generate_keywords_for_missing_rows(df, delay=1, max_retries=3):
    """Fill the empty 'Keywords' cells of ``df`` with generated keywords.

    For every row whose 'Keywords' value is missing, the abstract (or the
    title when the abstract is missing) is passed to ``generate_keywords`` and
    the result is stored as a ', '-joined string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'Keywords', 'Abstract' and 'Title' columns. It is modified
        IN PLACE; progress made before an error is therefore kept in ``df``.
    delay : float, default 1
        Seconds to wait after each API call; also the base of the retry back-off.
    max_retries : int, default 3
        How many times a failing call is retried before the error is re-raised.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for index, row in df.iterrows():
        if pd.notna(row['Keywords']):
            continue

        text = row['Abstract'] if pd.notna(row['Abstract']) else row['Title']
        if pd.isna(text):
            # Neither abstract nor title: nothing to send to the model.
            print(f"Skipping index {index}: no abstract or title available")
            continue

        keywords = None
        for attempt in range(max_retries + 1):
            try:
                keywords = generate_keywords(text)
                break
            except Exception as exc:
                # Transient API errors (e.g. rate limits) should not abort a
                # run of thousands of calls; give up only after max_retries.
                if attempt == max_retries:
                    raise
                wait = delay * 2 ** (attempt + 1)
                print(f"Attempt {attempt + 1} for index {index} failed ({exc}); retrying in {wait}s")
                time.sleep(wait)

        if keywords:
            df.at[index, 'Keywords'] = ', '.join(keywords)
            print(f"Generated keywords for index {index}: {', '.join(keywords)}")  # Print the generated keywords for debugging
        else:
            # Leave the cell missing so a later run can try this row again.
            print(f"No keywords returned for index {index}")
        time.sleep(delay)  # Add a delay between API calls
    return df

In [None]:
# Generate keywords for every row that lacks them. The function writes into
# `df` itself and returns that same object, so `df_with_keywords` and `df`
# are two names for one DataFrame.
df_with_keywords = generate_keywords_for_missing_rows(df)

In [None]:
# Save the frame with the generated keywords. Note the target is the project
# root folder, not the 'Dataframes' sub-directory used for the other CSVs.
df_with_keywords.to_csv("/content/drive/MyDrive/Pubmed_project/df_keywords_GPT3.csv", index=False)

In [None]:
# Row count of the frame after keyword generation
len(df_with_keywords)

2995

In [None]:
# Row count of `df`, for comparison with df_with_keywords
len(df)

2995

###### extra: Add a single-author column - Run this only if you want to run GPT-3 from scratch. Otherwise there is no need; the output is already saved as a csv in the 'Dataframes' dir.
---




In [None]:
# Extract a single author's name from a paper's author list, based on the order of authorship.

def get_author_name(authors, order):
    """Return the author at 1-based position ``order`` in ``authors``.

    Parameters
    ----------
    authors : list, tuple or str
        Either a sequence of names or its string form as stored in the CSV,
        e.g. "['Yoon PH', 'Adler BA', 'Doudna JA']".
    order : int
        1-based position of the wanted author.

    Returns
    -------
    str or None
        The author's name, or None when ``authors`` is missing/unparseable or
        ``order`` is not an integer in ``1..len(authors)``.
    """
    if isinstance(authors, str):
        try:
            # Parse the list literal properly so names containing apostrophes
            # or commas (e.g. "O'Brien K") survive intact.
            parsed = ast.literal_eval(authors)
        except (ValueError, SyntaxError, TypeError):
            # Not a valid Python literal: fall back to a plain comma split.
            parsed = [name.strip().strip("'\"") for name in authors.strip("[]").split(", ")]
        authors = [parsed] if isinstance(parsed, str) else parsed

    # NaN or any other non-sequence value carries no author information.
    if not isinstance(authors, (list, tuple)):
        return None

    try:
        position = int(order)
    except (TypeError, ValueError, OverflowError):
        return None  # NaN, None, inf or non-numeric order

    # Reject fractional orders and anything outside 1..len(authors); without
    # the lower bound, 0 or -1 would silently wrap around to the last authors.
    if position != order or not 1 <= position <= len(authors):
        return None
    return authors[position - 1]

# Build the "Author's Name" column: for each paper, look up the name found at
# the position given by "Author's Order" in that paper's author list.
df_with_keywords["Author's Name"] = [
    get_author_name(author_list, position)
    for author_list, position in zip(
        df_with_keywords['Authors'], df_with_keywords["Author's Order"]
    )
]

In [None]:
# Print the new column to check the extracted names
print(df_with_keywords["Author's Name"])

0       Doudna JA
1       Doudna JA
2       Doudna JA
3       Doudna JA
4       Doudna JA
          ...    
3443      Taha TY
3444      Taha TY
3445      Taha TY
3446      Taha TY
3447      Taha TY
Name: Author's Name, Length: 2995, dtype: object


In [None]:
# Row count after adding the "Author's Name" column
len(df_with_keywords)

2995

In [None]:
# Preview the first ten rows, now including the "Author's Name" column
df_with_keywords.head(10)

In [None]:
# Save the frame, now including the "Author's Name" column, to the project
# root folder on Drive; index=False keeps the row index out of the CSV.
df_with_keywords.to_csv("/content/drive/MyDrive/Pubmed_project/df_keywords_GPT3_&Authors.csv", index=False)