## Remove Non-English documents & perform second round of cleaning (using Title and Abstract)

In [1]:
import os

import pandas as pd
from langdetect import detect

In [2]:
# Directory holding the processed data files.
# Set the LLM_DATA_EXPORT_DIR environment variable to point the notebook at a
# different location; when it is unset or empty, the original local Google
# Drive folder is used.
# os.path.join(..., "") guarantees a trailing separator, so both
# f'{PATH_data_export}/name' and f'{PATH_data_export}name' resolve to a file
# inside this directory.
PATH_data_export = os.path.join(
    os.environ.get("LLM_DATA_EXPORT_DIR")
    or "/Users/huiziyu/Library/CloudStorage/GoogleDrive-huiziy@g.ucla.edu/My Drive/Project - LLM in Biomedical & Health/new_data/processed/",
    "",
)

In [3]:
# Read output_data.csv from the export directory into a DataFrame.
df = pd.read_csv(PATH_data_export + '/output_data.csv')

In [4]:
# Replace missing values with "" so the string concatenation below never yields NaN.
df = df.fillna("")

# Keep only title, DOI and abstract, then build a combined text field.
# Title and abstract are used together because the focus is on sentence-level,
# context-aware information rather than on keywords.
abstract_df = (
    df.loc[:, ['display_name', 'doi', 'abstract_text']]
    .reset_index(drop=True)
    .assign(
        Title_abstract=lambda frame: frame["display_name"] + ". " + frame["abstract_text"]
    )
)
# Title-only alternative (disabled):
# abstract_df["Title_abstract"] = abstract_df["display_name"]

In [5]:
# Display the frame with the combined Title_abstract column.
abstract_df

Unnamed: 0,display_name,doi,abstract_text,Title_abstract
0,Performance of ChatGPT on USMLE: Potential for...,https://doi.org/10.1371/journal.pdig.0000198,We evaluated the performance of a large langua...,Performance of ChatGPT on USMLE: Potential for...
1,How Does ChatGPT Perform on the United States ...,https://doi.org/10.2196/45312,Background Chat Generative Pre-trained Transfo...,How Does ChatGPT Perform on the United States ...
2,Performance of ChatGPT on USMLE: Potential for...,https://doi.org/10.1101/2022.12.19.22283643,ABSTRACT We evaluated the performance of a lar...,Performance of ChatGPT on USMLE: Potential for...
3,"ChatGPT Utility in Healthcare Education, Resea...",https://doi.org/10.3390/healthcare11060887,ChatGPT is an artificial intelligence (AI)-bas...,"ChatGPT Utility in Healthcare Education, Resea..."
4,ChatGPT and the rise of large language models:...,https://doi.org/10.3389/fpubh.2023.1166120,Large Language Models (LLMs) have recently gat...,ChatGPT and the rise of large language models:...
...,...,...,...,...
8755,Effect of Ramadan intermittent fasting on rena...,https://doi.org/10.1016/j.hnm.2023.200227,Intermittent fasting (IF) has garnered attenti...,Effect of Ramadan intermittent fasting on rena...
8756,Equity in Scientific Publishing: Can Artificia...,https://doi.org/10.1016/j.mcpdig.2023.10.002,Chat Generative Pre-Trained Transformer (ChatG...,Equity in Scientific Publishing: Can Artificia...
8757,Predicting seizure recurrence from medical rec...,https://doi.org/10.1016/s2589-7500(23)00205-4,Physicians document a wealth of helpful inform...,Predicting seizure recurrence from medical rec...
8758,ChatGPT (Generated Pre-Trained Transformer) As...,https://doi.org/10.23937/2572-4037.1510062,Mental health disorders affect one in four peo...,ChatGPT (Generated Pre-Trained Transformer) As...


In [6]:
from langdetect import detect, LangDetectException

def safe_detect(text):
    """Detect the language of ``text``, returning ``""`` when none can be determined.

    Parameters
    ----------
    text : str
        Text to classify. Values that are not strings (for example ``None`` or
        a float ``NaN`` taken from a pandas column) are treated as having no
        detectable language instead of raising.

    Returns
    -------
    str
        The value returned by ``detect(text)``, or ``""`` when the input is not
        a string, is empty or whitespace-only, or detection raises
        ``LangDetectException``.

    Notes
    -----
    The langdetect README states that detection is non-deterministic for short
    or ambiguous text unless ``DetectorFactory.seed`` is set before the first
    detection; consider seeding it if this filter must be reproducible.
    """
    # A truthy non-string such as float NaN would otherwise reach .strip()
    # and raise AttributeError.
    if not isinstance(text, str) or not text.strip():
        return ""
    try:
        return detect(text)
    except LangDetectException:
        # No usable language features in the text: report "no language".
        return ""

# Flag rows whose combined title/abstract text is detected as English ('en'),
# then keep only those rows.
is_english = abstract_df["Title_abstract"].apply(safe_detect) == 'en'
df_new = abstract_df[is_english]


In [7]:
# Save the English-only records to output_data_en.csv without the index column.
df_new.to_csv(PATH_data_export + '/output_data_en.csv', index=False)

In [8]:
# Summarize the English-only frame: entry count, columns, non-null counts and dtypes.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8322 entries, 0 to 8759
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   display_name    8322 non-null   object
 1   doi             8322 non-null   object
 2   abstract_text   8322 non-null   object
 3   Title_abstract  8322 non-null   object
dtypes: object(4)
memory usage: 325.1+ KB


In [9]:
# Write the 'doi' column to doi_relevant.txt, one DOI per line with no header or index.
# NOTE(review): df_new holds every record detected as English, not only the
# keyword-matched ones, and a later cell writes df_relevant['doi'] to this same
# path - confirm this first write is intended.
file_path = PATH_data_export + '/doi_relevant.txt'
df_new.loc[:, 'doi'].to_csv(file_path, header=False, index=False)

In [9]:
# Display the records kept by the English-language filter.
df_new

Unnamed: 0,display_name,doi,abstract_text,Title_abstract
0,Performance of ChatGPT on USMLE: Potential for...,https://doi.org/10.1371/journal.pdig.0000198,We evaluated the performance of a large langua...,Performance of ChatGPT on USMLE: Potential for...
1,How Does ChatGPT Perform on the United States ...,https://doi.org/10.2196/45312,Background Chat Generative Pre-trained Transfo...,How Does ChatGPT Perform on the United States ...
2,Performance of ChatGPT on USMLE: Potential for...,https://doi.org/10.1101/2022.12.19.22283643,ABSTRACT We evaluated the performance of a lar...,Performance of ChatGPT on USMLE: Potential for...
3,"ChatGPT Utility in Healthcare Education, Resea...",https://doi.org/10.3390/healthcare11060887,ChatGPT is an artificial intelligence (AI)-bas...,"ChatGPT Utility in Healthcare Education, Resea..."
4,ChatGPT and the rise of large language models:...,https://doi.org/10.3389/fpubh.2023.1166120,Large Language Models (LLMs) have recently gat...,ChatGPT and the rise of large language models:...
...,...,...,...,...
8755,Effect of Ramadan intermittent fasting on rena...,https://doi.org/10.1016/j.hnm.2023.200227,Intermittent fasting (IF) has garnered attenti...,Effect of Ramadan intermittent fasting on rena...
8756,Equity in Scientific Publishing: Can Artificia...,https://doi.org/10.1016/j.mcpdig.2023.10.002,Chat Generative Pre-Trained Transformer (ChatG...,Equity in Scientific Publishing: Can Artificia...
8757,Predicting seizure recurrence from medical rec...,https://doi.org/10.1016/s2589-7500(23)00205-4,Physicians document a wealth of helpful inform...,Predicting seizure recurrence from medical rec...
8758,ChatGPT (Generated Pre-Trained Transformer) As...,https://doi.org/10.23937/2572-4037.1510062,Mental health disorders affect one in four peo...,ChatGPT (Generated Pre-Trained Transformer) As...


## Second Round of Cleaning: 
Per the OpenAlex help page: "When you search works, the API looks for matches in titles, abstracts, ... For most text search we remove stop words and use stemming (specifically, the Kstem token filter) to improve results." Because of stemming, the search can return documents that do not actually contain the query terms. We therefore perform a second round of cleaning that keeps only documents explicitly mentioning one of the query words in the title or abstract.
Source: https://docs.openalex.org/how-to-use-the-api/get-lists-of-entities/search-entities

In [11]:
# Keywords
keywords_1 = ["large language model", "GPT", "gpt", "Large Language Model", "LLM"]
# keywords_2 = ["health", "medical", "Health", "Medical"]

# Checking if the Title_abstract field contains the keywords
# contains_keywords = df_new['Title_abstract'].apply(lambda x: any(kw in x for kw in keywords_1) and any(kw in x for kw in keywords_2))
contains_keywords = df_new['Title_abstract'].apply(lambda x: any(kw in x for kw in keywords_1))
df_relevant = df_new[contains_keywords]
df_relevant.to_csv(f'{PATH_data_export}data_relevant.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/huiziyu/Library/CloudStorage/GoogleDrive-huiziy@g.ucla.edu/My Drive/Project - LLM in Biomedical & Health/new_data/processed/data_relevant.csv'

In [36]:
# Write the 'doi' column of the keyword-matched records to doi_relevant.txt,
# one DOI per line with no header or index.
file_path = PATH_data_export + '/doi_relevant.txt'
df_relevant.loc[:, 'doi'].to_csv(file_path, header=False, index=False)

In [32]:
## Total with keywords

In [33]:
# Number of rows in df_new whose Title_abstract matched at least one keyword.
contains_keywords.sum()

3417

In [34]:
# Display the keyword-matched subset.
df_relevant

Unnamed: 0,display_name,doi,abstract_text,Title_abstract
0,Performance of ChatGPT on USMLE: Potential for...,https://doi.org/10.1371/journal.pdig.0000198,We evaluated the performance of a large langua...,Performance of ChatGPT on USMLE: Potential for...
1,How Does ChatGPT Perform on the United States ...,https://doi.org/10.2196/45312,Background Chat Generative Pre-trained Transfo...,How Does ChatGPT Perform on the United States ...
2,ChatGPT and the Future of Medical Writing,https://doi.org/10.1148/radiol.223312,"HomeRadiologyVol. 307, No. 2 PreviousNext Revi...",ChatGPT and the Future of Medical Writing. Hom...
3,Performance of ChatGPT on USMLE: Potential for...,https://doi.org/10.1101/2022.12.19.22283643,ABSTRACT We evaluated the performance of a lar...,Performance of ChatGPT on USMLE: Potential for...
4,"The Role of ChatGPT, Generative Language Model...",https://doi.org/10.2196/46885,ChatGPT is a generative language model tool la...,"The Role of ChatGPT, Generative Language Model..."
...,...,...,...,...
17959,Revolutionizing clinical experimental protocol...,https://doi.org/10.1016/j.amjms.2023.09.004,ChatGPT is a conversational artificial intelli...,Revolutionizing clinical experimental protocol...
17961,Applying GPT-4 to the Plastic Surgery Inservic...,https://doi.org/10.1016/j.bjps.2023.09.027,The recent introduction of Generative Pre-trai...,Applying GPT-4 to the Plastic Surgery Inservic...
17962,ChatGPT for low- and middle-income countries: ...,https://doi.org/10.1016/j.lanwpc.2023.100906,"ChatGPT (OpenAI, San Francisco, CA, USA) has m...",ChatGPT for low- and middle-income countries: ...
17963,Interpretable Disease Prediction via Path Reas...,https://doi.org/10.1016/j.knosys.2023.111082,Disease prediction based on patients’ historic...,Interpretable Disease Prediction via Path Reas...


In [28]:
# Complement of the keyword filter: English records that matched no keyword.
# Saved to nonrelevant_data.csv without the index column.
has_no_keyword = ~contains_keywords
df_without_keywords = df_new[has_no_keyword]
df_without_keywords.to_csv(PATH_data_export + '/nonrelevant_data.csv', index=False)