In [1]:
!pip install pandas
!pip install utils

# scihub requirements
!pip install beautifulsoup4
!pip install requests
!pip install retrying
!pip install pysocks




### Create pandas DataFrames from the pipe-delimited .txt files downloaded from AACT

In [2]:
import pandas as pd
import os

# List everything that sits in the current working directory.
cwd = os.getcwd()
files = os.listdir(cwd)

# Read each pipe-delimited .txt file into a module-level DataFrame named after
# the file (studies.txt -> studies). study_records.txt is deliberately skipped.
for file in files:
    if not file.endswith('.txt'):
        continue
    name = file[:-len('.txt')]
    if name != 'study_records':
        globals()[name] = pd.read_csv(file, sep='|', low_memory=False)


### Select the columns that are useful for tokenization. This selection can be changed to tune LLM performance if needed.

In [3]:
# Columns to pull from each table, keyed by table name.
# Every list starts with 'nct_id', the column the tables are joined on.
tables = dict(
    browse_conditions=['nct_id', 'mesh_term'],
    browse_interventions=['nct_id', 'mesh_term'],
    conditions=['nct_id', 'name'],
    countries=['nct_id', 'name'],
    design_groups=['nct_id', 'group_type', 'title', 'description'],
    designs=[
        'nct_id',
        'allocation',
        'intervention_model',
        'observational_model',
        'primary_purpose',
        'time_perspective',
        'masking',
        'masking_description',
        'intervention_model_description',
    ],
    detailed_descriptions=['nct_id', 'description'],
    eligibilities=[
        'nct_id',
        'sampling_method',
        'gender',
        'minimum_age',
        'maximum_age',
        'population',
        'criteria',
        'gender_description',
    ],
    facilities=['nct_id', 'name', 'city', 'state', 'country'],
    keywords=['nct_id', 'name'],
    sponsors=['nct_id', 'agency_class', 'lead_or_collaborator', 'name'],
    studies=[
        'nct_id',
        'brief_title',
        'official_title',
        'overall_status',
        'phase',
        'source',
        'biospec_description',
        'biospec_retention',
        'plan_to_share_ipd_description',
    ],
    study_references=['nct_id', 'pmid', 'citation'],
)

### Create a DataFrame with each NCT ID and the entire string of words to be used (exception: no PDFs yet)
#### This will take a while

In [4]:
# report whether any NCT ID occupies more than one row
def duplicateNCT(dfName):
    """Return True if the 'nct_id' column of dfName holds a repeated value, else False."""
    if dfName["nct_id"].is_unique:
        return False
    return True

# select the columns we care about, collapse each NCT ID to one row, then join onto the main DF
def getRelevantData(dfName, relevantColumnList, dfOriginal):
    """Left-join selected columns of one table onto dfOriginal, matching on 'nct_id'.

    When dfName holds several rows for the same nct_id, each selected column is
    collapsed to a single space-separated string per nct_id (values are
    stringified in row order). Missing values are skipped rather than written
    out as the text "nan"; a column with no value at all for an nct_id stays
    missing (NaN). When every nct_id is already unique the values are passed
    through untouched.

    Parameters
    ----------
    dfName : pandas.DataFrame
        Source table; must contain 'nct_id' and every name in relevantColumnList.
    relevantColumnList : list of str
        Columns to take from dfName; must include 'nct_id'.
    dfOriginal : pandas.DataFrame
        Frame with an 'nct_id' column to join onto. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with dfOriginal's rows and columns plus the selected
        columns; rows whose nct_id is absent from dfName get NaN in them.

    Notes
    -----
    The selected column names must not already exist in dfOriginal, because
    DataFrame.join is called without suffixes. Rows of dfName whose nct_id is
    itself missing are dropped by the grouping step.
    """
    df = dfName[relevantColumnList].copy()
    valueColumns = [col for col in relevantColumnList if col != 'nct_id']

    def joinPresent(values):
        # Stringify only the values that are present, so NaN never becomes "nan" text.
        parts = [str(value) for value in values if pd.notna(value)]
        return ' '.join(parts) if parts else float('nan')

    # concatenate if needed
    if not df['nct_id'].is_unique:
        if valueColumns:
            # One pass over the groups yields exactly one row per nct_id, so no
            # broadcast-back (transform) or drop_duplicates step is required.
            df = (
                df.groupby('nct_id', sort=False)
                  .agg({col: joinPresent for col in valueColumns})
                  .reset_index()
            )
        else:
            # Only the key column was requested; just make it unique for the join.
            df = df.drop_duplicates()

    # join new columns into main DF
    return dfOriginal.join(df.set_index('nct_id'), on='nct_id')

def mergeColumns(relevantColumnList, dfOriginal):
    """Fold every column after the first into a single 'words' text column.

    For each row, the values of all columns to the right of the first column
    (including an existing 'words' column, if any) are stringified and joined
    with '. ' in column order. Missing values are skipped instead of being
    written as the text "nan"; a row with no values at all gets ''. The columns
    named in relevantColumnList (other than 'nct_id') are then removed.

    Parameters
    ----------
    relevantColumnList : list of str
        Columns to drop once their text has been merged; 'nct_id' is never dropped.
    dfOriginal : pandas.DataFrame
        Frame whose first column is the key (expected to be 'nct_id').
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the merged 'words' column and without the dropped columns.

    Raises
    ------
    KeyError
        If a column named in relevantColumnList is not present in dfOriginal.
    """
    # merge all columns to the right of the NCT column
    textColumns = list(dfOriginal.columns[1:])
    rows = dfOriginal[textColumns].to_numpy(dtype=object)
    mergedWords = [
        '. '.join(str(value) for value in row if pd.notna(value))
        for row in rows
    ]

    # delete the columns that were merged, in one drop instead of one copy per column
    mergedAway = [col for col in relevantColumnList if col != 'nct_id']
    return dfOriginal.assign(words=mergedWords).drop(columns=mergedAway)


# Seed the LLM frame (protocols only - NO results) with each study's brief
# summary text, stored under the column name 'words'.
trainingDF = brief_summaries[['nct_id', 'description']].rename(columns={'description': 'words'})

# Fold every configured table into the running 'words' text, one table at a time.
for key in tables:
    trainingDF = mergeColumns(
        tables[key],
        getRelevantData(globals()[key], tables[key], trainingDF),
    )


### Create a separate DOI DataFrame using only the study references table
#### This is a lot faster

In [5]:
# Start from the bare list of study IDs, attach the study_references columns,
# and keep only the rows whose citation is not missing.
doiDF = getRelevantData(
    globals()["study_references"],
    tables["study_references"],
    trainingDF[['nct_id']].copy(),
)
doiDF = doiDF.loc[doiDF['citation'].notna()]

### Get DOI link, validate it, and use Sci Hub to scrape it

In [1]:
from scihub import SciHub


# Pull the DOI out of every citation. A DOI is taken to be the whitespace-
# delimited token that immediately follows a literal 'doi:' token.
# NOTE(review): tokens are kept verbatim, so any punctuation attached to the
# DOI (e.g. a trailing '.') is still present - confirm whether the validation
# step should strip it.
doiLinks = {}  # nct_id -> DOI tokens found in that row's citation text, in order
count = 0
for index, row in doiDF.iterrows():
    count += 1
    # Named `tokens` rather than `str`/`ls`: rebinding `str` would shadow the
    # builtin and break every later `astype(str)` call in the same kernel.
    tokens = row['citation'].split()
    # Pairing each token with its successor stops one short of the end, so a
    # citation that ends in 'doi:' is skipped instead of raising IndexError.
    found = [following for marker, following in zip(tokens, tokens[1:]) if marker == 'doi:']
    if found:
        link = found[-1]  # most recent DOI seen, as before
        doiLinks[row['nct_id']] = found

print(count)

NameError: name 'doiDF' is not defined