In [None]:

import getpass
import os
import re
import time

import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.service import Service as EdgeService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.microsoft import EdgeChromiumDriverManager

###### INPUTS ######
login = "https://app.covidence.org/sign_in"

# Account credentials are never stored in the notebook.
## Set COVIDENCE_EMAIL / COVIDENCE_PASSWORD in the environment before running;
## if either is missing you are prompted for it (the typed value is not echoed).
email = os.environ.get("COVIDENCE_EMAIL") or getpass.getpass("Covidence email: ")
password = os.environ.get("COVIDENCE_PASSWORD") or getpass.getpass("Covidence password: ")

# Name of the decisions file, including the .csv extension
## HAS TO BE A CSV, located in the current working directory
DocumentTitle = "Test_data_11-6.csv"
input_document = os.path.join(os.getcwd(), DocumentTitle)

# Articles remaining
## Remember to update this before every run
NumberArticlesToScreen = 5

# Folder (inside the current working directory) reserved for the Edge driver
## Ensure the directory has the necessary permissions
custom_driver_path = os.path.join(os.getcwd(), 'edgedriver')
## Create the folder if it does not exist yet
os.makedirs(custom_driver_path, exist_ok=True)

# Environment variables for the driver manager
## WDM_LOCAL = 1: use the local computer / project folder
os.environ['WDM_LOCAL'] = '1'
## WDM_LOG_LEVEL = 0: minimal logging
os.environ['WDM_LOG_LEVEL'] = '0'
## Point the Edge driver variable at the folder created above
## NOTE(review): confirm that the installed webdriver_manager version reads WDM_EDGE_DRIVER
os.environ['WDM_EDGE_DRIVER'] = custom_driver_path

# Download (or reuse) the Edge driver and start the browser
## install() returns the path of the driver executable
driver_path = EdgeChromiumDriverManager().install()
## initialize the webdriver with that executable
driver = webdriver.Edge(service=EdgeService(driver_path))
## open the sign-in page and maximize the window
driver.get(login)
driver.maximize_window()

# Explicit wait shared by the steps below: each step proceeds as soon as its
# element is ready and fails with a TimeoutException after 20 seconds, instead
# of pausing a fixed 5 seconds and hoping the page has finished loading.
page_wait = WebDriverWait(driver, 20)

# Find the username field (once the form is visible) and enter the email
username_field = page_wait.until(EC.visibility_of_element_located((By.NAME, "session[email]")))
username_field.send_keys(email)

# Find the password field and enter the password
password_field = driver.find_element(By.NAME, "session[password]")
password_field.send_keys(password)

# Find the login button and click it
login_button = driver.find_element(By.NAME, "commit")
login_button.click()

# Find the link whose text contains "TEST LLM" and click it
redirect_button1 = page_wait.until(EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, "TEST LLM")))
redirect_button1.click()

# Find the "Continue" link and click it
redirect_button2 = page_wait.until(EC.element_to_be_clickable((By.LINK_TEXT, "Continue")))
redirect_button2.click()
## Keep a fixed pause here: the code that follows looks elements up immediately
time.sleep(5)

# Best-effort dismissal of a popover that may or may not be shown
try:
    # The close button is identified by its aria-label
    popup_close_button = driver.find_element(By.CSS_SELECTOR, '[aria-label="Close popover"]')
    popup_close_button.click()
    time.sleep(2)
except WebDriverException:
    # No popover (or it could not be clicked): continue with the script.
    # Only Selenium errors are ignored, so Ctrl-C and coding mistakes still surface.
    pass

# Decide how many articles to display per page: the smallest of 25 / 50 / 100
# that holds the whole batch, or 100 when the batch is larger than that.
if NumberArticlesToScreen <= 25:
    NumberPerPage = 25
elif NumberArticlesToScreen <= 50:
    NumberPerPage = 50
else:
    NumberPerPage = 100

# Number of page passes needed to cover the batch
## Ceiling division, so a partially filled last page still gets its own pass
## (rounding to nearest would drop it, e.g. 120 articles at 100 per page).
## At least one pass is always made.
NumberArticleLoops = int(max(1, -(-NumberArticlesToScreen // NumberPerPage)))
 
# Select items per page from the dropdown
## Only touched for 50 or 100; for 25 the dropdown is left as it is
if NumberPerPage >= 50:
    PerPage = Select(driver.find_element(By.CLASS_NAME, "per-page"))
    PerPage.select_by_value(str(NumberPerPage))
time.sleep(5)

# Best-effort dismissal of a popup that may or may not be shown
try:
    # The close button is identified by its aria-label
    popup_close_button = driver.find_element(By.CSS_SELECTOR, '[aria-label="close popup"]')
    popup_close_button.click()
    time.sleep(2)
except WebDriverException:
    # No popup (or it could not be clicked): continue with the script.
    # Only Selenium errors are ignored, so Ctrl-C and coding mistakes still surface.
    pass

# clean_data: remove the bibliographic columns that the matching step does not use
def clean_data(df):
    """Return a new DataFrame equal to *df* minus fifteen bibliographic columns.

    The caller's frame is left untouched. Every listed column must be present
    in *df*; pandas raises ``KeyError`` if one is missing.
    """
    unwanted_columns = [
        'Title', 'Authors', 'Abstract',
        'Published Year', 'Published Month',
        'Journal', 'Volume', 'Issue', 'Pages',
        'Accession Number', 'DOI', 'Ref',
        'Study', 'Notes', 'Tags',
    ]
    return df.drop(labels=unwanted_columns, axis='columns')

# Load the decisions file and keep only the columns needed to pair each
# decision with a study row on the page.
df = pd.read_csv(input_document)
df_clean = clean_data(df.copy())

# Patterns used to scrape the page source, compiled once for all passes
## Internal id: whatever follows "study-" in the id attribute of a study row
study_row_pattern = re.compile(r'<tr class="" id="study-([^"]*)">')
## Shown id: the text after '#' on the line right below the study-header div
shown_id_pattern = re.compile(r'<div class="study-header">\n#(.*?)\n')

# Which button (by position inside the row's vote cell) to press per decision
vote_button_position = {'No': 1, 'Yes': 3}

# Each pass reads the studies currently in the page source and submits the
# decision recorded in the CSV for every study that has one.
for _ in range(NumberArticleLoops):
    # page_source is already one string
    page_html = driver.page_source

    # Internal ids of the study rows, in page order
    internal_ids = study_row_pattern.findall(page_html)
    for internal_id in internal_ids:
        print(internal_id)

    # Shown ids (without the leading '#'), in page order
    shown_ids = shown_id_pattern.findall(page_html)
    for shown_id in shown_ids:
        print(shown_id)

    # Pad the shorter list with None so both columns have the same length
    row_count = max(len(internal_ids), len(shown_ids))
    internal_ids = internal_ids + [None] * (row_count - len(internal_ids))
    shown_ids = shown_ids + [None] * (row_count - len(shown_ids))

    # Put the '#' back in front of every shown id so it matches the CSV format
    df_combined = pd.DataFrame({
        'InternalID': internal_ids,
        'ShownID': [f'#{shown}' if shown is not None else None for shown in shown_ids],
    })
    print(df_combined)

    # Keep the CSV rows whose id is on this page, then put them in page order;
    # page rows without a CSV decision become all-NaN rows after the reindex.
    df_final = (
        pd.merge(df_clean, df_combined, left_on='CovidenceIDNumber', right_on='ShownID', how='inner')
        .set_index('InternalID')
        .reindex(pd.Index(df_combined['InternalID'], name='InternalID'))
        .reset_index()
    )

    # Submit the decision for each study
    for ItemID, ItemDec in zip(df_final['InternalID'], df_final['LLM']):
        button_position = vote_button_position.get(ItemDec)
        if button_position is None:
            # Neither 'Yes' nor 'No' (e.g. no decision for this row): nothing to click
            continue

        button_xpath = f"//*[@id='study-{ItemID}']/td[3]/div[1]/button[{button_position}]"
        for attempts in range(10):
            try:
                current_item = WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, button_xpath)))
                # Click through JavaScript rather than a native click
                driver.execute_script("arguments[0].click();", current_item)
            except StaleElementReferenceException:
                # The row was re-rendered between lookup and click: look it up again
                time.sleep(1)
                continue
            except TimeoutException:
                # The button never became clickable: report it and move on
                # instead of aborting the whole run.
                print(f"Skipped study {ItemID}: vote button not clickable within 10 seconds")
                break
            time.sleep(1)
            break
        else:
            print(f"Skipped study {ItemID}: element still stale after 10 attempts")
time.sleep(10)

1161389772
1161389828
1161389757
1161389758
1161389927
1161389940
1161389745
1161389746
1161389902
1161390092
1161390008
1161390014
1161389648
1161389665
1161389808
1161390150
1161390166
1161390172
1161390192
1161390033
1161390045
1161389626
1161390050
1161389687
1161389855
1161389625
1161389636
1161389640
1161389662
1161389800
1161389773
1161389795
1161390066
1161390234
1161389681
1161390152
1161390195
1161389844
1161389956
1161389703
1161389958
1161389951
1161389952
1161389691
1161389833
1161389905
1161389921
1161389864
1161390079
1161390081
1161389887
1161390022
1161389931
1161389885
1161389897
1161389865
1161389823
1161389870
1161389883
1161389859
1161389863
1161389791
1161389799
1161389814
1161389713
1161389739
1161390094
1161390134
1161389747
1161389767
1161389900
1161390143
1161389646
1161389686
1161390059
1161390063
1161390086
1161390179
1161390208
1161390216
1161390226
1161389660
1161389748
1161389752
1161389806
1161389654
1161389634
1161389720
1161389647
1161389728
1161389971

In [2]:
import pandas as pd
import os

# Export of the irrelevant studies: read it, strip the unused columns,
# expose the id under the name 'ShownID' and sort by it
irrelevant_document = os.path.join(os.getcwd(), "IrrelevantTest.csv")
irrelevant = pd.read_csv(irrelevant_document)
irrelevant_clean = (
    clean_data(irrelevant.copy())
    .rename(columns={'Covidence #': 'ShownID'})
    .sort_values(by='ShownID')
)

# Export of the second screening round: same treatment
screenRound2_document = os.path.join(os.getcwd(), "ScreenRound2.csv")
screenRound2 = pd.read_csv(screenRound2_document)
screenRound2_clean = (
    clean_data(screenRound2.copy())
    .rename(columns={'Covidence #': 'ShownID'})
    .sort_values(by='ShownID')
)

# Summary statistics of both exports
for export_table in (irrelevant_clean, screenRound2_clean):
    print(export_table.describe())

# df_clean must already exist (with its 'LLM' column) from the screening cell.
# Split it by decision, renaming the id column so it lines up with the exports.
id_rename = {'CovidenceIDNumber': 'ShownID'}

## rows the LLM marked 'Yes'
voted_yes = df_clean['LLM'] == 'Yes'
df_testYes = df_clean[voted_yes].rename(columns=id_rename).sort_values(by='ShownID')

## rows the LLM marked 'No'
voted_no = df_clean['LLM'] == 'No'
df_testNo = df_clean[voted_no].rename(columns=id_rename).sort_values(by='ShownID')

## outer joins keep ids found on only one side, with NaN in the missing cells
testYes = df_testYes.merge(screenRound2_clean, on="ShownID", how='outer')
testNo = df_testNo.merge(irrelevant_clean, on="ShownID", how='outer')

# True only when the two joined tables together have exactly as many rows as df_clean
len(testYes) + len(testNo) == len(df_clean)

       ShownID
count     6025
unique    6025
top      #9999
freq         1
       ShownID
count     6102
unique    6102
top      #9994
freq         1


False

In [3]:
# Best-effort dismissal of a popover that may or may not be shown
try:
    # The close button is identified by its aria-label
    popup_close_button = driver.find_element(By.CSS_SELECTOR, '[aria-label="Close popover"]')
    popup_close_button.click()
    time.sleep(2)
except WebDriverException:
    # No popover (or it could not be clicked): continue with the script.
    # Only Selenium errors are ignored, so Ctrl-C and coding mistakes still surface.
    pass