In [1]:
import os
import pandas as pd
from scholarly import scholarly
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from dotenv import load_dotenv
from openpyxl import load_workbook
import importlib
from datetime import datetime

# Read the Hugging Face credentials from the local .env file / environment
load_dotenv()
username = os.getenv("HUGGINGFACE_USERNAME")
password = os.getenv("HUGGINGFACE_PASSWORD")

# Project scraping helpers (one module per page interaction)
import Python_scripts.login as login
import Python_scripts.extract_info as extract_info
import Python_scripts.click_commit_dates as click_commit
import Python_scripts.click_username as click_username
import Python_scripts.click_arxiv_tags as click_arxiv_tags
import Python_scripts.space_apps_info as space_apps_info
import Python_scripts.get_submission_date as get_submission_date
import Python_scripts.click_community as click_community
import Python_scripts.check_404_error as check_404_error
import Python_scripts.model_page_info as model_page_info

# Reload each helper so edits made to the .py files are picked up
# without restarting the kernel.
for scraper_module in (
    login,
    extract_info,
    click_commit,
    click_username,
    click_arxiv_tags,
    space_apps_info,
    get_submission_date,
    click_community,
    check_404_error,
    model_page_info,
):
    importlib.reload(scraper_module)

# Load the previously scraped model sheet with a clean 0..n-1 index
nlp_links = pd.read_excel(
    "Outputs/Unedited_sheets/model_info_with_likes&down.xlsx"
).reset_index(drop=True)


# Batching parameters: a sheet is written every `split_number` rows, and
# batch numbers are offset by `batch_start_number`.
split_number = 500
batch_start_number = 1
# Resume from row 58499 onward; the slice keeps the original index labels,
# which the scraping loop uses to number its batches.
nlp_links = nlp_links.iloc[58499:]


# Workbook that receives one sheet per batch
output_excel = os.path.join("Outputs", 'model_info_with_likes&down&more_6.xlsx')


# Point Selenium at the bundled chromedriver binary
service = Service(executable_path='Dependencies//chromedriver.exe')

# Browser options; drop the --headless argument to watch the browser (Display ON/OFF)
chrome_options = Options()
chrome_options.add_argument("--headless")

driver = webdriver.Chrome(service=service, options=chrome_options)

# Sign in with the credentials read from the environment
login.login_hugging_face(driver, username, password)

def _save_batch(batch_rows, batch_number, excel_path):
    """Write one batch of scraped rows to its own sheet of the output workbook.

    Parameters:
        batch_rows: list of per-model dicts collected by the scraping loop.
        batch_number: number used in the sheet name (``Batch_<n>``) and in the
            progress message.
        excel_path: path of the workbook; created if it does not exist yet.

    If a sheet with the same name already exists it is removed first, so
    re-running a batch replaces its sheet instead of failing.
    """
    output_df = pd.DataFrame(batch_rows)
    sheet_name = f'Batch_{batch_number}'
    print(f"Finished processing {batch_number} batches")

    if os.path.exists(excel_path):
        # Drop a stale sheet of the same name before appending the new one
        workbook = load_workbook(excel_path)
        if sheet_name in workbook.sheetnames:
            del workbook[sheet_name]
            workbook.save(excel_path)
        with pd.ExcelWriter(excel_path, mode='a', engine='openpyxl') as writer:
            output_df.to_excel(writer, sheet_name=sheet_name, index=False)
    else:
        with pd.ExcelWriter(excel_path, mode='w', engine='openpyxl') as writer:
            output_df.to_excel(writer, sheet_name=sheet_name, index=False)


results = []
error_flag = 0

# The driver is always closed, even if a page scrape raises part-way through.
try:
    # Loop for every link
    for index, row in nlp_links.iterrows():
        # Carry over the tags of THIS row (not of the first row of the frame)
        language_tag = row['Language of the Model']
        info_text = row['Organization Tags']

        # Defaults reported when the model page cannot be scraped
        downloads_all_time = "NA"
        has_arxiv = "NA"
        model_name = "NA"
        likes_count = "NA"
        model_card = "NA"
        model_card_word_count = "NA"
        space_app_count = "NA"
        arxiv_links = "NA"
        number_of_papers = "NA"

        link = row['Model Link']
        error_flag = check_404_error.check_404_page(driver, link)
        # The page is only scraped when the 404 check returns 0
        if error_flag == 0:
            temp_list = model_page_info.extract_info(driver, link)
            (
                downloads_all_time,
                has_arxiv,
                model_name,
                likes_count,
                model_card,
                model_card_word_count,
                space_app_count,
                arxiv_links,
            ) = temp_list[:8]
            number_of_papers = len(arxiv_links)

        results.append({
            'Model Link': link,
            'Language of the Model' : language_tag,
            'Organization Tags' : info_text,
            'Downloads All Time': downloads_all_time,
            'Likes' : likes_count,
            'Number of Space Apps' : space_app_count,
            # NOTE(review): this stores the raw 404-check flag, where 0 is the
            # value that lets the scrape run - confirm the column name matches.
            'Valid Link?' : error_flag,
            'Model Card': model_card,
            'Model Card Length' : model_card_word_count,
            'Model Name' : model_name,
            'Has Arxiv Tag': has_arxiv,
            'Number of Papers' : number_of_papers,
            'Links to Paper(s)' : arxiv_links
        })

        # Flush a full batch. `index` is the row's label in the original
        # (unsliced) sheet, so batch numbers stay aligned with earlier runs.
        if (index + 1) % split_number == 0:
            batch_number = batch_start_number + index // split_number
            _save_batch(results, batch_number, output_excel)
            # Clear results list to start next batch
            results.clear()

    # Save the trailing partial batch, if any. The number is derived from the
    # last processed label, so it is defined even when no full batch was
    # flushed inside the loop.
    if results:
        batch_number = batch_start_number + index // split_number
        _save_batch(results, batch_number, output_excel)
        results.clear()
finally:
    # Close the driver after completion (or failure)
    driver.quit()

Logged in successfully!
Finished processing 117 batches
Finished processing 118 batches
Finished processing 119 batches
Finished processing 120 batches
Finished processing 121 batches
Finished processing 122 batches
Finished processing 123 batches
Finished processing 124 batches
Finished processing 125 batches
Finished processing 126 batches
Finished processing 127 batches
Finished processing 128 batches
Finished processing 129 batches
Finished processing 130 batches
Finished processing 131 batches


In [21]:
# Print every model link in full (the rich DataFrame display truncates long URLs).
# A plain loop avoids building and echoing a throwaway list of None values.
for model_link in nlp_links["Model Link"]:
    print(model_link)

https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct
https://huggingface.co/google/gemma-2-2b-it
https://huggingface.co/google/gemma-2-2b
https://huggingface.co/meta-llama/Meta-Llama-3.1-405B
https://huggingface.co/nisten/Biggie-SmoLlm-0.15B-Base


[None, None, None, None, None]

In [20]:
import os
import pandas as pd
from scholarly import scholarly
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from dotenv import load_dotenv
from openpyxl import load_workbook
import importlib
from datetime import datetime

# Read the Hugging Face credentials from the local .env file / environment
load_dotenv()
username = os.getenv("HUGGINGFACE_USERNAME")
password = os.getenv("HUGGINGFACE_PASSWORD")

# Project scraping helpers (one module per page interaction)
import Python_scripts.login as login
import Python_scripts.extract_info as extract_info
import Python_scripts.click_commit_dates as click_commit
import Python_scripts.click_username as click_username
import Python_scripts.click_arxiv_tags as click_arxiv_tags
import Python_scripts.space_apps_info as space_apps_info
import Python_scripts.get_submission_date as get_submission_date
import Python_scripts.click_community as click_community
import Python_scripts.check_404_error as check_404_error
import Python_scripts.model_page_info as model_page_info

# Reload each helper so edits made to the .py files are picked up
# without restarting the kernel.
for scraper_module in (
    login,
    extract_info,
    click_commit,
    click_username,
    click_arxiv_tags,
    space_apps_info,
    get_submission_date,
    click_community,
    check_404_error,
    model_page_info,
):
    importlib.reload(scraper_module)

# Load the previously scraped model sheet with a clean 0..n-1 index
nlp_links = pd.read_excel(
    "Outputs/Unedited_sheets/model_info_with_likes&down.xlsx"
).reset_index(drop=True)


# Batching parameters: rows per batch and the offset added to batch numbers
split_number = 500
batch_start_number = 1
# Keep rows from position 58499 onward; original index labels are preserved
nlp_links = nlp_links.iloc[58499:]

In [21]:
# Display the sliced frame to confirm where the resumed run starts and ends
nlp_links

Unnamed: 0,Model Link,Language of the Model,Organization Tags,Downloads All Time,Likes,Number of Space Apps,Invalid Link?
58499,https://huggingface.co/bella05/pogny_5_32_0.01,,,7.0,0.0,0.0,0
58500,https://huggingface.co/bwahyuh/digidaw1,,,7.0,0.0,0.0,0
58501,https://huggingface.co/ab30atsiwo/finbert-gpt-...,,,8.0,0.0,0.0,0
58502,https://huggingface.co/ab30atsiwo/finbert-gpt-...,,,6.0,0.0,0.0,0
58503,https://huggingface.co/movadek/uk-imm-court-ou...,English,,8.0,0.0,0.0,0
...,...,...,...,...,...,...,...
65151,https://huggingface.co/SaiPavanKumarMeruga/dis...,,,4.0,0.0,0.0,0
65152,https://huggingface.co/apriandito/tipe-tweet,,,4.0,0.0,0.0,0
65153,https://huggingface.co/Rahul-8853/testtttt,,,0.0,0.0,0.0,0
65154,https://huggingface.co/quangtqv/mxbai_rerank_t...,,,5.0,0.0,0.0,0


# Testing Code

In [20]:
import os
import pandas as pd
from scholarly import scholarly
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from dotenv import load_dotenv
import importlib
from datetime import datetime

# Read the Hugging Face credentials from the local .env file / environment
load_dotenv()
username = os.getenv("HUGGINGFACE_USERNAME")
password = os.getenv("HUGGINGFACE_PASSWORD")

# Project scraping helpers used by the test cells
import Python_scripts.login as login
import Python_scripts.extract_info as extract_info
import Python_scripts.click_commit_dates as click_commit
import Python_scripts.click_username as click_username
import Python_scripts.click_arxiv_tags as click_arxiv_tags
import Python_scripts.space_apps_info as space_apps_info
import Python_scripts.get_submission_date as get_submission_date
import Python_scripts.check_404_error as check_404_error

# Reload each helper so edits made to the .py files are picked up
# without restarting the kernel.
for scraper_module in (
    login,
    extract_info,
    click_commit,
    click_username,
    click_arxiv_tags,
    space_apps_info,
    get_submission_date,
    check_404_error,
):
    importlib.reload(scraper_module)

# Load the NLP model links, put one extra model at the top of the list,
# renumber the rows, and keep only the first 2000 for testing.
new_link = "https://huggingface.co/microsoft/Florence-2-large"
nlp_links = pd.read_csv("Outputs/model_links-NLP.csv")
nlp_links = (
    pd.concat([pd.DataFrame({"Model Link": [new_link]}), nlp_links])
    .reset_index(drop=True)
    .iloc[0:2000]
)

# Set up Chrome
chrome_options = Options()
# Display ON/OFF
# chrome_options.add_argument("--headless")
service = Service(executable_path='Dependencies//chromedriver.exe')
driver = webdriver.Chrome(service=service, options=chrome_options)

# The browser is closed in the `finally` below, so a failed or repeated run of
# this cell does not leave Chrome/chromedriver processes behind.
try:
    # login.login_hugging_face(driver, username, password)

    link = 'https://huggingface.co/aubmindlab/aragpt2-mega-detector-long'

    # Print the raw flag returned by the 404 check for this model page
    error_flag = check_404_error.check_404_page(driver, link)
    print(error_flag)

    # Extract info after clicking on the commits link
    # click_commit_list = click_commit.click_files_and_versions(driver, link)
    # number_of_commits = click_commit_list[0]
    # latest_commit_date = click_commit_list[1]
    # oldest_commit_date = click_commit_list[2]


    # Extract info after clicking on the ARXIV tags
    # click_arxiv_tags_list = click_arxiv_tags.get_arxiv_links(driver, link)
    # arxiv_links = click_arxiv_tags_list[0]
    # number_of_papers = len(arxiv_links)
    # submission_dates = click_arxiv_tags_list[1]
    # submission_dates_dt = [datetime.strptime(date, '%d %b %Y') if date != "N/A" else "N/A" for date in submission_dates]

    # First element is stored as the arXiv links, second as the published dates
    get_submission_date_list = get_submission_date.get_readme_info(driver, link)
    arxiv_links_2 = get_submission_date_list[0]
    published_dates = get_submission_date_list[1]
    # Parse ISO-style timestamps; "N/A" placeholders are passed through unchanged
    published_dates_dt = [datetime.strptime(date, '%Y-%m-%dT%H:%M:%S') if date != "N/A" else "N/A" for date in published_dates]
finally:
    driver.quit()

# time_differences = [
#     "N/A" if (pub_date == "N/A" or sub_date == "N/A") else (pub_date - sub_date).days if pub_date and sub_date else None
#     for pub_date, sub_date in zip(published_dates_dt, submission_dates_dt)
# ]

0


In [7]:
# Project scraping helpers used by the test cells
import Python_scripts.login as login
import Python_scripts.extract_info as extract_info
import Python_scripts.click_commit_dates as click_commit
import Python_scripts.click_username as click_username
import Python_scripts.click_arxiv_tags as click_arxiv_tags
import Python_scripts.space_apps_info as space_apps_info
import Python_scripts.get_submission_date as get_submission_date
import Python_scripts.check_404_error as check_404_error

# Reload each helper so edits made to the .py files are picked up.
# NOTE: this cell does not import `importlib` itself; it relies on an earlier
# cell having done so.
for scraper_module in (
    login,
    extract_info,
    click_commit,
    click_username,
    click_arxiv_tags,
    space_apps_info,
    get_submission_date,
    check_404_error,
):
    importlib.reload(scraper_module)

chrome_options = Options()
# Display ON/OFF
# chrome_options.add_argument("--headless")
service = Service(executable_path='Dependencies//chromedriver.exe')
driver = webdriver.Chrome(service=service, options=chrome_options)

commit_url = 'https://huggingface.co/aubmindlab/aragpt2-mega-detector-long/commit/685843487166af81b5cc47f33386f0f107d10d4c'
arxiv_tags = ['2012.15520']

# Always close the browser, even if the lookup raises, so repeated runs of this
# cell do not pile up Chrome/chromedriver processes.
try:
    readme_arxiv_dates = get_submission_date.check_readme_for_arxiv(driver, commit_url, arxiv_tags)
finally:
    driver.quit()

# Keep the result as the cell's last expression so it is still displayed
readme_arxiv_dates

{'2012.15520': '2021-03-11T21:46:39'}