In [None]:
import os
import pandas as pd
from scholarly import scholarly
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from dotenv import load_dotenv
from openpyxl import load_workbook
import importlib
from datetime import datetime
import time

# Read the Hugging Face credentials from the environment (populated by
# python-dotenv); either value is None when its variable is not set.
load_dotenv()
username = os.environ.get("HUGGINGFACE_USERNAME")
password = os.environ.get("HUGGINGFACE_PASSWORD")

# Import files
import Python_scripts.login as login
import Python_scripts.extract_info as extract_info
import Python_scripts.click_commit_dates as click_commit
import Python_scripts.click_username as click_username
import Python_scripts.click_arxiv_tags as click_arxiv_tags
import Python_scripts.space_apps_info as space_apps_info
import Python_scripts.get_submission_date as get_submission_date
import Python_scripts.click_community as click_community
import Python_scripts.check_404_error as check_404_error
import Python_scripts.files_versions_info as files_versions_info
# Re-execute every project helper module so that edits made to the files under
# Python_scripts/ take effect without restarting the notebook kernel.
for _module in (
    login,
    extract_info,
    click_commit,
    click_username,
    click_arxiv_tags,
    space_apps_info,
    get_submission_date,
    click_community,
    check_404_error,
    files_versions_info,
):
    importlib.reload(_module)

# Batch settings: a CSV is written every `split_number` links, and batch
# numbering starts from `batch_start_number`.
split_number = 2
batch_start_number = 1

# Read the link list, keep the "Text Generation" rows, renumber them from 0
# and take the first 40 rows for this run.
nlp_links = (
    pd.read_csv("Outputs/model_links-NLP.csv")
    .loc[lambda frame: frame["Tag"] == "Text Generation"]
    .reset_index(drop=True)
    .iloc[0:40]
)

# Chrome session; the "--headless" argument switches the visible window off.
chrome_options = Options()
chrome_options.add_argument("--headless")
service = Service(executable_path='Dependencies//chromedriver.exe')
driver = webdriver.Chrome(options=chrome_options, service=service)

login.login_hugging_face(driver, username, password)

# Rows collected for the batch currently being built, and the initial flag.
error_flag = 0
results = []


# Loop for every link.
#
# Destination folder for the per-batch CSV files. It is built from separate
# path components (the hand-written "Outputs\Text_..." literal contained the
# invalid escape "\T" and only worked on Windows) and created up front so a
# missing folder cannot fail the write after a long scrape.
output_dir = os.path.join("Outputs", "Text_generation_results_2")
os.makedirs(output_dir, exist_ok=True)

# Index label of the final row, used to flush a trailing partial batch.
# Comparing labels stays correct even when the slice does not start at row 0.
last_index = nlp_links.index[-1] if len(nlp_links) > 0 else None

try:
    for index, row in nlp_links.iterrows():

        # Reset every field to the 'NA' sentinel so that a failure part-way
        # through a link leaves the remaining columns marked as not available.
        downloads_all_time = has_arxiv = model_card = language_tag = model_name = likes_element = description_names = model_card_word_count = no_space_apps = 'NA'
        community_count = discussions_count = discussions_closed_count = pull_request_count = pull_request_closed_count = 'NA'
        info_text = name_text = 'NA'
        number_of_commits = latest_commit_date = oldest_commit_date = additional_requirements = 'NA'
        arxiv_links = number_of_papers = submission_dates_dt = 'NA'
        published_dates_dt = readme_commit_dates = no_unique_readme_commits = time_differences = unique_commit_dates = no_unique_commits = 'NA'
        error_flag = 'NA'

        link = row['Model Link']

        try:
            error_flag = check_404_error.check_404_page(driver, link)
            if error_flag == 0:
                # Extract info from the link page
                extract_info_list = extract_info.extract_info(driver, link)
                downloads_per_month = extract_info_list[0]  # read but not written to the CSV
                downloads_all_time = extract_info_list[1]
                model_card = extract_info_list[2]
                language_tag = extract_info_list[3]
                model_name = extract_info_list[4]
                likes_element = extract_info_list[5]
                description_names = extract_info_list[6]
                model_card_word_count = extract_info_list[7]
                no_space_apps = extract_info_list[8]

                # Extract info after clicking the Community Tab
                click_community_list = click_community.click_comm(driver, link)
                community_count = click_community_list[0]
                discussions_count = click_community_list[1]
                discussions_closed_count = click_community_list[2]
                pull_request_count = click_community_list[3]
                pull_request_closed_count = click_community_list[4]

                # Extract info after clicking on the username link
                click_username_list = click_username.get_user_info(driver, link)
                info_text = click_username_list[0]
                name_text = click_username_list[1]

                # Extract info after clicking on the commits link
                click_commit_list = click_commit.click_files_and_versions(driver, link)
                number_of_commits = click_commit_list[0]
                latest_commit_date = click_commit_list[1]
                oldest_commit_date = click_commit_list[2]
                additional_requirements = click_commit_list[3]
                unique_commit_dates = click_commit_list[4]
                no_unique_commits = len(unique_commit_dates)

                # Extract info after clicking on the ARXIV tags
                click_arxiv_tags_list = click_arxiv_tags.get_arxiv_links(driver, link)
                arxiv_links = click_arxiv_tags_list[0]
                number_of_papers = len(arxiv_links)
                has_arxiv = 1 if number_of_papers > 0 else 0
                submission_dates = click_arxiv_tags_list[1]
                # "N/A" entries are passed through unparsed
                submission_dates_dt = [datetime.strptime(date, '%d %b %Y') if date != "N/A" else "N/A" for date in submission_dates]

                get_submission_date_list = get_submission_date.get_readme_info(driver, link)
                arxiv_links_2 = get_submission_date_list[0]  # read but not written to the CSV
                published_dates = get_submission_date_list[1]
                published_dates_dt = [datetime.strptime(date, '%Y-%m-%dT%H:%M:%S') if date != "N/A" else "N/A" for date in published_dates]
                readme_commit_dates = get_submission_date_list[2]
                no_unique_readme_commits = len(readme_commit_dates)

                # Day difference per (published, submitted) pair; "N/A" when
                # either side could not be parsed. zip() stops at the shorter list.
                time_differences = [
                    "N/A" if (isinstance(pub_date, str) or isinstance(sub_date, str) or pub_date is None or sub_date is None)
                    else (pub_date - sub_date).days
                    for pub_date, sub_date in zip(published_dates_dt, submission_dates_dt)
                ]

        except Exception as exc:
            # Only ordinary errors are swallowed: a bare `except:` would also
            # trap KeyboardInterrupt and make the run impossible to stop.
            # Fields filled before the failure are kept; the rest stay 'NA'.
            print(f"Unknown Error occured for the link {link} ({type(exc).__name__})")

        # len('NA') is 2, so the sentinel must not be counted as two sections.
        number_of_sections = 'NA' if description_names == 'NA' else len(description_names)

        results.append({
            'Model Link': link,
            'Valid Link?' : error_flag,
            'Model Name' : model_name,
            'Language of the Model' : language_tag,
            'Organization Tags' : info_text,
            'Name of Organization/Individual' : name_text,
            'Downloads All Time': downloads_all_time,
            'Likes' : likes_element,
            'Community Contributions' : community_count,
            'Number of Discussions' : discussions_count,
            'Closed Discussions' : discussions_closed_count,
            'Number of Pull Requests' : pull_request_count,
            'Closed Pull Requests' : pull_request_closed_count,
            'Has Arxiv Tag': has_arxiv,
            'Number of Papers' : number_of_papers,
            'Links to Paper(s)' : arxiv_links,
            'Publish Dates of Paper(s)' : submission_dates_dt,
            'Publish Dates of Paper(s) (on HF)' : published_dates_dt,
            'Time Difference' : time_differences,
            'Model Card': model_card,
            'Model Card Length' : model_card_word_count,
            'Number of Sections' : number_of_sections,
            'Model Card Section Names' : description_names,
            'Number of Commits': number_of_commits,
            'Latest Commit Date': latest_commit_date,
            'Oldest Commit Date': oldest_commit_date,
            'Unique Readme Commit Dates' : readme_commit_dates,
            'Number of Unique Readme Commit Dates' : no_unique_readme_commits,
            'Unique Commit Dates' : unique_commit_dates,
            'Number of Unique Commit Dates' : no_unique_commits,
            'Number of Space Apps' : no_space_apps,
            'Additional Requirements' : additional_requirements
        })

        # Save to CSV every split_number links processed, and once more for
        # whatever is left when the final row has been handled.
        if (index + 1) % split_number == 0 or index == last_index:
            output_df = pd.DataFrame(results)
            batch_number = batch_start_number + index // split_number
            output_csv = os.path.join(output_dir, f'model_info_batch_{batch_number}.csv')
            output_df.to_csv(output_csv, index=False)

            print(f"Finished processing batch {batch_number} and saved to {output_csv}")

            # Clear results list to start next batch
            results.clear()
finally:
    # Close the driver after completion, and also when the run is interrupted
    # or a batch cannot be written, so no Chrome process is left behind.
    driver.quit()


Logged in successfully!
No arxiv tags for the link: https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct
No arxiv tags for the link: https://huggingface.co/google/gemma-2-2b-it
Finished processing batch 1 and saved to Outputs\Text_generation_results_2\model_info_batch_1.csv
No arxiv tags for the link: https://huggingface.co/google/gemma-2-2b
No arxiv tags for the link: https://huggingface.co/meta-llama/Meta-Llama-3.1-405B
Finished processing batch 2 and saved to Outputs\Text_generation_results_2\model_info_batch_2.csv
No arxiv tags for the link: https://huggingface.co/meta-llama/Meta-Llama-3.1-8B
Finished processing batch 3 and saved to Outputs\Text_generation_results_2\model_info_batch_3.csv
Finished processing batch 4 and saved to Outputs\Text_generation_results_2\model_info_batch_4.csv
No arxiv tags for the link: https://huggingface.co/Writer/Palmyra-Med-70B-32K
No arxiv tags for the link: https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct
Finished processing batc

In [1]:
import os
import pandas as pd
from scholarly import scholarly
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from dotenv import load_dotenv
from openpyxl import load_workbook
import importlib
from datetime import datetime
import time

# Read the Hugging Face credentials from the environment (populated by
# python-dotenv); either value is None when its variable is not set.
load_dotenv()
username = os.environ.get("HUGGINGFACE_USERNAME")
password = os.environ.get("HUGGINGFACE_PASSWORD")

# Import files
import Python_scripts.login as login
import Python_scripts.extract_info as extract_info
import Python_scripts.click_commit_dates as click_commit
import Python_scripts.click_username as click_username
import Python_scripts.click_arxiv_tags as click_arxiv_tags
import Python_scripts.space_apps_info as space_apps_info
import Python_scripts.get_submission_date as get_submission_date
import Python_scripts.click_community as click_community
import Python_scripts.check_404_error as check_404_error
import Python_scripts.files_versions_info as files_versions_info
# Re-execute every project helper module so that edits made to the files under
# Python_scripts/ take effect without restarting the notebook kernel.
for _module in (
    login,
    extract_info,
    click_commit,
    click_username,
    click_arxiv_tags,
    space_apps_info,
    get_submission_date,
    click_community,
    check_404_error,
    files_versions_info,
):
    importlib.reload(_module)

# Batch settings (defined here but not used by this cell's final save).
split_number = 2
batch_start_number = 1

# Read the retry list (this cell reads "Outputs/Errors_retry.csv" rather than
# "Outputs/model_links-NLP.csv"), keep the "Text Generation" rows, renumber
# them from 0 and continue from row 2500 onwards.
nlp_links = (
    pd.read_csv("Outputs/Errors_retry.csv")
    .loc[lambda frame: frame["Tag"] == "Text Generation"]
    .reset_index(drop=True)
    .iloc[2500:]
)

# Chrome session; the "--headless" argument switches the visible window off.
chrome_options = Options()
chrome_options.add_argument("--headless")
service = Service(executable_path='Dependencies//chromedriver.exe')
driver = webdriver.Chrome(options=chrome_options, service=service)

login.login_hugging_face(driver, username, password)

# Rows collected during the run, and the initial flag.
error_flag = 0
results = []


# Loop for every link.
# The loop is wrapped in try/finally so that whatever has been collected is
# written to disk and the Chrome session is released even when the run is
# interrupted part-way through.
try:
    for index, row in nlp_links.iterrows():

        # Reset every field to the 'NA' sentinel so that a failure part-way
        # through a link leaves the remaining columns marked as not available.
        downloads_all_time = has_arxiv = model_card = language_tag = model_name = likes_element = description_names = model_card_word_count = no_space_apps = 'NA'
        community_count = discussions_count = discussions_closed_count = pull_request_count = pull_request_closed_count = 'NA'
        info_text = name_text = 'NA'
        number_of_commits = latest_commit_date = oldest_commit_date = additional_requirements = 'NA'
        arxiv_links = number_of_papers = submission_dates_dt = 'NA'
        published_dates_dt = readme_commit_dates = no_unique_readme_commits = time_differences = unique_commit_dates = no_unique_commits = 'NA'
        error_flag = 'NA'

        link = row['Model Link']
        # Progress marker every 50 rows
        if index % 50 == 0:
            print(index)

        try:
            error_flag = check_404_error.check_404_page(driver, link)
            if error_flag == 0:
                # Extract info from the link page
                extract_info_list = extract_info.extract_info(driver, link)
                downloads_per_month = extract_info_list[0]  # read but not written to the CSV
                downloads_all_time = extract_info_list[1]
                model_card = extract_info_list[2]
                language_tag = extract_info_list[3]
                model_name = extract_info_list[4]
                likes_element = extract_info_list[5]
                description_names = extract_info_list[6]
                model_card_word_count = extract_info_list[7]
                no_space_apps = extract_info_list[8]

                # Extract info after clicking the Community Tab
                click_community_list = click_community.click_comm(driver, link)
                community_count = click_community_list[0]
                discussions_count = click_community_list[1]
                discussions_closed_count = click_community_list[2]
                pull_request_count = click_community_list[3]
                pull_request_closed_count = click_community_list[4]

                # Extract info after clicking on the username link
                click_username_list = click_username.get_user_info(driver, link)
                info_text = click_username_list[0]
                name_text = click_username_list[1]

                # Extract info after clicking on the commits link
                click_commit_list = click_commit.click_files_and_versions(driver, link)
                number_of_commits = click_commit_list[0]
                latest_commit_date = click_commit_list[1]
                oldest_commit_date = click_commit_list[2]
                additional_requirements = click_commit_list[3]
                unique_commit_dates = click_commit_list[4]
                no_unique_commits = len(unique_commit_dates)

                # Extract info after clicking on the ARXIV tags
                click_arxiv_tags_list = click_arxiv_tags.get_arxiv_links(driver, link)
                arxiv_links = click_arxiv_tags_list[0]
                number_of_papers = len(arxiv_links)
                has_arxiv = 1 if number_of_papers > 0 else 0
                submission_dates = click_arxiv_tags_list[1]
                # "N/A" entries are passed through unparsed
                submission_dates_dt = [datetime.strptime(date, '%d %b %Y') if date != "N/A" else "N/A" for date in submission_dates]

                get_submission_date_list = get_submission_date.get_readme_info(driver, link)
                arxiv_links_2 = get_submission_date_list[0]  # read but not written to the CSV
                published_dates = get_submission_date_list[1]
                published_dates_dt = [datetime.strptime(date, '%Y-%m-%dT%H:%M:%S') if date != "N/A" else "N/A" for date in published_dates]
                readme_commit_dates = get_submission_date_list[2]
                # NOTE(review): the readme-commit count is disabled in this
                # cell, so 'Number of Unique Readme Commit Dates' is always
                # 'NA' here -- confirm this is intended.
                # no_unique_readme_commits = len(readme_commit_dates)

                # Day difference per (published, submitted) pair; "N/A" when
                # either side could not be parsed. zip() stops at the shorter list.
                time_differences = [
                    "N/A" if (isinstance(pub_date, str) or isinstance(sub_date, str) or pub_date is None or sub_date is None)
                    else (pub_date - sub_date).days
                    for pub_date, sub_date in zip(published_dates_dt, submission_dates_dt)
                ]

        except Exception as exc:
            # Only ordinary errors are swallowed: a bare `except:` would also
            # trap KeyboardInterrupt and make the run impossible to stop.
            # Fields filled before the failure are kept; the rest stay 'NA'.
            print(f"Unknown Error occured for the link {link} ({type(exc).__name__})")

        # len('NA') is 2, so the sentinel must not be counted as two sections.
        number_of_sections = 'NA' if description_names == 'NA' else len(description_names)

        results.append({
            'Model Link': link,
            'Valid Link?' : error_flag,
            'Model Name' : model_name,
            'Language of the Model' : language_tag,
            'Organization Tags' : info_text,
            'Name of Organization/Individual' : name_text,
            'Downloads All Time': downloads_all_time,
            'Likes' : likes_element,
            'Community Contributions' : community_count,
            'Number of Discussions' : discussions_count,
            'Closed Discussions' : discussions_closed_count,
            'Number of Pull Requests' : pull_request_count,
            'Closed Pull Requests' : pull_request_closed_count,
            'Has Arxiv Tag': has_arxiv,
            'Number of Papers' : number_of_papers,
            'Links to Paper(s)' : arxiv_links,
            'Publish Dates of Paper(s)' : submission_dates_dt,
            'Publish Dates of Paper(s) (on HF)' : published_dates_dt,
            'Time Difference' : time_differences,
            'Model Card': model_card,
            'Model Card Length' : model_card_word_count,
            'Number of Sections' : number_of_sections,
            'Model Card Section Names' : description_names,
            'Number of Commits': number_of_commits,
            'Latest Commit Date': latest_commit_date,
            'Oldest Commit Date': oldest_commit_date,
            'Unique Readme Commit Dates' : readme_commit_dates,
            'Number of Unique Readme Commit Dates' : no_unique_readme_commits,
            'Unique Commit Dates' : unique_commit_dates,
            'Number of Unique Commit Dates' : no_unique_commits,
            'Number of Space Apps' : no_space_apps,
            'Additional Requirements' : additional_requirements
        })
finally:
    try:
        # Write everything collected so far; after an interruption this file
        # holds the partial results instead of nothing at all.
        output_df = pd.DataFrame(results)
        output_csv = os.path.join("Outputs", 'Errors_retry_results_10.csv')
        output_df.to_csv(output_csv, index=False)
    finally:
        # Close the driver after completion, even if the CSV write failed
        driver.quit()


Logged in successfully!
2500
2550
Error navigating to 'Community' tab for https://huggingface.co/samaysk/megaspringllamaft
2600
No arxiv tags for the link: https://huggingface.co/Flemem/TunedLlama-7B
2650
Error navigating to 'Community' tab for https://huggingface.co/sequelbox/Llama2-13B-DaringFortitude
No arxiv tags for the link: https://huggingface.co/Trelis/falcon-40B-chat-SFT
2700
Error navigating to 'Community' tab for https://huggingface.co/RUCAIBox/rear-llama-7b-hf
No arxiv tags for the link: https://huggingface.co/ethz-spylab/poisoned_generation_trojan5
Error navigating to 'Community' tab for https://huggingface.co/CausalLM/7B
Error navigating to 'Community' tab for https://huggingface.co/solakim/Doug-Mistral-Chat
No arxiv tags for the link: https://huggingface.co/solakim/Doug-Mistral-Chat
2750
Error navigating to 'Community' tab for https://huggingface.co/ParasiticRogue/Nyakura-CausalLM-RP-34B
2800
Error navigating to 'Community' tab for https://huggingface.co/ParasiticRogue/N

In [3]:
import os
import pandas as pd
from scholarly import scholarly
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from dotenv import load_dotenv
from openpyxl import load_workbook
import importlib
from datetime import datetime
import time

# Read the Hugging Face credentials from the environment (populated by
# python-dotenv); either value is None when its variable is not set.
load_dotenv()
username = os.environ.get("HUGGINGFACE_USERNAME")
password = os.environ.get("HUGGINGFACE_PASSWORD")

# Import files
import Python_scripts.login as login
import Python_scripts.extract_info as extract_info
import Python_scripts.click_commit_dates as click_commit
import Python_scripts.click_username as click_username
import Python_scripts.click_arxiv_tags as click_arxiv_tags
import Python_scripts.space_apps_info as space_apps_info
import Python_scripts.get_submission_date as get_submission_date
import Python_scripts.click_community as click_community
import Python_scripts.check_404_error as check_404_error
import Python_scripts.files_versions_info as files_versions_info
# Re-execute every project helper module so that edits made to the files under
# Python_scripts/ take effect without restarting the notebook kernel.
for _module in (
    login,
    extract_info,
    click_commit,
    click_username,
    click_arxiv_tags,
    space_apps_info,
    get_submission_date,
    click_community,
    check_404_error,
    files_versions_info,
):
    importlib.reload(_module)

# Batch settings (defined here but not used by the loop in this cell).
batch_start_number = 1
split_number = 2

# Chrome session with the window visible: the headless switch is left
# disabled in this cell.
chrome_options = Options()
# chrome_options.add_argument("--headless")
service = Service(executable_path='Dependencies//chromedriver.exe')
driver = webdriver.Chrome(options=chrome_options, service=service)

login.login_hugging_face(driver, username, password)

# Rows collected during the run, and the initial flag.
error_flag = 0
results = []

# Hard-coded model pages to inspect
links = [
    "https://huggingface.co/bartowski/Aethora-7b-v1-exl2",
    "https://huggingface.co/cgus/LLaMA2-13B-Erebus-v3-exl2",
]


# Visit each hard-coded link in turn
for index, link in enumerate(links):

    # Start every iteration with the 'NA' placeholder in all fields
    (downloads_all_time, has_arxiv, model_card, language_tag, model_name,
     likes_element, description_names, model_card_word_count,
     no_space_apps) = ('NA',) * 9
    (community_count, discussions_count, discussions_closed_count,
     pull_request_count, pull_request_closed_count) = ('NA',) * 5
    info_text, name_text = 'NA', 'NA'
    (number_of_commits, latest_commit_date, oldest_commit_date,
     additional_requirements) = ('NA',) * 4
    arxiv_links, number_of_papers, submission_dates_dt = ('NA',) * 3
    (published_dates_dt, readme_commit_dates, no_unique_readme_commits,
     time_differences, unique_commit_dates, no_unique_commits) = ('NA',) * 6
    error_flag = 'NA'

    # Progress marker every 50 links
    if index % 50 == 0:
        print(index)

    # Only links whose 404 check returns 0 are processed further
    error_flag = check_404_error.check_404_page(driver, link)
    if error_flag != 0:
        continue

    # Main model page. This cell reads ten positions from the result, with
    # has_arxiv at position 2 and language_tag at position 4.
    extract_info_list = extract_info.extract_info(driver, link)
    downloads_per_month, downloads_all_time = extract_info_list[0], extract_info_list[1]
    has_arxiv, model_card = extract_info_list[2], extract_info_list[3]
    language_tag, model_name = extract_info_list[4], extract_info_list[5]
    likes_element, description_names = extract_info_list[6], extract_info_list[7]
    model_card_word_count, no_space_apps = extract_info_list[8], extract_info_list[9]

    # Community tab: overall count, then discussion and pull-request counts
    click_community_list = click_community.click_comm(driver, link)
    community_count, discussions_count = click_community_list[0], click_community_list[1]
    discussions_closed_count = click_community_list[2]
    pull_request_count, pull_request_closed_count = click_community_list[3], click_community_list[4]


Logged in successfully!
0
