In [8]:
def generate_complete_list_of_journals():
    """Build ``complete_list_of_journals.txt`` from the local Scimago exports.

    Every file matching ``scimago*.csv`` in the current working directory is
    read as a semicolon-separated table (malformed lines are skipped). The
    ``Title`` columns of all files are combined, de-duplicated, sorted, and
    written one title per row, without header or index.
    """
    import glob
    import pandas as pd

    frames = [
        pd.read_csv(path, sep=";", on_bad_lines="skip")
        for path in glob.glob("scimago*.csv")
    ]

    # Only the journal title is of interest; every other column is dropped.
    titles = pd.concat(frames)[["Title"]]
    titles = titles.drop_duplicates().sort_values(by="Title")

    titles.to_csv("complete_list_of_journals.txt", index=False, header=False)


# generate_complete_list_of_journals()

In [35]:
def process_candidates():
    """Split the candidate records into one file per new source title.

    All ``candidates/*.csv`` files are read as comma-separated tables
    (malformed lines are skipped) and concatenated. For every distinct value
    of the ``Source title`` column that does not already have a file in the
    ``journals`` directory, the matching rows are written to
    ``journals/<title>.csv``, with ``/`` in the title replaced by a space.

    The output is written with ``compression="zip"`` while keeping the
    ``.csv`` extension, so the files are zip archives despite their name.

    Titles that already have a file are left untouched. Rows whose
    ``Source title`` is missing are ignored. If there are no candidate files
    the function returns without doing anything.
    """
    import glob
    import os

    import pandas as pd

    #
    # Extract existent source titles: bare file names, without the
    # "journals/" directory prefix and without the ".csv" extension, so they
    # are comparable with the file names generated below.
    existent_titles = {
        os.path.splitext(os.path.basename(path))[0]
        for path in glob.glob("journals/*.csv")
    }

    #
    # Load candidate records
    file_names = glob.glob("candidates/*.csv")
    if not file_names:
        # pd.concat raises on an empty list; nothing to process.
        return

    records = pd.concat(
        [
            pd.read_csv(file_name, sep=",", on_bad_lines="skip")
            for file_name in file_names
        ]
    )

    #
    # Extract candidate source titles (missing titles cannot name a file)
    candidate_titles = records["Source title"].dropna().drop_duplicates()

    #
    # For each new candidate title, filter the records and save the dataframe
    # to a separate CSV file in the journals directory
    os.makedirs("journals", exist_ok=True)
    for candidate_title in candidate_titles:
        # Compare using the same sanitized form that is used as file name.
        file_title = str(candidate_title).replace("/", " ")
        if file_title in existent_titles:
            continue

        records[records["Source title"] == candidate_title].to_csv(
            f"journals/{file_title}.csv",
            index=False,
            sep=",",
            compression="zip",
        )


def remove_downloaded_journals():
    """Remove already-downloaded journals from the pending list.

    Reads ``complete_list_of_journals.txt`` (one title per row, no header)
    and rewrites it without the titles that already have a file in the
    ``journals`` directory. A title counts as downloaded when
    ``journals/<title>.csv`` exists, with ``/`` in the title replaced by a
    space.

    The list is written back with CSV quoting, so titles containing commas
    or quotes survive the next ``pd.read_csv`` of the same file. An empty
    list file is left as it is.

    Raises:
        FileNotFoundError: if ``complete_list_of_journals.txt`` is missing.
    """
    import glob
    import os

    import pandas as pd

    list_path = "complete_list_of_journals.txt"

    #
    # Load the complete list of journals
    try:
        records = pd.read_csv(
            list_path,
            header=None,
            on_bad_lines="skip",
            # Keep titles verbatim: no numeric conversion and no "NA"-style
            # strings silently turned into missing values.
            dtype=str,
            keep_default_na=False,
        )
    except pd.errors.EmptyDataError:
        # The list is already empty; nothing left to remove.
        return
    records.columns = ["Title"]
    complete_titles = [title for title in records["Title"] if title]

    #
    # Extract existent source titles: file names without directory prefix
    # and without the ".csv" extension
    downloaded_titles = {
        os.path.splitext(os.path.basename(path))[0]
        for path in glob.glob("journals/*.csv")
    }

    remaining_titles = [
        title
        for title in complete_titles
        if title.replace("/", " ") not in downloaded_titles
    ]

    #
    # Write through pandas so the quoting matches what read_csv expects on
    # the next run (raw lines would break titles that contain commas).
    pd.DataFrame({"Title": remaining_titles}, dtype=object).to_csv(
        list_path, index=False, header=False
    )


# Run the pipeline: first write per-journal files for the candidate records,
# then rewrite the pending list without the journals that have a file.
process_candidates()
remove_downloaded_journals()

In [37]:
def zip_files():
    """Replace every ``journals/*.csv`` file with a zip-compressed copy.

    Each matching file is loaded with pandas, written next to the original
    as ``<name>.csv.zip`` (zip compression, no index column), and the
    original file is then deleted.
    """
    import glob
    import pandas as pd
    import os

    for csv_path in glob.glob("journals/*.csv"):
        archive_path = csv_path + ".zip"
        frame = pd.read_csv(csv_path)
        frame.to_csv(archive_path, index=False, compression="zip")
        # The compressed copy is on disk, so the original can go.
        os.remove(csv_path)


# zip_files()