In [None]:
#Install and import required libraries

%pip install pandas ftfy keybert scikit-learn thefuzz

import re
import unicodedata

import ftfy
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from thefuzz import fuzz
from thefuzz import process

In [None]:
def load_and_format_data(pubmed_path, scopus_path, scholar_path, keywords_path):
    """
    Loads and formats publication data from PubMed, Scopus, and Google Scholar,
    and loads keyword data generated from an NLP algorithm.

    The PubMed "Full Name" column is renamed to "Name"; the Google Scholar
    "Venue" and "keywords" columns are renamed to "Journal" and "Keywords".
    Google Scholar keyword cells have their square brackets and single quotes
    removed and their commas replaced with semicolons. Missing keyword cells
    become an empty string rather than the literal text "nan".

    Args:
        pubmed_path (str): Path to the PubMed CSV file.
        scopus_path (str): Path to the Scopus CSV file.
        scholar_path (str): Path to the Google Scholar CSV file.
        keywords_path (str): Path to the keywords CSV file.

    Returns:
        tuple: (pubmed_df, scopus_df, scholar_df, keywords_df) — formatted DataFrames.
    """
    # Load data from CSVs and standardize column names
    pubmed = pd.read_csv(pubmed_path).rename(columns={"Full Name": "Name"})
    scopus = pd.read_csv(scopus_path)
    google_scholar = pd.read_csv(scholar_path).rename(
        columns={"Venue": "Journal", "keywords": "Keywords"}
    )
    our_keywords = pd.read_csv(keywords_path)

    # One translation pass: drop "[", "]" and "'", turn "," into ";".
    list_syntax = str.maketrans({"[": None, "]": None, "'": None, ",": ";"})

    def format_gs_keywords(keywords_list):
        # A missing cell is read as NaN; str(NaN) would otherwise yield the
        # bogus keyword "nan".
        if pd.api.types.is_scalar(keywords_list) and pd.isna(keywords_list):
            return ""
        return str(keywords_list).translate(list_syntax)

    google_scholar['Keywords'] = google_scholar['Keywords'].apply(format_gs_keywords)

    return pubmed, scopus, google_scholar, our_keywords

def clean_article_title(title):
    """
    Repairs encoding damage in an article title and drops trailing periods.

    The title is first passed through ftfy.fix_text. A fixed list of garbled
    character sequences is then substituted, in order, with the character
    each one stands for. Finally, any run of "." at the end is removed.
    """
    # (garbled sequence, intended character), applied in this order.
    mojibake_fixes = (
        ("Œ±", "α"),
        ("Œ≤", "β"),
        ("Œº", "μ"),
        ("Œî", "Δ"),
        ("‚Äò", "'"),
        ("‚Äù", '"'),
        ("‚Äú", '"'),
        ("‚Äì", "-"),
        ("‚Äô", "'"),
    )

    cleaned = ftfy.fix_text(title)
    for garbled, intended in mojibake_fixes:
        cleaned = cleaned.replace(garbled, intended)
    return cleaned.rstrip('.')

def clean_name(name):
    """
    Cleans and standardizes author names by normalizing unicode and formatting.

    Steps, in order:
      1. Decompose accented characters (NFKD) and drop anything non-ASCII.
      2. Collapse runs of whitespace into single spaces.
      3. Keep only the text before the first comma, if there is one.
      4. Trim leading/trailing whitespace.
      5. Title-case the result.
      6. Add a period after a lone capital letter that is followed by a
         capitalized word (e.g. "John A Smith" -> "John A. Smith").

    Args:
        name (str): Raw author name.

    Returns:
        str: The cleaned name.
    """
    name = unicodedata.normalize("NFKD", name).encode("ASCII", "ignore").decode("utf-8")
    name = re.sub(r'\s+', ' ', name)
    if ',' in name:
        # NOTE(review): for "Last, First" input this keeps only the surname;
        # presumably meant to drop suffixes such as ", MD" — confirm.
        name = name.split(',', 1)[0]
    # Trim after the comma split so "Smith , MD" does not leave a trailing space.
    name = name.strip()
    name = name.title()
    name = re.sub(r'\b([A-Z]) (?=[A-Z][a-z])', r'\1. ', name)
    return name

def normalize_name(name):
    """
    Collapses a name with three or more whitespace-separated parts down to
    "<first part> <last part>".

    Names with two parts or fewer are returned exactly as given, including
    any surrounding or repeated whitespace.
    """
    tokens = name.split()
    if len(tokens) <= 2:
        return name
    first, *_, last = tokens
    return f"{first} {last}"

def standardize_name(name, name_list):
    """
    Finds the closest match for a name in a provided name list using fuzzy matching.

    The name is first reduced with normalize_name, then compared against every
    entry of name_list with the fuzz.ratio scorer.

    Args:
        name (str): Name to standardize.
        name_list (list[str]): Candidate names to match against.

    Returns:
        str: The best-scoring entry of name_list, or the normalized input name
        when no candidate is available (e.g. name_list is empty).
    """
    name = normalize_name(name)
    match = process.extractOne(name, name_list, scorer=fuzz.ratio)
    if match is None:
        # extractOne yields None when there is nothing to choose from;
        # fall back to the normalized name instead of failing on match[0].
        return name
    return match[0]

def merge_and_clean_data(pubmed, scopus, google_scholar):
    """
    Merges and consolidates data from PubMed, Scopus, and Google Scholar,
    filling in missing data and standardizing columns.

    The three frames are outer-joined on "Title". Columns shared by PubMed and
    Scopus get "_pubmed" / "_scopus" suffixes; Google Scholar columns that
    keep their own name after the join get a "_google_scholar" suffix. Missing
    values in "*pubmed*" columns are filled from the matching Scopus column,
    then from the matching Google Scholar column. Selected columns are renamed
    to their final names, remaining gaps become "" (0 for "Year"), DOI cells
    containing "DOI not found" are blanked, and a combined "Keywords" column
    is built from the three per-source keyword columns.

    Args:
        pubmed (pd.DataFrame): PubMed publications, with a "Title" column.
        scopus (pd.DataFrame): Scopus publications, with a "Title" column.
        google_scholar (pd.DataFrame): Google Scholar publications, with a
            "Title" column.

    Returns:
        pd.DataFrame: The merged frame, including "Year" as int and the
        combined "Keywords" column.

    Raises:
        KeyError: If the merged frame has no "Year" or "DOI" column after
            renaming.
    """
    merged_df = pd.merge(pubmed, scopus, how="outer", on="Title", suffixes=('_pubmed', '_scopus'))
    merged_df = merged_df.merge(google_scholar, on="Title", how="outer")

    # Tag the Google Scholar columns in a single rename.
    merged_df = merged_df.rename(columns={
        col: f"{col}_google_scholar" for col in google_scholar.columns if col != "Title"
    })

    # Fill missing PubMed fields using Scopus and then Google Scholar
    pubmed_columns = merged_df.columns[merged_df.columns.str.contains('pubmed')]
    for pubmed_column in pubmed_columns:
        scopus_column = pubmed_column.replace("pubmed", "scopus")
        if scopus_column in merged_df.columns:
            merged_df[pubmed_column] = merged_df[pubmed_column].combine_first(merged_df[scopus_column])

    # NOTE(review): the last "*pubmed*" column is deliberately left out of the
    # Google Scholar fill, as in the original code. Presumably that column is
    # the keywords column — confirm against the real CSV column order.
    for pubmed_column in pubmed_columns[:-1]:
        gs_column = pubmed_column.replace("pubmed", "google_scholar")
        if gs_column in merged_df.columns:
            merged_df[pubmed_column] = merged_df[pubmed_column].combine_first(merged_df[gs_column])

    merged_df = merged_df.rename(columns={
        "Name_pubmed": "Name",
        "DOI_pubmed": "DOI",
        "Year_pubmed": "Year",
        "Journal_pubmed": "Journal",
        "Scopus ID_scopus": "Scopus ID",
        "ORCID_scopus": "ORCID",
        "Author ID_google_scholar": "Google Scholar ID",
    })

    # Non-inplace fillna: putting "" into numeric columns needs an upcast to
    # object, which the in-place form is deprecated for.
    merged_df = merged_df.fillna({"Year": 0}).fillna("")
    merged_df["Year"] = merged_df["Year"].astype(int)

    # Column-wise apply keeps working on an empty frame (row-wise apply would
    # return a DataFrame there and break the assignment).
    merged_df["DOI"] = merged_df["DOI"].apply(
        lambda doi: "" if "DOI not found" in doi else doi
    )

    # Combine keywords from all sources. Empty entries are skipped, and a
    # source whose keyword string repeats an earlier one is included once:
    # the PubMed column may have been back-filled from Scopus above, which
    # would otherwise list the same keywords twice.
    keyword_columns = [
        col
        for col in ("Keywords_pubmed", "Keywords_scopus", "Keywords_google_scholar")
        if col in merged_df.columns
    ]
    if keyword_columns:
        merged_df["Keywords"] = [
            "; ".join(dict.fromkeys(filter(None, per_source)))
            for per_source in zip(*(merged_df[col] for col in keyword_columns))
        ]
    else:
        merged_df["Keywords"] = ""

    return merged_df

def finalize_dataset(merged_df):
    """
    Final cleaning steps: column selection, filtering, title and name
    cleaning, deduplication, and name standardization.

    Rows with Year == 0 are dropped, titles and names are cleaned with
    clean_article_title / clean_name, duplicate (Title, Name) pairs are
    removed, and rows with neither a Scopus ID nor an ORCID are discarded.
    A "standardized_name" column is then added by fuzzy-matching each name
    against the normalized names present in the data.

    This function does not write anything to disk; the caller exports the
    result.

    Args:
        merged_df (pd.DataFrame): Output of merge_and_clean_data.

    Returns:
        pd.DataFrame: The cleaned dataset with a "standardized_name" column.

    Raises:
        KeyError: If any of the selected columns is missing from merged_df.
    """
    # Select and clean columns
    columns = [
        "Name", "Title", "Year", "Journal", "Type",
        "Keywords", "Abstract", "DOI", "Article Affiliation",
        "ORCID", "Scopus ID", "Google Scholar ID"
    ]
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of merged_df (SettingWithCopyWarning).
    final_df = merged_df.loc[merged_df["Year"] != 0, columns].copy()

    final_df["Title"] = final_df["Title"].apply(clean_article_title)
    final_df["Name"] = final_df["Name"].apply(clean_name)
    final_df = final_df.drop_duplicates(subset=["Title", "Name"])
    has_author_id = (final_df["Scopus ID"] != "") | (final_df["ORCID"] != "")
    final_df = final_df[has_author_id].copy()

    # Standardize names
    name_list = final_df['Name'].apply(normalize_name).unique().tolist()
    # Fuzzy-match each distinct name once and reuse the result for every row
    # carrying that name, instead of re-matching per row.
    standardized = {
        name: standardize_name(name, name_list)
        for name in final_df['Name'].unique()
    }
    final_df['standardized_name'] = final_df['Name'].map(standardized)

    return final_df

def main():
    """
    Runs the whole pipeline: reads the four source CSVs, merges and cleans
    them, finalizes the dataset, and writes it to final_dataset.csv.
    """
    # Input CSV locations, keyed by the loader's parameter names.
    source_paths = {
        "pubmed_path": '/combine orcid scopus final.csv',
        "scopus_path": '/scopus_publications_2:4.csv',
        "scholar_path": '/google_scholar_updated.csv',
        "keywords_path": '/keyword_updated_projects.csv',
    }

    # The keywords frame is loaded but not used by the steps below.
    pubmed, scopus, google_scholar, _keywords = load_and_format_data(**source_paths)

    # Merge and clean, then finalize
    combined = merge_and_clean_data(pubmed, scopus, google_scholar)
    final_df = finalize_dataset(combined)

    # Export
    final_df.to_csv('final_dataset.csv', index=False)
    print("Dataset saved to final_dataset.csv")

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()