# In [None]:
# Import necessary libraries
import pandas as pd
import re

def load_and_prepare_pubmed(filepath):
    """Read the PubMed CSV at ``filepath`` and return it as a DataFrame.

    No columns are renamed or altered; the frame is returned exactly as
    ``pandas.read_csv`` parses it.
    """
    pubmed_frame = pd.read_csv(filepath)
    return pubmed_frame

def load_and_prepare_orcid(filepath):
    """Read the ORCID CSV at ``filepath`` and return it with renamed headers.

    The lower-case ORCID column names listed below are mapped to the
    capitalised names used elsewhere in this file; columns not listed in the
    mapping are left untouched.
    """
    column_aliases = {
        'title': 'Title',
        'author': 'Full Name',
        'journal_title': 'Journal',
        'year': 'Year',
        'keywords': 'Keywords',
    }
    return pd.read_csv(filepath).rename(columns=column_aliases)

def clean_keywords_column(keywords):
    """Normalise one 'Keywords' cell into a semicolon-separated string.

    Square brackets and both single and double quote characters are removed,
    every comma becomes a semicolon, and surrounding whitespace is trimmed.

    Args:
        keywords: The raw cell value. Anything that is not a ``str`` (for
            example a NaN float from a missing cell) is treated as empty.

    Returns:
        The cleaned string, or ``""`` when ``keywords`` is not a string.
    """
    if not isinstance(keywords, str):
        return ""
    # Double quotes must be stripped as well as single quotes: a stringified
    # Python list quotes any item containing an apostrophe with double
    # quotes, e.g. ["Alzheimer's disease", 'dementia'].
    without_wrappers = re.sub(r"[\[\]'\"]", "", keywords)
    return without_wrappers.replace(",", ";").strip()

def merge_and_clean_data(pubmed_df, orcid_df):
    """Stack two record frames, drop duplicates, and clean the keywords.

    Args:
        pubmed_df: DataFrame of PubMed records.
        orcid_df: DataFrame of ORCID records; expected to use the same
            column names as ``pubmed_df`` so the rows line up.

    Returns:
        A new DataFrame holding the rows of both inputs with exact duplicate
        rows removed, only the first row kept for each repeated 'Title', and
        the 'Keywords' column passed through ``clean_keywords_column``.
        The index is not reset after rows are dropped.

    Raises:
        KeyError: If the combined frame has no 'Title' or 'Keywords' column.
    """
    # Combine the two dataframes
    merged_df = pd.concat([pubmed_df, orcid_df], ignore_index=True)

    # Drop rows that are identical in every column
    merged_df = merged_df.drop_duplicates()

    # Drop later rows that repeat an earlier title. pandas treats missing
    # values as equal to each other when looking for duplicates, so rows
    # without a title are exempted explicitly; otherwise every untitled
    # record after the first would be discarded.
    title_missing = merged_df["Title"].isna()
    title_repeated = merged_df.duplicated(subset="Title", keep="first")
    # .copy() gives an independent frame for the column assignment below.
    merged_df = merged_df[title_missing | ~title_repeated].copy()

    # Clean the 'Keywords' column
    merged_df["Keywords"] = merged_df["Keywords"].apply(clean_keywords_column)

    return merged_df

def main():
    """Run the pipeline: read both CSV sources, merge them, write the result.

    Input and output paths are fixed relative paths; a one-line summary with
    the final row count is printed after the merged CSV is written.
    """
    output_path = 'Merged with both pubmed and orcid.csv'

    # Read the two source exports
    pubmed_records = load_and_prepare_pubmed('Pubmed/Cleaned Up Pubmed.csv')
    orcid_records = load_and_prepare_orcid('ORCID/Cleaned Up ORCID.csv')

    # Combine, de-duplicate, and tidy the keywords
    combined = merge_and_clean_data(pubmed_records, orcid_records)

    # Write the result without the index column, then report the row count
    combined.to_csv(output_path, index=False)
    row_count = len(combined)
    print(f"✅ Successfully saved merged dataset with {row_count} rows.")

# Run the pipeline only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()