In [None]:
import sinta
import logging
from scholarly import scholarly
from scholarly import MaxTriesExceededException
import json
from pathlib import Path

In [None]:
import pandas as pd
from datetime import datetime
#pd.set_option('display.max_rows', None)

# Define a function to extract and format the birth date from the NIDN
def extract_birth_date(nidn):
    """Derive a ``dd/mm/yyyy`` birth date string from an NIDN.

    The six characters ``nidn[2:8]`` are read as ``ddmmyy``; the two-digit
    year is prefixed with ``19`` (i.e. every birth year is assumed to be 19xx).

    Parameters
    ----------
    nidn : str or None
        The NIDN as text. Missing values (``None``/``NaN``) are accepted.

    Returns
    -------
    str or None
        The birth date formatted as ``dd/mm/yyyy``, or ``None`` when the NIDN
        is missing, shorter than 8 characters, or does not carry digits in the
        date positions.
    """
    # Check if the NIDN is not available or not long enough
    if pd.isnull(nidn) or len(nidn) < 8:
        return None
    date_part = nidn[2:8]
    # A non-numeric slice would produce a malformed date whose year part makes
    # calculate_age fail on int(); treat such an NIDN as "no birth date".
    if not date_part.isdecimal():
        return None
    day = date_part[0:2]
    month = date_part[2:4]
    year = '19' + date_part[4:6]  # Assuming the year is in 19xx
    # Format the birth date as dd/mm/yyyy
    return f"{day}/{month}/{year}"

# Define a function to calculate the age from the birth date
def calculate_age(birth_date):
    """Return the age in whole calendar years for a ``dd/mm/yyyy`` date string.

    Only the year component (the text after the last ``/``) is used: the age
    is the current year minus the birth year, so month and day are ignored.
    A missing ``birth_date`` (``None``/``NaN``) yields ``None``.
    """
    if pd.isnull(birth_date):
        return None
    # The year is whatever follows the final slash
    year_text = birth_date.rsplit('/', 1)[-1]
    born_in = int(year_text)
    this_year = datetime.now().year
    return this_year - born_in

In [None]:
# Affiliation code; it selects the raw-data folder and (lower-cased) the file prefix
AFFILIATION = "UB"

nidn_path = f"../data/raw/{AFFILIATION}/{AFFILIATION.lower()}_data_NIDN.txt"
retired_path = f"../data/raw/{AFFILIATION}/{AFFILIATION.lower()}_bio_retired.txt"

# 'id' and 'NIDN' are read as text rather than being parsed as numbers
df_nidn = pd.read_csv(nidn_path, dtype={'id': str, 'NIDN': str})
df_retired = pd.read_csv(retired_path)
df_retired

In [None]:
# Keep only staff whose name is not listed in the 'Retired' column, then derive
# 'birth_date' from the NIDN and 'age' from that birth date.
# The chain builds a new frame at every step instead of assigning columns onto
# a filtered slice of df_nidn, which avoids pandas' SettingWithCopyWarning and
# keeps the cell idempotent on re-run.
df = (
    df_nidn[~df_nidn["name"].isin(df_retired["Retired"])]
    .assign(birth_date=lambda frame: frame["NIDN"].apply(extract_birth_date))
    .assign(age=lambda frame: frame["birth_date"].apply(calculate_age))
    .rename(columns={"name": "name_inputted"})
)
# Display the DataFrame
df

In [None]:
# Look up every non-missing id through sinta.author
sinta_ids = df.id.dropna().to_list()
sinta_get = sinta.author(sinta_ids)

In [None]:
df_sinta = pd.DataFrame.from_dict(sinta_get)

# Join the input rows with the SINTA rows on 'id', index the result by 'id',
# and tag the SINTA-provided name/affiliation columns with a '_sinta' suffix.
sinta_column_names = {"name": "name_sinta", "affiliation": "affiliation_sinta"}
df_clean = (
    df.merge(df_sinta, left_on="id", right_on="id")
    .set_index("id")
    .rename(columns=sinta_column_names)
)

In [None]:
# Collect one Google Scholar record (a dict, possibly empty) per SINTA id.
# Each successfully verified profile is cached as JSON on disk so that
# re-running the cell does not query Google Scholar again for that author.
google_result = {}

# All cache files share one directory, so create it once instead of per author.
cache_dir = Path(f"../data/processed/{AFFILIATION}/google_scholar")
cache_dir.mkdir(exist_ok=True, parents=True)

for sinta_id in df_clean.index:
    google_id = df_clean.loc[sinta_id, "google_scholar_id"]
    author_name = df_clean.loc[sinta_id, "name_inputted"]
    outfile = cache_dir / f"{sinta_id}.json"

    if outfile.exists():
        logging.debug(f"Fetching scholar information for sinta_id: {sinta_id} from cache...")
        with open(outfile, "r") as f:
            author = json.load(f)
    elif pd.isnull(google_id) or not str(google_id).strip():
        # pd.isnull also catches NaN, which `== None` lets through; a blank id
        # can never resolve either, so it is treated as missing as well.
        logging.warning(f"Unable to get google scholar id for {sinta_id}")
        author = {}
    else:
        logging.debug(f"Fetching scholar information for sinta_id: {sinta_id} using google scholar API: {google_id}...")
        try:
            author = scholarly.search_author_id(google_id)
        except MaxTriesExceededException as e:
            # Fall back to a search by name and take the first hit, if any.
            logging.warning(e)
            logging.debug(f"Searching scholar information for: {author_name} using google scholar API...")
            author_query = scholarly.search_author(author_name)
            # A default avoids StopIteration aborting the whole loop when the
            # name search returns no result.
            author = next(author_query, None)

        if author is None:
            logging.warning(f"No google scholar profile found for {author_name} (sinta_id: {sinta_id})")
            author = {}
        # sanity check: accept the profile only when its name matches the
        # inputted name (case-insensitive). The e-mail domain is not checked here.
        elif author.get("name", "").lower() == author_name.lower():
            author = scholarly.fill(author, sections=[])
            with open(outfile, "w") as f:
                json.dump(author, f, indent=2)
        else:
            # Name mismatch: keep an empty record and do not cache it.
            author = {}

    google_result[sinta_id] = author

In [None]:
# from_dict puts one column per SINTA id; transposing gives one row per id
scholar_by_column = pd.DataFrame.from_dict(google_result)
df_google = scholar_by_column.transpose()

In [None]:
# Index-on-index join of the SINTA-enriched rows with the Google Scholar rows
df_final = pd.merge(
    df_clean,
    df_google,
    left_index=True,
    right_index=True,
)

In [None]:
# Write the transposed frame (one column per row of df_final) as indented JSON
output_path = f"../data/processed/{AFFILIATION}/life_science.json"
df_final.transpose().to_json(output_path, indent=2)

In [None]:
# Sanity check: show the rows whose 'email_domain' value is missing
has_no_domain = df_final["email_domain"].isna()
df_final[has_no_domain]

In [None]:
# Sanity check: list the distinct e-mail domains that do not contain the
# lower-cased affiliation code. Non-string entries are skipped.
suspicious_email_domain = [
    domain
    for domain in df_final.email_domain.unique()
    if type(domain) is str and AFFILIATION.lower() not in domain
]
suspicious_email_domain

In [None]:
# Show the identifying columns for every row with a suspicious e-mail domain
review_columns = ["name_inputted", "name", "email_domain", 'scholar_id', 'affiliation', 'interests', 'homepage']
has_suspicious_domain = df_final.email_domain.isin(suspicious_email_domain)
df_final[has_suspicious_domain].loc[:, review_columns]