# Match not found Professors

In [22]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import pandas as pd
from sentence_transformers import SentenceTransformer, util


## Retrieve all the authors

In [13]:
df = pd.read_csv("../data/authors/authors_data_science.csv")
import requests

def search_orcid_bicocca(given_name, family_name):
    base_url = "https://pub.orcid.org/v3.0/expanded-search/"
    query = f"given-names:{given_name} AND family-name:{family_name}"

    headers = {
        "Accept": "application/json"
    }

    params = {
        "q": query
    }

    response = requests.get(base_url, headers=headers, params=params)

    if response.status_code != 200:
        print(f"Errore nella richiesta: {response.status_code}")
        return None

    results = response.json().get("expanded-result", [])

    if not results:
        print("Nessun risultato trovato.")
        return None

    for res in results:
        affiliations = res.get("institution-name", [])
        if any("bicocca" in aff.lower() for aff in affiliations):
            full_name = res.get("given-names", "") + " " + res.get("family-name", "")
            orcid = res.get("orcid-id", "N/A")
            print(f"Trovato: {full_name} -> ORCID: {orcid}")
            return orcid

    print("Nessun ORCID trovato associato alla Bicocca.")
    return None

In [14]:
for index, row in df.iterrows():
        family_name = row['Last name']
        given_name = row['Name']
        orcid = search_orcid_bicocca(given_name, family_name)
    
        df.loc[index, "orcid"] = orcid

Nessun risultato trovato.
Nessun ORCID trovato associato alla Bicocca.
Trovato: Simone  -> ORCID: 0000-0002-7070-1545
Trovato: Pier Giovanni  -> ORCID: 0000-0003-3769-6649
Nessun ORCID trovato associato alla Bicocca.
Trovato: Davide  -> ORCID: 0000-0001-9655-7142
Trovato: Michele  -> ORCID: 0000-0002-2480-966X
Trovato: Gianluca  -> ORCID: 0000-0001-5584-3089
Nessun ORCID trovato associato alla Bicocca.
Nessun ORCID trovato associato alla Bicocca.
Nessun ORCID trovato associato alla Bicocca.
Nessun ORCID trovato associato alla Bicocca.
Trovato: Gianfranco  -> ORCID: 0000-0002-3412-4162
Trovato: Gabriele  -> ORCID: 0000-0001-5186-0199
Trovato: marco  -> ORCID: 0000-0001-7415-0771
Trovato: Andrea  -> ORCID: 0000-0001-9803-3668
Trovato: Fabio  -> ORCID: 0000-0001-6864-2702
Trovato: Enza  -> ORCID: 0000-0002-4062-0824
Trovato: gianna  -> ORCID: 0000-0002-7952-3618
Trovato: Enrico  -> ORCID: 0000-0002-3436-1395
Trovato: Paolo  -> ORCID: 0000-0001-9112-0574
Trovato: Marco  -> ORCID: 0000-0003

In [4]:
not_found = df[df["orcid"].isna()]
not_found

Unnamed: 0,Last name,Name,SSD,orcid
0,Andreotti,Alberta Argia,SPS/09,
1,Bernasconi,Davide Paolo,MED/01,
4,Cesarini,Mirko,ING-INF/05,
8,Di Domenica,Nico,SECS-P/08,
9,Fattore,Marco,SECS-S/03,
10,Ferretti,Claudio,INF/01,
11,Fersini,Elisabetta,INF/01,
28,Rebora,Paola,MED/01,
29,Stella,Fabio Antonio,MAT/09,


### Scrape Information of not found professors

In [5]:
def generate_profile_url(name, surname):
    """Genera lo slug URL del profilo unimib in base al nome e cognome."""
    full_name = f"{name} {surname}".lower()
    slug = "-".join(full_name.split())
    return f"https://www.unimib.it/{slug}"

def scrape_full_profile(url):
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"⚠️ Errore {response.status_code} per URL: {url}")
            return {}
    except Exception as e:
        print(f"❌ Richiesta fallita per {url}: {e}")
        return {}

    soup = BeautifulSoup(response.text, "html.parser")
    data = {}

    # Email
    email_tag = soup.find("a", href=lambda x: x and "mailto:" in x)
    if email_tag:
        data["email"] = email_tag.get_text(strip=True)

    # Phone number
    phone_tag = soup.find("a", href=lambda x: x and "tel:" in x)
    if phone_tag:
        data["phone"] = phone_tag.get_text(strip=True)

    # Campi etichettati
    fields = {
        "Ruolo": "role",
        "Settore scientifico disciplinare": "ssd",
        "Gruppo scientifico disciplinare": "gsd",
        "Afferenza": "department",
        "Stanza": "office"
    }

    for label, key in fields.items():
        try:
            label_elem = soup.find("div", string=lambda x: x and label in x)
            if label_elem:
                sibling = label_elem.find_next_sibling("div")
                if sibling:
                    data[key] = sibling.get_text(strip=True)
        except Exception as e:
            print(f"⚠️ Errore estraendo {label}: {e}")

    # === Nuova estrazione pubblicazioni da <li> sotto "Pubblicazioni"
    try:
        pub_title = soup.find(lambda tag: tag.name in ["h2", "h3"] and "pubblicazioni" in tag.get_text(strip=True).lower())
        if pub_title:
            pub_list = pub_title.find_next("ul")
            if pub_list:
                items = pub_list.find_all("li")
                publications = [item.get_text(separator=" ", strip=True) for item in items]
                data["publications"] = "; ".join(publications[:5])
    except Exception as e:
        print(f"⚠️ Errore durante l'estrazione delle pubblicazioni: {e}")

    return data

    """Effettua scraping della pagina profilo Unimib, estraendo info e pubblicazioni."""
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print(f"⚠️ Errore {response.status_code} per URL: {url}")
            return {}
    except Exception as e:
        print(f"❌ Richiesta fallita per {url}: {e}")
        return {}

    soup = BeautifulSoup(response.text, "html.parser")
    data = {}

    # Email
    email_tag = soup.find("a", href=lambda x: x and "mailto:" in x)
    if email_tag:
        data["email"] = email_tag.get_text(strip=True)

    # Phone number
    phone_tag = soup.find("a", href=lambda x: x and "tel:" in x)
    if phone_tag:
        data["phone"] = phone_tag.get_text(strip=True)

    # Campi etichettati
    fields = {
        "Ruolo": "role",
        "Settore scientifico disciplinare": "ssd",
        "Gruppo scientifico disciplinare": "gsd",
        "Afferenza": "department",
        "Stanza": "office"
    }

    for label, key in fields.items():
        try:
            label_elem = soup.find("div", string=lambda x: x and label in x)
            if label_elem:
                sibling = label_elem.find_next_sibling("div")
                if sibling:
                    data[key] = sibling.get_text(strip=True)
        except Exception as e:
            print(f"⚠️ Errore estraendo {label}: {e}")

    # Pubblicazioni (blocco "biblio")
    try:
        pub_section = soup.find("div", class_="biblio")
        if pub_section:
            entries = pub_section.find_all("div", class_="biblio-entry")
            publications = [entry.get_text(separator=" ", strip=True) for entry in entries]
            data["publications"] = "; ".join(publications[:5])  # Puoi cambiare 5 in un numero maggiore
    except Exception as e:
        print(f"⚠️ Errore estraendo pubblicazioni: {e}")

    return data


# Lista professori
professori = [
    ("Alberta Argia", "Andreotti"),
    ("Davide Paolo", "Bernasconi"),
    ("Mirko", "Cesarini"),
    ("Nico", "Di Domenica"),
    ("Marco", "Fattore"),
    ("Claudio", "Ferretti"),
    ("Elisabetta", "Fersini"),
    ("Paola", "Rebora"),
    ("Fabio Antonio", "Stella"),
]

results = []
for name, surname in professori:
    profile_url = generate_profile_url(name, surname)
    print(f"🔍 Analizzando {profile_url}")
    info = scrape_full_profile(profile_url)
    result = {
        "name": name,
        "surname": surname,
        "profile_url": profile_url,
        **info
    }
    results.append(result)
    time.sleep(1.5)  # evita rate limiting

# Crea DataFrame finale
df = pd.DataFrame(results)

# Mostra/Salva
df


🔍 Analizzando https://www.unimib.it/alberta-argia-andreotti
🔍 Analizzando https://www.unimib.it/davide-paolo-bernasconi
🔍 Analizzando https://www.unimib.it/mirko-cesarini
🔍 Analizzando https://www.unimib.it/nico-di-domenica
⚠️ Errore 404 per URL: https://www.unimib.it/nico-di-domenica
🔍 Analizzando https://www.unimib.it/marco-fattore
🔍 Analizzando https://www.unimib.it/claudio-ferretti
🔍 Analizzando https://www.unimib.it/elisabetta-fersini
🔍 Analizzando https://www.unimib.it/paola-rebora
🔍 Analizzando https://www.unimib.it/fabio-antonio-stella
✅ Scraping completato.


Unnamed: 0,name,surname,profile_url,phone,role,ssd,gsd,department,office,publications
0,Alberta Argia,Andreotti,https://www.unimib.it/alberta-argia-andreotti,264487579.0,Professoressa ordinaria,Sociologia dei processi economici e del lavoro...,"SOCIOLOGIA DEI PROCESSI ECONOMICI, DEL LAVORO,...",DIPARTIMENTO DI SOCIOLOGIA E RICERCA SOCIALE,"U07, Piano: 3, Stanza: 352","Andreotti, A., Coletto, D., Rio, A. (2024). St..."
1,Davide Paolo,Bernasconi,https://www.unimib.it/davide-paolo-bernasconi,264488098.0,Professore associato,Statistica medica (MEDS-24/A),"STATISTICA MEDICA, IGIENE GENERALE E APPLICATA...",DIPARTIMENTO DI MEDICINA E CHIRURGIA (SCHOOL O...,"U28, Piano: 0, Stanza: T003","Capsoni, N., Azin, G., Scarnera, M., Bettina, ..."
2,Mirko,Cesarini,https://www.unimib.it/mirko-cesarini,264485849.0,Ricercatore,Sistemi di elaborazione delle informazioni (II...,SISTEMI DI ELABORAZIONE DELLE INFORMAZIONI (09...,DIPARTIMENTO DI STATISTICA E METODI QUANTITATIVI,"U07, Piano: 4, Stanza: 4133","Ravenda, F., Cesarini, M., Peluso, S., Mira, A..."
3,Nico,Di Domenica,https://www.unimib.it/nico-di-domenica,,,,,,,
4,Marco,Fattore,https://www.unimib.it/marco-fattore,264483227.0,Professore ordinario,Statistica economica (STAT-02/A),STATISTICA ECONOMICA (13/STAT-02),DIPARTIMENTO DI STATISTICA E METODI QUANTITATIVI,"U07, Piano: 4, Stanza: 4133","Fattore, M. (2025). Complexity reduction of mu..."
5,Claudio,Ferretti,https://www.unimib.it/claudio-ferretti,264487819.0,Professore associato,Informatica (INFO-01/A),INFORMATICA (01/INFO-01),"DIPARTIMENTO DI INFORMATICA, SISTEMISTICA E CO...","U14, Piano: 2, Stanza: 2053","Saletta, M., Ferretti, C. (2024). Exploring th..."
6,Elisabetta,Fersini,https://www.unimib.it/elisabetta-fersini,264487896.0,Professoressa associata,Informatica (INFO-01/A),INFORMATICA (01/INFO-01),"DIPARTIMENTO DI INFORMATICA, SISTEMISTICA E CO...","U14, Piano: 2, Stanza: 2016","Batini, C., Santucci, G., Palmonari, M., Bella..."
7,Paola,Rebora,https://www.unimib.it/paola-rebora,264488165.0,Professoressa associata,Statistica medica (MEDS-24/A),"STATISTICA MEDICA, IGIENE GENERALE E APPLICATA...",DIPARTIMENTO DI MEDICINA E CHIRURGIA (SCHOOL O...,"U28, Piano: 0, Stanza: T002","Rebora, P., Antolini, L., Glidden, D., & Valse..."
8,Fabio Antonio,Stella,https://www.unimib.it/fabio-antonio-stella,264487837.0,Professore ordinario,Informatica (INFO-01/A),INFORMATICA (01/INFO-01),"DIPARTIMENTO DI INFORMATICA, SISTEMISTICA E CO...","U14, Piano: 2, Stanza: 2046","Acerbi, E., Viganò, E., Poidinger, M., Mortell..."


In [6]:
df.columns

Index(['name', 'surname', 'profile_url', 'phone', 'role', 'ssd', 'gsd',
       'department', 'office', 'publications'],
      dtype='object')

### search all profile with same name of missing professors

In [7]:
def search_orcid_detailed(given_name, family_name):
    base_url = "https://pub.orcid.org/v3.0/expanded-search/"
    headers = {"Accept": "application/json"}
    query = f"given-names:{given_name} AND family-name:{family_name}"
    params = {"q": query}

    try:
        res = requests.get(base_url, headers=headers, params=params, timeout=10)
        res.raise_for_status()
    except Exception as e:
        print(f"❌ Errore per {given_name} {family_name}: {e}")
        return []

    data = res.json()
    results = data.get("expanded-result", []) or []

    entries = []
    for result in results:
        entry = {
            "ORCID ID": result.get("orcid-id"),
            "First Name": result.get("given-names"),
            "Last Name": result.get("family-names"),
            "Other Names": "; ".join(result.get("other-name", []) or []),
            "Affiliations": "; ".join(result.get("institution-name", []) or [])
        }
        entries.append(entry)

    return entries

# Lista professori
professori = [
    ("Alberta Argia", "Andreotti"),
    ("Davide Paolo", "Bernasconi"),
    ("Mirko", "Cesarini"),
    ("Nico", "Di Domenica"),
    ("Marco", "Fattore"),
    ("Claudio", "Ferretti"),
    ("Elisabetta", "Fersini"),
    ("Paola", "Rebora"),
    ("Fabio Antonio", "Stella"),
]

# Raccolta risultati
orcid_data = []
for name, surname in professori:
    print(f"🔍 Cercando {name} {surname}")
    matches = search_orcid_detailed(name, surname)
    for match in matches:
        match["Searched Name"] = name
        match["Searched Surname"] = surname
        orcid_data.append(match)
    time.sleep(1.5)

# Salvataggio
data = pd.DataFrame(orcid_data)



🔍 Cercando Alberta Argia Andreotti
🔍 Cercando Davide Paolo Bernasconi
🔍 Cercando Mirko Cesarini
🔍 Cercando Nico Di Domenica
🔍 Cercando Marco Fattore
🔍 Cercando Claudio Ferretti
🔍 Cercando Elisabetta Fersini
🔍 Cercando Paola Rebora
🔍 Cercando Fabio Antonio Stella


In [8]:
data

Unnamed: 0,ORCID ID,First Name,Last Name,Other Names,Affiliations,Searched Name,Searched Surname
0,0000-0001-8771-5428,DAVIDE PAOLO,BERNASCONI,,,Davide Paolo,Bernasconi
1,0000-0001-6043-3367,Paolo,Bernasconi,,Fondazione IRCCS Policlinico San Matteo; UNIVE...,Davide Paolo,Bernasconi
2,0000-0001-9601-0403,Mirko,Cesarini,,,Mirko,Cesarini
3,0009-0004-3430-8326,Nico,Di Stefano,,University of Brescia,Nico,Di Domenica
4,0009-0001-4949-0624,Nico,Di Fonte,,University of L'Aquila,Nico,Di Domenica
5,0000-0002-6624-8637,MARCO,FATTORE,,,Marco,Fattore
6,0000-0002-5562-7893,Claudio,Ferretti,,,Claudio,Ferretti
7,0000-0001-9582-5045,Claudio,Ferretti,,,Claudio,Ferretti
8,0000-0002-8987-100X,Elisabetta,Fersini,,,Elisabetta,Fersini
9,0000-0003-0606-5852,PAOLA,REBORA,,,Paola,Rebora


In [9]:
def get_orcid_details(orcid_id):
    url = f"https://pub.orcid.org/v3.0/{orcid_id}"
    headers = {"Accept": "application/json"}
    
    try:
        res = requests.get(url, headers=headers, timeout=10)
        res.raise_for_status()
    except Exception as e:
        print(f"❌ Errore per ORCID {orcid_id}: {e}")
        return {
            "Biography": None,
            "Country": None,
            "Research URLs": None,
            "Affiliations Detailed": None,
            "Publications Titles": None,
            "DOIs": None
        }

    data = res.json()
    person = data.get("person", {})
    activities = data.get("activities-summary", {})

    # === Biografia e paese ===
    biography = person.get("biography", {}).get("content") if person.get("biography") else None

    address_list = person.get("addresses", {}).get("address", [])
    country = None
    if address_list and isinstance(address_list, list):
        country = address_list[0].get("country", {}).get("value")

    # === Researcher URLs ===
    researcher_urls = []
    urls = person.get("researcher-urls", {}).get("researcher-url", [])
    if isinstance(urls, list):
        researcher_urls = [u.get("url", {}).get("value") for u in urls if u.get("url")]

    # === Affiliazioni ===
    employments = activities.get("employments", {}).get("employment-summary", [])
    current_affiliation = "; ".join(
        [f"{e.get('organization', {}).get('name', '')} ({e.get('role-title', '')})"
         for e in employments if e.get("organization")]
    )

    # === Pubblicazioni ===
    works = activities.get("works", {}).get("group", [])
    work_titles = []
    dois = []
    for w in works:
        summary = w.get("work-summary", [{}])[0]
        title = summary.get("title", {}).get("title", {}).get("value", "")
        if title:
            work_titles.append(title)
        external_ids = summary.get("external-ids", {}).get("external-id", [])
        for eid in external_ids:
            if eid.get("external-id-type") == "doi":
                dois.append(eid.get("external-id-value"))

    return {
        "Biography": biography,
        "Country": country,
        "Research URLs": "; ".join(filter(None, researcher_urls)),
        "Affiliations Detailed": current_affiliation,
        "Publications Titles": "; ".join(work_titles[:5]),
        "DOIs": "; ".join(dois[:5])
    }

# === Carica dataset ORCID ID ===
data = data

# === Estrai dettagli da ogni ORCID ===
details = []
for i, row in data.iterrows():
    orcid_id = row["ORCID ID"]
    print(f"Estraendo dettagli per {orcid_id}")
    info = get_orcid_details(orcid_id)
    details.append(info)
    time.sleep(1.5)  # evita rate limiting

# === Aggiungi nuove colonne al DataFrame originale ===
details_df = pd.DataFrame(details)
df_full = pd.concat([data, details_df], axis=1)

# === Salva il file finale ===
df_full.to_csv("orcid_professori_arricchito.csv", index=False)


🔍 Estraendo dettagli per 0000-0001-8771-5428
🔍 Estraendo dettagli per 0000-0001-6043-3367
🔍 Estraendo dettagli per 0000-0001-9601-0403
🔍 Estraendo dettagli per 0009-0004-3430-8326
🔍 Estraendo dettagli per 0009-0001-4949-0624
🔍 Estraendo dettagli per 0000-0002-6624-8637
🔍 Estraendo dettagli per 0000-0002-5562-7893
🔍 Estraendo dettagli per 0000-0001-9582-5045
🔍 Estraendo dettagli per 0000-0002-8987-100X
🔍 Estraendo dettagli per 0000-0003-0606-5852
🔍 Estraendo dettagli per 0000-0002-1394-0507
🔍 Estraendo dettagli per 0000-0002-2670-9873
✅ File salvato come orcid_professori_arricchito.csv


In [10]:
df_full


Unnamed: 0,ORCID ID,First Name,Last Name,Other Names,Affiliations,Searched Name,Searched Surname,Biography,Country,Research URLs,Affiliations Detailed,Publications Titles,DOIs
0,0000-0001-8771-5428,DAVIDE PAOLO,BERNASCONI,,,Davide Paolo,Bernasconi,,,,,How Are Diagnosis-Related Groups and Staffing ...,10.3390/healthcare12191988; 10.3390/jcm1302061...
1,0000-0001-6043-3367,Paolo,Bernasconi,,Fondazione IRCCS Policlinico San Matteo; UNIVE...,Davide Paolo,Bernasconi,,,,,Donor Cell Acute Myeloid Leukemia after Hemato...,10.3390/genes14112085; 10.3390/cancers13133170...
2,0000-0001-9601-0403,Mirko,Cesarini,,,Mirko,Cesarini,,,http://www.statistica.unimib.it/utenti/cesarini/,,Classifying online Job Advertisements through ...,10.1016/j.future.2018.03.035; 10.1002/sam.1137...
3,0009-0004-3430-8326,Nico,Di Stefano,,University of Brescia,Nico,Di Domenica,,,,,,
4,0009-0001-4949-0624,Nico,Di Fonte,,University of L'Aquila,Nico,Di Domenica,,,,,Early prediction of spinodal-like relaxation e...,10.1063/5.0211031; 10.1016/j.molliq.2023.123425
5,0000-0002-6624-8637,MARCO,FATTORE,,,Marco,Fattore,,,,,A fuzzy posetic toolbox for multi-criteria eva...,10.1007/s10479-024-06352-3; 10.7866/HPE-RPE.23...
6,0000-0002-5562-7893,Claudio,Ferretti,,,Claudio,Ferretti,,,,,,
7,0000-0001-9582-5045,Claudio,Ferretti,,,Claudio,Ferretti,,,,,Exploring the Prompt Space of Large Language M...,10.1145/3638529.3654049; 10.3390/a16100478; 10...
8,0000-0002-8987-100X,Elisabetta,Fersini,,,Elisabetta,Fersini,,,,,Unraveling Disagreement Constituents in Hatefu...,10.1007/978-3-031-56066-8_3; 10.3390/math11204...
9,0000-0003-0606-5852,PAOLA,REBORA,,,Paola,Rebora,,,,,Development and Psychometric Testing of the Ca...,10.1177/26350106251336309; 10.1007/s00134-025-...


### Semantic matching

In [12]:
# Costruzione stringhe semantiche
def build_professor_string(row):
    fields = [
        row.get("name", ""),
        row.get("surname", ""),
        #row.get("profile_url", ""),
        #row.get("role", ""),
        #row.get("ssd", ""),
        #row.get("gsd", ""),
        #row.get("department", ""),
        row.get("publications", "")
    ]
    return " ".join([str(f).strip() for f in fields if pd.notna(f) and str(f).strip() != ""])

def build_orcid_string(row):
    fields = [
        row.get("First Name", ""),
        row.get("Last Name", ""),
        #row.get("Affiliations", ""),
        #row.get("Biography", ""),
        row.get("Publications Titles", ""),
        #row.get("Verified Email", ""),
        #row.get("Personal Website", "")
    ]
    return " ".join([str(f).strip() for f in fields if pd.notna(f) and str(f).strip() != ""])

# Carica modello
model = SentenceTransformer("all-MiniLM-L6-v2")

# Inizializza risultati
results = []

# Itera su ciascun professore di df
for i, prof_row in df.iterrows():
    prof_name = prof_row["name"]
    prof_surname = prof_row["surname"]

    # Filtra df_full con lo stesso nome/cognome cercato
    df_candidates = df_full[
        (df_full["Searched Name"].str.strip().str.lower() == prof_name.strip().lower()) &
        (df_full["Searched Surname"].str.strip().str.lower() == prof_surname.strip().lower())
    ]

    # Se non ci sono candidati, salta
    if df_candidates.empty:
        results.append({
            "Best Match ORCID Text": None,
            "Similarity Score": None,
            "Matched ORCID ID": None
        })
        continue

    # Costruisci stringhe
    prof_text = build_professor_string(prof_row)
    candidate_texts = df_candidates.apply(build_orcid_string, axis=1)

    # Embedding
    emb_prof = model.encode(prof_text, convert_to_tensor=True)
    emb_cands = model.encode(candidate_texts.tolist(), convert_to_tensor=True)

    # Similarità
    scores = util.cos_sim(emb_prof, emb_cands)[0]
    best_idx = scores.argmax().item()
    best_score = scores[best_idx].item()

    best_match_row = df_candidates.iloc[best_idx]

    results.append({
        "Best Match ORCID Text": candidate_texts.iloc[best_idx],
        "Similarity Score": best_score,
        "Matched ORCID ID": best_match_row.get("ORCID ID", None),
        "Matched First Name": best_match_row.get("First Name", None),
        "Matched Last Name": best_match_row.get("Last Name", None)
    })


# Unisci con il DataFrame originale
results_df = pd.DataFrame(results)
df_matched = pd.concat([df.reset_index(drop=True), results_df], axis=1)

# Salva
df_matched.to_csv("matched_professori_filtered.csv", index=False)
df_matched


  from tqdm.autonotebook import tqdm, trange


Unnamed: 0,name,surname,profile_url,phone,role,ssd,gsd,department,office,publications,Best Match ORCID Text,Similarity Score,Matched ORCID ID,Matched First Name,Matched Last Name
0,Alberta Argia,Andreotti,https://www.unimib.it/alberta-argia-andreotti,264487579.0,Professoressa ordinaria,Sociologia dei processi economici e del lavoro...,"SOCIOLOGIA DEI PROCESSI ECONOMICI, DEL LAVORO,...",DIPARTIMENTO DI SOCIOLOGIA E RICERCA SOCIALE,"U07, Piano: 3, Stanza: 352","Andreotti, A., Coletto, D., Rio, A. (2024). St...",,,,,
1,Davide Paolo,Bernasconi,https://www.unimib.it/davide-paolo-bernasconi,264488098.0,Professore associato,Statistica medica (MEDS-24/A),"STATISTICA MEDICA, IGIENE GENERALE E APPLICATA...",DIPARTIMENTO DI MEDICINA E CHIRURGIA (SCHOOL O...,"U28, Piano: 0, Stanza: T003","Capsoni, N., Azin, G., Scarnera, M., Bettina, ...",DAVIDE PAOLO BERNASCONI How Are Diagnosis-Rela...,0.321503,0000-0001-8771-5428,DAVIDE PAOLO,BERNASCONI
2,Mirko,Cesarini,https://www.unimib.it/mirko-cesarini,264485849.0,Ricercatore,Sistemi di elaborazione delle informazioni (II...,SISTEMI DI ELABORAZIONE DELLE INFORMAZIONI (09...,DIPARTIMENTO DI STATISTICA E METODI QUANTITATIVI,"U07, Piano: 4, Stanza: 4133","Ravenda, F., Cesarini, M., Peluso, S., Mira, A...",Mirko Cesarini Classifying online Job Advertis...,0.209971,0000-0001-9601-0403,Mirko,Cesarini
3,Nico,Di Domenica,https://www.unimib.it/nico-di-domenica,,,,,,,,Nico Di Stefano,0.665949,0009-0004-3430-8326,Nico,Di Stefano
4,Marco,Fattore,https://www.unimib.it/marco-fattore,264483227.0,Professore ordinario,Statistica economica (STAT-02/A),STATISTICA ECONOMICA (13/STAT-02),DIPARTIMENTO DI STATISTICA E METODI QUANTITATIVI,"U07, Piano: 4, Stanza: 4133","Fattore, M. (2025). Complexity reduction of mu...",MARCO FATTORE A fuzzy posetic toolbox for mult...,0.658271,0000-0002-6624-8637,MARCO,FATTORE
5,Claudio,Ferretti,https://www.unimib.it/claudio-ferretti,264487819.0,Professore associato,Informatica (INFO-01/A),INFORMATICA (01/INFO-01),"DIPARTIMENTO DI INFORMATICA, SISTEMISTICA E CO...","U14, Piano: 2, Stanza: 2053","Saletta, M., Ferretti, C. (2024). Exploring th...",Claudio Ferretti Exploring the Prompt Space of...,0.811079,0000-0001-9582-5045,Claudio,Ferretti
6,Elisabetta,Fersini,https://www.unimib.it/elisabetta-fersini,264487896.0,Professoressa associata,Informatica (INFO-01/A),INFORMATICA (01/INFO-01),"DIPARTIMENTO DI INFORMATICA, SISTEMISTICA E CO...","U14, Piano: 2, Stanza: 2016","Batini, C., Santucci, G., Palmonari, M., Bella...",Elisabetta Fersini Unraveling Disagreement Con...,0.470735,0000-0002-8987-100X,Elisabetta,Fersini
7,Paola,Rebora,https://www.unimib.it/paola-rebora,264488165.0,Professoressa associata,Statistica medica (MEDS-24/A),"STATISTICA MEDICA, IGIENE GENERALE E APPLICATA...",DIPARTIMENTO DI MEDICINA E CHIRURGIA (SCHOOL O...,"U28, Piano: 0, Stanza: T002","Rebora, P., Antolini, L., Glidden, D., & Valse...",PAOLA REBORA Development and Psychometric Test...,0.215637,0000-0003-0606-5852,PAOLA,REBORA
8,Fabio Antonio,Stella,https://www.unimib.it/fabio-antonio-stella,264487837.0,Professore ordinario,Informatica (INFO-01/A),INFORMATICA (01/INFO-01),"DIPARTIMENTO DI INFORMATICA, SISTEMISTICA E CO...","U14, Piano: 2, Stanza: 2046","Acerbi, E., Viganò, E., Poidinger, M., Mortell...",FABIO ANTONIO STELLA Comparing Deep Reinforcem...,0.503208,0000-0002-1394-0507,FABIO ANTONIO,STELLA


In [15]:
df

Unnamed: 0,Last name,Name,SSD,orcid
0,Andreotti,Alberta Argia,SPS/09,
1,Bernasconi,Davide Paolo,MED/01,
2,Bianco,Simone,INF/01,0000-0002-7070-1545
3,Bissiri,Pier Giovanni,SECS-S/01,0000-0003-3769-6649
4,Cesarini,Mirko,ING-INF/05,
5,Chicco,Davide,INF/01,0000-0001-9655-7142
6,Ciavotta,Michele,INF/01,0000-0002-2480-966X
7,Della Vedova,Gianluca,INF/01,0000-0001-5584-3089
8,Di Domenica,Nico,SECS-P/08,
9,Fattore,Marco,SECS-S/03,


### Add New ORCID

In [16]:
df_professori_matched = df_matched
df_orcid_list = df

# Normalizza nomi e cognomi
df_professori_matched["full_name"] = (
    df_professori_matched["name"].str.strip().str.lower() + " " +
    df_professori_matched["surname"].str.strip().str.lower()
)

df_orcid_list["full_name"] = (
    df_orcid_list["Name"].str.strip().str.lower() + " " +
    df_orcid_list["Last name"].str.strip().str.lower()
)

# Crea dizionario: full_name -> orcid (solo se presente)
orcid_map = df_professori_matched.set_index("full_name")["Matched ORCID ID"].dropna().to_dict()

# Applica solo dove orcid è mancante
df_orcid_list["orcid"] = df_orcid_list.apply(
    lambda row: orcid_map.get(row["full_name"], row["orcid"]) if pd.isna(row["orcid"]) else row["orcid"],
    axis=1
)

# Rimuovi colonna temporanea
df_orcid_list.drop(columns=["full_name"], inplace=True)

# Salva risultato
df_orcid_list.to_csv("orcid_list_updated.csv", index=False)



In [18]:
df_orcid_list = df_orcid_list[
    ~(
        ((df_orcid_list["Name"].str.strip().str.lower() == "alberta argia") & 
         (df_orcid_list["Last name"].str.strip().str.lower() == "andreotti")) |
        ((df_orcid_list["Name"].str.strip().str.lower() == "nico") & 
         (df_orcid_list["Last name"].str.strip().str.lower() == "di domenica"))
    )
]
df_orcid_list

Unnamed: 0,Last name,Name,SSD,orcid
1,Bernasconi,Davide Paolo,MED/01,0000-0001-8771-5428
2,Bianco,Simone,INF/01,0000-0002-7070-1545
3,Bissiri,Pier Giovanni,SECS-S/01,0000-0003-3769-6649
4,Cesarini,Mirko,ING-INF/05,0000-0001-9601-0403
5,Chicco,Davide,INF/01,0000-0001-9655-7142
6,Ciavotta,Michele,INF/01,0000-0002-2480-966X
7,Della Vedova,Gianluca,INF/01,0000-0001-5584-3089
9,Fattore,Marco,SECS-S/03,0000-0002-6624-8637
10,Ferretti,Claudio,INF/01,0000-0001-9582-5045
11,Fersini,Elisabetta,INF/01,0000-0002-8987-100X


In [19]:
df_orcid_list.to_csv('authors_complete_ds.csv')

In [20]:
courses = [
    "Data Science Lab in Medicine",
    "Digital Signal and Image Management",
    "Foundations of Probability and Statistics",
    "Service Science",
    "Green Computing",
    "Technological Infrastructures for Data Science",
    "Foundations of Computer Science",
    "Data Science Lab",
    "Cybersecurity for Data Science",
    "Natural Language Processing",
    "Financial Markets Analytics",
    "Green Computing",
    "Economics for Data Science",
    "Data Management",
    "Business Intelligence",
    "Decision Models",
    "High Dimensional Data Analysis",
    "Data Science Lab in Business and Marketing",
    "Foundations of Deep Learning",
    "Data Science Lab in Environment and Physics",
    "Data Semantics",
    "Text Mining and Search",
    "Streaming Data Management and Time Series Analysis",
    "Statistical Modeling",
    "Data Science Lab in Biosciences",
    "Data Science Lab in Medicine",
    "Big Data in Public Health",
    "Machine Learning",
    "Text Mining and Search"
]


df_orcid_list['Course'] = courses
df_orcid_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_orcid_list['Course'] = courses


Unnamed: 0,Last name,Name,SSD,orcid,Course
1,Bernasconi,Davide Paolo,MED/01,0000-0001-8771-5428,Data Science Lab in Medicine
2,Bianco,Simone,INF/01,0000-0002-7070-1545,Digital Signal and Image Management
3,Bissiri,Pier Giovanni,SECS-S/01,0000-0003-3769-6649,Foundations of Probability and Statistics
4,Cesarini,Mirko,ING-INF/05,0000-0001-9601-0403,Service Science
5,Chicco,Davide,INF/01,0000-0001-9655-7142,Green Computing
6,Ciavotta,Michele,INF/01,0000-0002-2480-966X,Technological Infrastructures for Data Science
7,Della Vedova,Gianluca,INF/01,0000-0001-5584-3089,Foundations of Computer Science
9,Fattore,Marco,SECS-S/03,0000-0002-6624-8637,Data Science Lab
10,Ferretti,Claudio,INF/01,0000-0001-9582-5045,Cybersecurity for Data Science
11,Fersini,Elisabetta,INF/01,0000-0002-8987-100X,Natural Language Processing


In [21]:
df_orcid_list.to_csv('authors_complete_ds.csv')

### Create Complete Dataset for Internal professors

In [25]:
df_complete = pd.read_csv("../data/authors/authors_complete.csv")
df_orcid = pd.read_csv("../data/authors/authors_complete_ds.csv")
df_orcid

Unnamed: 0.1,Unnamed: 0,Last name,Name,SSD,orcid,Course
0,1,Bernasconi,Davide Paolo,MED/01,0000-0001-8771-5428,Data Science Lab in Medicine
1,2,Bianco,Simone,INF/01,0000-0002-7070-1545,Digital Signal and Image Management
2,3,Bissiri,Pier Giovanni,SECS-S/01,0000-0003-3769-6649,Foundations of Probability and Statistics
3,4,Cesarini,Mirko,ING-INF/05,0000-0001-9601-0403,Service Science
4,5,Chicco,Davide,INF/01,0000-0001-9655-7142,Green Computing
5,6,Ciavotta,Michele,INF/01,0000-0002-2480-966X,Technological Infrastructures for Data Science
6,7,Della Vedova,Gianluca,INF/01,0000-0001-5584-3089,Foundations of Computer Science
7,9,Fattore,Marco,SECS-S/03,0000-0002-6624-8637,Data Science Lab
8,10,Ferretti,Claudio,INF/01,0000-0001-9582-5045,Cybersecurity for Data Science
9,11,Fersini,Elisabetta,INF/01,0000-0002-8987-100X,Natural Language Processing


In [30]:
import pandas as pd

# Carica i due dataset
df_complete = pd.read_csv("../data/authors/authors_complete.csv")
df_complete = df_complete[['Last name','Name','hindex','openalex id',	'institutions id', 'institutions name',	'topics']]
df_orcid = pd.read_csv("../data/authors/authors_complete_ds.csv")

# Rimuovi la colonna "Unnamed: 0" se presente
for df in [df_complete, df_orcid]:
    if "Unnamed: 0" in df.columns:
        df.drop(columns=["Unnamed: 0"], inplace=True)

# Unione basata su nome e cognome
df_merged = pd.merge(df_complete, df_orcid, on=["Name", "Last name"], how="left")

# Visualizza il risultato
df_merged.head()
df_merged.to_csv('../data/authors/authors_internal.csv')

In [31]:
df_merged

Unnamed: 0,Last name,Name,hindex,openalex id,institutions id,institutions name,topics,SSD,orcid,Course
0,Bernasconi,Davide Paolo,24.0,https://openalex.org/A5050512903,"['https://openalex.org/I66752286', 'https://op...","['University of Milano-Bicocca', 'Azienda Soci...","['Epidemiology', 'Hepatology', 'Oncology', 'Pu...",MED/01,0000-0001-8771-5428,Data Science Lab in Medicine
1,Bianco,Simone,34.0,https://openalex.org/A5013570285,"['https://openalex.org/I66752286', 'https://op...","['University of Milano-Bicocca', 'University o...","['Computer Vision and Pattern Recognition', 'A...",INF/01,0000-0002-7070-1545,Digital Signal and Image Management
2,Bissiri,Pier Giovanni,7.0,https://openalex.org/A5074768639,"['https://openalex.org/I66752286', 'https://op...","['University of Milano-Bicocca', 'University o...","['Artificial Intelligence', 'Environmental Eng...",SECS-S/01,0000-0003-3769-6649,Foundations of Probability and Statistics
3,Cesarini,Mirko,15.0,https://openalex.org/A5049259722,"['https://openalex.org/I66752286', 'https://op...","['University of Milano-Bicocca', 'Center for N...","['Management Science and Operations Research',...",ING-INF/05,0000-0001-9601-0403,Service Science
4,Chicco,Davide,27.0,https://openalex.org/A5011556172,"['https://openalex.org/I185261750', 'https://o...","['University of Toronto', 'University of Milan...","['Molecular Biology', 'Health Information Mana...",INF/01,0000-0001-9655-7142,Green Computing
5,Ciavotta,Michele,16.0,https://openalex.org/A5079768184,"['https://openalex.org/I66752286', 'https://op...","['University of Milano-Bicocca', 'The Universi...","['Information Systems', 'Computer Networks and...",INF/01,0000-0002-2480-966X,Technological Infrastructures for Data Science
6,Della Vedova,Gianluca,22.0,https://openalex.org/A5059679442,"['https://openalex.org/I66752286', 'https://op...","['University of Milano-Bicocca', 'Istituto Naz...","['Molecular Biology', 'Artificial Intelligence...",INF/01,0000-0001-5584-3089,Foundations of Computer Science
7,Fattore,Marco,16.0,https://openalex.org/A5006214257,['https://openalex.org/I66752286'],['University of Milano-Bicocca'],"['Physical and Theoretical Chemistry', 'Manage...",SECS-S/03,0000-0002-6624-8637,Data Science Lab
8,Ferretti,Claudio,47.0,https://openalex.org/A5033044137,"['https://openalex.org/I27837315', 'https://op...","['University of Michigan', 'AGH University of ...","['Nuclear and High Energy Physics', 'Artificia...",INF/01,0000-0001-9582-5045,Cybersecurity for Data Science
9,Fersini,Elisabetta,23.0,https://openalex.org/A5005702955,"['https://openalex.org/I66752286', 'https://op...","['University of Milano-Bicocca', 'University o...","['Artificial Intelligence', 'Statistical and N...",INF/01,0000-0002-8987-100X,Natural Language Processing
