In [2]:
import pandas as pd
import json
from thefuzz import fuzz, process
from tqdm import tqdm
import nltk
from joblib import Parallel, delayed
import os
from pprint import pprint

In [3]:
authors_df = pd.read_json('./internalAuthors.json').transpose()

In [4]:
with open('./publications.json', 'r') as f:
    data = json.load(f)

data_list = []

for top_key, inner_dict in data.items():
    for inner_key, inner_values in inner_dict.items():
        inner_values['author'] = top_key
        inner_values['publication url'] = inner_key
        data_list.append(inner_values)


pub_df = pd.DataFrame(data_list)

In [5]:
authors_df.groupby('Afferenza', dropna=False)['Nome completo'].count().sort_values(ascending=False)

Afferenza
DIPARTIMENTO DI FILOLOGIA CLASSICA E ITALIANISTICA                                                                                           71
DIPARTIMENTO DI INFORMATICA - SCIENZA E INGEGNERIA                                                                                           54
NaN                                                                                                                                          37
DIPARTIMENTO DI SCIENZE GIURIDICHE                                                                                                           26
DIPARTIMENTO DI SCIENZE AZIENDALI                                                                                                            23
ARAG - AREA FINANZA E PARTECIPATE                                                                                                            12
DIPARTIMENTO DI INGEGNERIA DELL'ENERGIA ELETTRICA E DELL'INFORMAZIONE "GUGLIELMO MARCONI"                                     

In [6]:
authors_df[authors_df['Afferenza'] == 'DIPARTIMENTO DI SCIENZE AZIENDALI (attivo dal 14/07/2005 al 27/10/2011)']

Unnamed: 0,Nome completo,Afferenza,Author page


In [7]:
pub_df.groupby('dc.date.issued', dropna=False)['dc.date.issued'].count().sort_values(ascending=False)

dc.date.issued
2022    121
2016     95
2017     94
2021     92
2014     88
2020     87
2019     81
2013     79
2012     78
2018     78
2015     62
2010     61
2011     60
2009     52
2008     49
2023     37
2007     36
2005     36
2006     33
2004     31
9999     18
2002      3
2003      2
1999      2
1996      1
2001      1
1995      1
Name: dc.date.issued, dtype: int64

In [8]:
pub_df.groupby('scopus.keywords', dropna=False)['scopus.keywords'].count()

scopus.keywords
*                                                          469
Document semantics; Semantic publishing; Web interface;      2
NaN                                                          0
Name: scopus.keywords, dtype: int64

In [9]:
pub_df['publication url'].is_unique

False

In [10]:
filtered_df = pub_df[pub_df['internalAuthor'].apply(lambda authors: "/cris/rp/rp07910" in authors and "/cris/rp/rp17523" in authors)]
filtered_df[filtered_df['publication url'].isin(filtered_df['publication url'][filtered_df['publication url'].duplicated()])].sort_values("publication url")

Unnamed: 0,internalAuthor,externalAuthor,dc.title,dc.keywords,scopus.keywords,dc.date.issued,dc.identifier.doi,dc.collection.name,author,publication url
1297,"[/cris/rp/rp19454, /cris/rp/rp21849, /cris/rp/...",[],Multi-layer markup and ontological structures ...,LEGAL XML; ONTOLOGIES; AKOMA NTOSO; MARKUP LAN...,,2009,,4.01 Contributo in Atti di convegno,monica.palmirani,https://cris.unibo.it/handle/11585/100980
311,"[/cris/rp/rp19454, /cris/rp/rp21849, /cris/rp/...",[],Multi-layer markup and ontological structures ...,LEGAL XML; ONTOLOGIES; AKOMA NTOSO; MARKUP LAN...,,2009,,4.01 Contributo in Atti di convegno,fabio.vitali,https://cris.unibo.it/handle/11585/100980
170,"[/cris/rp/rp19454, /cris/rp/rp21849, /cris/rp/...",[],Multi-layer markup and ontological structures ...,LEGAL XML; ONTOLOGIES; AKOMA NTOSO; MARKUP LAN...,,2009,,4.01 Contributo in Atti di convegno,silvio.peroni,https://cris.unibo.it/handle/11585/100980
161,"[/cris/rp/rp07350, /cris/rp/rp17523, /cris/rp/...",[],Handling markup overlaps using OWL,OVERLAPPING MARKUP; ONTOLOGIES; SEMANTIC WEB; ...,*,2010,,4.01 Contributo in Atti di convegno,silvio.peroni,https://cris.unibo.it/handle/11585/100988
297,"[/cris/rp/rp07350, /cris/rp/rp17523, /cris/rp/...",[],Handling markup overlaps using OWL,OVERLAPPING MARKUP; ONTOLOGIES; SEMANTIC WEB; ...,*,2010,,4.01 Contributo in Atti di convegno,fabio.vitali,https://cris.unibo.it/handle/11585/100988
...,...,...,...,...,...,...,...,...,...,...
171,"[/cris/rp/rp07350, /cris/rp/rp17523, /cris/rp/...",[],Towards markup support for full GODDAGs and be...,,,2009,,4.01 Contributo in Atti di convegno,silvio.peroni,https://cris.unibo.it/handle/11585/87836
316,"[/cris/rp/rp07350, /cris/rp/rp17523, /cris/rp/...",[],Towards markup support for full GODDAGs and be...,,,2009,,4.01 Contributo in Atti di convegno,fabio.vitali,https://cris.unibo.it/handle/11585/87836
889,"[/cris/rp/rp17523, /cris/rp/rp38970, /cris/rp/...",[Di Matteo N.;],Of Mice and Terms: Clustering Algorithms on Am...,FOLKSONOMIES; SEMANTIC WEB; WORD SPACE MODELS,*,2010,,4.01 Contributo in Atti di convegno,fabio.tamburini,https://cris.unibo.it/handle/11585/96860
300,"[/cris/rp/rp17523, /cris/rp/rp38970, /cris/rp/...",[Di Matteo N.;],Of Mice and Terms: Clustering Algorithms on Am...,FOLKSONOMIES; SEMANTIC WEB; WORD SPACE MODELS,*,2010,,4.01 Contributo in Atti di convegno,fabio.vitali,https://cris.unibo.it/handle/11585/96860


In [11]:
# Create a set of unique URLs from the DataFrame column
unique_urls = set(filtered_df['publication url'])

url_list = ['https://cris.unibo.it/handle/11585/739863', 'https://cris.unibo.it/handle/11585/621575', 'https://cris.unibo.it/handle/11585/591739', 'https://cris.unibo.it/handle/11585/611245', 'https://cris.unibo.it/handle/11585/621578', 'https://cris.unibo.it/handle/11585/621574', 'https://cris.unibo.it/handle/11585/621579', 'https://cris.unibo.it/handle/11585/591734', 'https://cris.unibo.it/handle/11585/621573', 'https://cris.unibo.it/handle/11585/607065', 'https://cris.unibo.it/handle/11585/621576', 'https://cris.unibo.it/handle/11585/591737', 'https://cris.unibo.it/handle/11585/590494', 'https://cris.unibo.it/handle/11585/570685', 'https://cris.unibo.it/handle/11585/570682', 'https://cris.unibo.it/handle/11585/570679', 'https://cris.unibo.it/handle/11585/556162', 'https://cris.unibo.it/handle/11585/556160', 'https://cris.unibo.it/handle/11585/590490', 'https://cris.unibo.it/handle/11585/564532', 'https://cris.unibo.it/handle/11585/553967', 'https://cris.unibo.it/handle/11585/570676', 'https://cris.unibo.it/handle/11585/555883', 'https://cris.unibo.it/handle/11585/555934', 'https://cris.unibo.it/handle/11585/551219', 'https://cris.unibo.it/handle/11585/551290', 'https://cris.unibo.it/handle/11585/543615', 'https://cris.unibo.it/handle/11585/555908', 'https://cris.unibo.it/handle/11585/309347', 'https://cris.unibo.it/handle/11585/521151', 'https://cris.unibo.it/handle/11585/521165', 'https://cris.unibo.it/handle/11585/521121', 'https://cris.unibo.it/handle/11585/310115', 'https://cris.unibo.it/handle/11585/570664', 'https://cris.unibo.it/handle/11585/555875', 'https://cris.unibo.it/handle/11585/310113', 'https://cris.unibo.it/handle/11585/146350', 'https://cris.unibo.it/handle/11585/521166', 'https://cris.unibo.it/handle/11585/556150', 'https://cris.unibo.it/handle/11585/310114', 'https://cris.unibo.it/handle/11585/399426', 'https://cris.unibo.it/handle/11585/392200', 'https://cris.unibo.it/handle/11585/185113', 'https://cris.unibo.it/handle/11585/392563', 'https://cris.unibo.it/handle/11585/127987', 'https://cris.unibo.it/handle/11585/392192', 'https://cris.unibo.it/handle/11585/146344', 'https://cris.unibo.it/handle/11585/399440', 'https://cris.unibo.it/handle/11585/555513', 'https://cris.unibo.it/handle/11585/123083', 'https://cris.unibo.it/handle/11585/123101', 'https://cris.unibo.it/handle/11585/112587', 'https://cris.unibo.it/handle/11585/123112', 'https://cris.unibo.it/handle/11585/131049', 'https://cris.unibo.it/handle/11585/134556', 'https://cris.unibo.it/handle/11585/112584', 'https://cris.unibo.it/handle/11585/123088', 'https://cris.unibo.it/handle/11585/123107', 'https://cris.unibo.it/handle/11585/112578', 'https://cris.unibo.it/handle/11585/112589', 'https://cris.unibo.it/handle/11585/112577', 'https://cris.unibo.it/handle/11585/101002', 'https://cris.unibo.it/handle/11585/100988', 'https://cris.unibo.it/handle/11585/112609', 'https://cris.unibo.it/handle/11585/100993', 'https://cris.unibo.it/handle/11585/96860', 'https://cris.unibo.it/handle/11585/101004', 'https://cris.unibo.it/handle/11585/83936', 'https://cris.unibo.it/handle/11585/87813', 'https://cris.unibo.it/handle/11585/112606', 'https://cris.unibo.it/handle/11585/100980', 'https://cris.unibo.it/handle/11585/87836', 'https://cris.unibo.it/handle/11585/112610', 'https://cris.unibo.it/handle/11585/739863', 'https://cris.unibo.it/handle/11585/621575', 'https://cris.unibo.it/handle/11585/591739', 'https://cris.unibo.it/handle/11585/611245', 'https://cris.unibo.it/handle/11585/621578', 'https://cris.unibo.it/handle/11585/621574', 'https://cris.unibo.it/handle/11585/621579', 'https://cris.unibo.it/handle/11585/591734', 'https://cris.unibo.it/handle/11585/621573', 'https://cris.unibo.it/handle/11585/607065', 'https://cris.unibo.it/handle/11585/621576', 'https://cris.unibo.it/handle/11585/591737', 'https://cris.unibo.it/handle/11585/590494', 'https://cris.unibo.it/handle/11585/570685', 'https://cris.unibo.it/handle/11585/570682', 'https://cris.unibo.it/handle/11585/570679', 'https://cris.unibo.it/handle/11585/556162', 'https://cris.unibo.it/handle/11585/556160', 'https://cris.unibo.it/handle/11585/590490', 'https://cris.unibo.it/handle/11585/564532', 'https://cris.unibo.it/handle/11585/553967', 'https://cris.unibo.it/handle/11585/570676', 'https://cris.unibo.it/handle/11585/555883', 'https://cris.unibo.it/handle/11585/555934', 'https://cris.unibo.it/handle/11585/551219', 'https://cris.unibo.it/handle/11585/551290', 'https://cris.unibo.it/handle/11585/543615', 'https://cris.unibo.it/handle/11585/555908', 'https://cris.unibo.it/handle/11585/309347', 'https://cris.unibo.it/handle/11585/521151', 'https://cris.unibo.it/handle/11585/521165', 'https://cris.unibo.it/handle/11585/521121', 'https://cris.unibo.it/handle/11585/310115', 'https://cris.unibo.it/handle/11585/570664', 'https://cris.unibo.it/handle/11585/555875', 'https://cris.unibo.it/handle/11585/310113', 'https://cris.unibo.it/handle/11585/146350', 'https://cris.unibo.it/handle/11585/521166', 'https://cris.unibo.it/handle/11585/556150', 'https://cris.unibo.it/handle/11585/310114', 'https://cris.unibo.it/handle/11585/399426', 'https://cris.unibo.it/handle/11585/392200', 'https://cris.unibo.it/handle/11585/185113', 'https://cris.unibo.it/handle/11585/392563', 'https://cris.unibo.it/handle/11585/127987', 'https://cris.unibo.it/handle/11585/392192', 'https://cris.unibo.it/handle/11585/146344', 'https://cris.unibo.it/handle/11585/399440', 'https://cris.unibo.it/handle/11585/555513', 'https://cris.unibo.it/handle/11585/123083', 'https://cris.unibo.it/handle/11585/123101', 'https://cris.unibo.it/handle/11585/112587', 'https://cris.unibo.it/handle/11585/123112', 'https://cris.unibo.it/handle/11585/131049', 'https://cris.unibo.it/handle/11585/134556', 'https://cris.unibo.it/handle/11585/112584', 'https://cris.unibo.it/handle/11585/123088', 'https://cris.unibo.it/handle/11585/123107', 'https://cris.unibo.it/handle/11585/112578', 'https://cris.unibo.it/handle/11585/112589', 'https://cris.unibo.it/handle/11585/112577', 'https://cris.unibo.it/handle/11585/101002', 'https://cris.unibo.it/handle/11585/100988', 'https://cris.unibo.it/handle/11585/112609', 'https://cris.unibo.it/handle/11585/100993', 'https://cris.unibo.it/handle/11585/96860', 'https://cris.unibo.it/handle/11585/101004', 'https://cris.unibo.it/handle/11585/83936', 'https://cris.unibo.it/handle/11585/87813', 'https://cris.unibo.it/handle/11585/112606', 'https://cris.unibo.it/handle/11585/100980', 'https://cris.unibo.it/handle/11585/87836', 'https://cris.unibo.it/handle/11585/112610', 'https://cris.unibo.it/handle/11585/611245', 'https://cris.unibo.it/handle/11585/590494', 'https://cris.unibo.it/handle/11585/590490', 'https://cris.unibo.it/handle/11585/564532', 'https://cris.unibo.it/handle/11585/309347', 'https://cris.unibo.it/handle/11585/310115', 'https://cris.unibo.it/handle/11585/310113', 'https://cris.unibo.it/handle/11585/146350', 'https://cris.unibo.it/handle/11585/310114', 'https://cris.unibo.it/handle/11585/185113', 'https://cris.unibo.it/handle/11585/127987', 'https://cris.unibo.it/handle/11585/146344', 'https://cris.unibo.it/handle/11585/591734', 'https://cris.unibo.it/handle/11585/555908', 'https://cris.unibo.it/handle/11585/570664', 'https://cris.unibo.it/handle/11585/112589', 'https://cris.unibo.it/handle/11585/112609', 'https://cris.unibo.it/handle/11585/543615', 'https://cris.unibo.it/handle/11585/96860', 'https://cris.unibo.it/handle/11585/83936', 'https://cris.unibo.it/handle/11585/621576', 'https://cris.unibo.it/handle/11585/556162', 'https://cris.unibo.it/handle/11585/100993', 'https://cris.unibo.it/handle/11585/100980']
# Compare the unique URLs with the URLs in the list
urls_not_in_dataframe = [url for url in url_list if url not in unique_urls]

# URLs not present in the DataFrame are in the 'urls_not_in_dataframe' list
print("URLs not present in the DataFrame:")
print(len(url_list), len(unique_urls))
print(unique_urls)
for url in urls_not_in_dataframe:
    print(url)


URLs not present in the DataFrame:
170 73
{'https://cris.unibo.it/handle/11585/551290', 'https://cris.unibo.it/handle/11585/112610', 'https://cris.unibo.it/handle/11585/127987', 'https://cris.unibo.it/handle/11585/392563', 'https://cris.unibo.it/handle/11585/555934', 'https://cris.unibo.it/handle/11585/590490', 'https://cris.unibo.it/handle/11585/112609', 'https://cris.unibo.it/handle/11585/621575', 'https://cris.unibo.it/handle/11585/591734', 'https://cris.unibo.it/handle/11585/551219', 'https://cris.unibo.it/handle/11585/521151', 'https://cris.unibo.it/handle/11585/555875', 'https://cris.unibo.it/handle/11585/112589', 'https://cris.unibo.it/handle/11585/83936', 'https://cris.unibo.it/handle/11585/87836', 'https://cris.unibo.it/handle/11585/543615', 'https://cris.unibo.it/handle/11585/101004', 'https://cris.unibo.it/handle/11585/590494', 'https://cris.unibo.it/handle/11585/96860', 'https://cris.unibo.it/handle/11585/621578', 'https://cris.unibo.it/handle/11585/591739', 'https://cris.u

In [12]:
with open('./unique_publications.json', 'r') as f:
    input_json = json.load(f)

unique_df = pd.read_json('./unique_publications.json').transpose()

In [22]:
unique_df.head()

Unnamed: 0,internalAuthor,externalAuthor,dc.title,dc.keywords,scopus.keywords,dc.date.issued,dc.identifier.doi,dc.collection.name
https://cris.unibo.it/handle/11585/740067,[/cris/rp/rp17523],[],"Data Science - Methods, Infrastructure, and Ap...",data science; semantic publishing,,9999,,8.01 Ruolo editoriale in rivista
https://cris.unibo.it/handle/11585/902321,[/cris/rp/rp17523],[],Frontiers in Research Metrics & Analytics,"scientometrics, bibliometrics, science of science",,9999,,8.01 Ruolo editoriale in rivista
https://cris.unibo.it/handle/11585/902319,[/cris/rp/rp17523],[],Journal of Documentation,"library and information sciences, psychology, ...",,9999,,8.01 Ruolo editoriale in rivista
https://cris.unibo.it/handle/11585/809974,[/cris/rp/rp17523],[],OpenAIRE-Nexus Scholarly Communication Service...,"OpenAIRE, Open Science, Open Science infrastru...",,9999,,8.04 Coordinamento di progetti di ricerca
https://cris.unibo.it/handle/11585/740057,[/cris/rp/rp17523],[],PeerJ Computer Science,computer science,,9999,,8.01 Ruolo editoriale in rivista


In [14]:
filtered_df2 = unique_df[unique_df['internalAuthor'].apply(lambda authors: "/cris/rp/rp07910" in authors and "/cris/rp/rp17523" in authors)].reset_index()
len(filtered_df2)
#filtered_df2[filtered_df2['index'].isin(filtered_df2['index'][filtered_df2['index'].duplicated()])].sort_values("index")

73

In [26]:
pd.set_option('display.max_rows', None)

kw_df = unique_df.copy()
kw_df['dc.keywords'] = kw_df['dc.keywords'].str.replace(r'[;\·\.]|\r\n|\s\-\s', ',', regex=True).str.split(',')
keyword_df = kw_df.explode('dc.keywords').reset_index(drop=True)
keyword_df['dc.keywords'] = keyword_df['dc.keywords'].str.strip().str.lower()
keyword_df = keyword_df[keyword_df['dc.keywords'] != '']
keyword_df['dc.keywords'].value_counts()
#keyword_df['dc.keywords'].to_csv('keywords.csv')

keyword_df.shape
#keyword_df.query('`dc.keywords`.str.contains("data modeling", na=False)', engine='python')

(5087, 8)

In [16]:
def correct_keywords(keyword):
    if len(processed) == 0:
        processed.append(keyword)
        return keyword
    if keyword == None:
        return None
    match, score = process.extractOne(keyword, processed, scorer=fuzz.token_sort_ratio)
    if match != keyword and score >= 90:
        return match
    else:
        processed.append(keyword)
        return keyword

processed = []

uk_df = keyword_df.copy()

uk_df['dc_keywords'] = uk_df['dc.keywords'].apply(correct_keywords)


In [17]:
uk_df['dc_keywords'].value_counts()

dc_keywords
letteratura italiana                                                                                                                                   117
semantic web                                                                                                                                            97
filologia d'autore                                                                                                                                      39
ontologies                                                                                                                                              38
carlo emilio gadda                                                                                                                                      38
semantic publishing                                                                                                                                     35
legal xml                                                 

In [18]:
#uk_df.query("`dc.keywords` == 'ntural language processing'")
uk_df['dc.keywords'].value_counts(dropna=False)
filtered_df = uk_df[uk_df['dc.keywords'] != uk_df['dc_keywords']]
uk_df.query('`dc_keywords`.str.contains(r"embeddings", na=False)', engine='python')

Unnamed: 0,internalAuthor,externalAuthor,dc.title,dc.keywords,scopus.keywords,dc.date.issued,dc.identifier.doi,dc.collection.name,dc_keywords
47,"[/cris/rp/rp224591, /cris/rp/rp17523, /cris/rp...","[Gesese G. A.;, Sack H.;, Alam M.]",A knowledge graph embeddings based approach fo...,knowledge graph embeddings,*,2022,10.1007/s11192-022-04426-2,1.01 Articolo in rivista,knowledge graph embeddings
1626,"[/cris/rp/rp104381, /cris/rp/rp38918]",[Marinucci L.;],Exposing implicit biases and stereotypes in hu...,word embeddings,*,2023,10.1007/s00146-022-01474-3,1.01 Articolo in rivista,word embeddings
1838,"[/cris/rp/rp158301, /cris/rp/rp38918]","[Reforgiato Recupero D;, Mongiovi M;, Ristoski P]",Event-based knowledge reconciliation using fra...,frame embeddings,*,2017,10.1016/j.knosys.2017.08.014,1.01 Articolo in rivista,frame embeddings
1841,[/cris/rp/rp38918],[],Frame Embeddings for Event-Based Knowledge Rec...,embeddings,,2017,,4.01 Contributo in Atti di convegno,embeddings
3170,[/cris/rp/rp38970],[],How “BERTology” changed the state-of-the-art a...,contextualised word embeddings,,2020,10.4000/books.aaccademia.8920,4.01 Contributo in Atti di convegno,contextualised word embeddings
3181,[/cris/rp/rp38970],[],UniBO@KIPoS: Fine-tuning the Italian “BERTolog...,contextualised word embeddings,,2020,,4.01 Contributo in Atti di convegno,contextualised word embeddings
3186,[/cris/rp/rp38970],[],A Quantum-Like Approach to Word Sense Disambig...,complex embeddings,,2019,10.26615/978-954-452-056-4_135,4.01 Contributo in Atti di convegno,complex embeddings
3189,"[/cris/rp/rp157777, /cris/rp/rp38970]",[],Enhancing a Text Summarization System with ELMo,contextual embeddings,,2019,,4.01 Contributo in Atti di convegno,contextual embeddings


In [19]:
print(uk_df['dc.keywords'].nunique(), uk_df['dc_keywords'].nunique())

2575 2443


In [20]:
def similarity_check_df(majors, minors, threshold, dataframe_column):
    matches = {}
    for minor in minors:
        for major in majors:
            similarity = fuzz.token_sort_ratio(minor.lower(), major.lower())
            if similarity >= threshold:
                if minor not in matches or similarity > matches[minor][1]:
                    times = dataframe_column.value_counts()[minor]
                    matches[minor] = (major, similarity, times)
    
    sorted_matches = sorted(matches.items(), key=lambda x: x[1][2], reverse=True)

    results = []
    for minor, (major, similarity, times) in sorted_matches:
        results.append([minor, major, times, similarity])

    df_check = pd.DataFrame(results, columns=['minor_location', 'major_location', 'frequency', 'similarity_score'])
    return df_check
