# Matching der Ergebnisse Scopus und OpenAlex


## Inhalt

* [Zeitscheibe 1: 2010 - 2013](#2010)
  * [Matching](#matching_2010)
* [Zeitscheibe 2: 2020 - 2023](#2020)
  * [Matching](#matching_2020)

In [1]:
import pandas as pd
import numpy as np
from rapidfuzz.distance import Levenshtein
import uuid
import matplotlib.pyplot as plt

pd.options.mode.copy_on_write = True

In [2]:
import jupyter_black

jupyter_black.load()

In [3]:
%load_ext watermark
%watermark -v --iversions

Python implementation: CPython
Python version       : 3.11.2
IPython version      : 8.5.0

jupyter_black: 0.4.0
rapidfuzz    : 3.13.0
matplotlib   : 3.6.3
numpy        : 1.24.2
pandas       : 1.5.3



In [4]:
# implementation based on https://github.com/gavieira/biblioverlap/blob/main/R/04-score_matching.R


def calc_distance_score_matrix(db1_col, db2_col, max_score, penalty):
    """
    Build a score matrix from pairwise normalized Levenshtein distances.

    Every value of db1_col is compared with every value of db2_col after both
    have been converted with str(). The score of a pair is
    ``max_score - normalized_distance * penalty``.

    Parameters
    ----------
    db1_col : sequence
        Values compared along the rows of the result.
    db2_col : sequence
        Values compared along the columns of the result.
    max_score : float
        Score of a pair whose strings are identical (distance 0).
    penalty : float
        Factor applied to the normalized distance before it is subtracted.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(db1_col), len(db2_col)).

    Notes
    -----
    str() turns a missing value (NaN) into the text "nan", so two missing
    values are scored like identical strings.
    """
    row_texts = [str(value) for value in db1_col]
    col_texts = [str(value) for value in db2_col]
    scores = np.zeros((len(db1_col), len(db2_col)), dtype=np.float32)
    for row, left in enumerate(row_texts):
        for col, right in enumerate(col_texts):
            distance = Levenshtein.normalized_distance(left, right)
            scores[row, col] = max_score - distance * penalty
    return scores


def calc_year_score_matrix(db1_col, db2_col):
    """
    Compute the penalty for the difference between publication years.

    Parameters
    ----------
    db1_col : sequence of numbers
        Publication years compared along the rows.
    db2_col : sequence of numbers
        Publication years compared along the columns.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(db1_col), len(db2_col)) holding
        0 for equal years, 0.5 for years that differ by exactly one and
        1.0 for everything else (including missing years).
    """
    # Broadcast a column vector against a row vector instead of looping over
    # every pair in Python.
    years_rows = np.asarray(db1_col, dtype=np.float64).reshape(-1, 1)
    years_cols = np.asarray(db2_col, dtype=np.float64).reshape(1, -1)
    gap = np.abs(years_rows - years_cols)

    # Start from the full penalty; NaN gaps match neither mask and keep it.
    matrix = np.ones(gap.shape, dtype=np.float32)
    matrix[gap == 1] = 0.5
    matrix[gap == 0] = 0.0
    return matrix


def extract_score_matches(final_score_matrix, db1_index, db2_index):
    """
    Pick the best-scoring db2 record for every db1 record.

    Parameters
    ----------
    final_score_matrix : numpy.ndarray
        Score matrix with one row per db1 record and one column per db2
        record. A score of 0 means "no match".
    db1_index : sequence
        Identifiers of the db1 records, in row order.
    db2_index : sequence
        Identifiers of the db2 records, in column order.

    Returns
    -------
    list of dict
        One dict per db1 record whose best score is not 0, with the keys
        "scopus_id", "score" and "openalex_id".
    """
    # max/argmax raise on a matrix without columns; nothing can match then.
    if final_score_matrix.size == 0:
        return []

    # argmax yields column *positions*. Materialize the identifiers so the
    # lookup is positional even when db2_index is a pandas Series whose index
    # labels are not 0..n-1.
    db2_ids = list(db2_index)
    best_scores = final_score_matrix.max(axis=1)
    best_columns = final_score_matrix.argmax(axis=1)
    return [
        {"scopus_id": db1_id, "score": score, "openalex_id": db2_ids[column]}
        for db1_id, score, column in zip(db1_index, best_scores, best_columns)
        if score != 0
    ]


def score_matching(
    db1,
    db2,
    title_penalty,
    title_max,
    source_penalty,
    source_max,
    author_penalty,
    author_max,
    score_cutoff,
):
    """
    Match works of two collections by a combined metadata score.

    The score of a pair is the sum of the title, first-author and source
    title scores minus the publication-year penalty. Scores below
    score_cutoff are set to 0 before the matches are extracted.

    Parameters
    ----------
    db1 : pandas.DataFrame
        Works from Scopus (columns prefixed with "scopus_").
    db2 : pandas.DataFrame
        Works from OpenAlex (columns prefixed with "openalex_").
    title_penalty : float
        Penalty factor of the title score.
    title_max : float
        Maximum of the title score.
    source_penalty : float
        Penalty factor of the source title score.
    source_max : float
        Maximum of the source title score.
    author_penalty : float
        Penalty factor of the first-author score.
    author_max : float
        Maximum of the first-author score.
    score_cutoff : float
        Smallest combined score that still counts as a match.

    Returns
    -------
    matches : list of dict
        Found matches with the keys "scopus_id", "score" and "openalex_id".
    final_score_matrix : numpy.ndarray
        Combined scores; entries below the cutoff are zeroed.
    """
    # (Scopus column, OpenAlex column, maximum score, penalty)
    text_criteria = (
        ("scopus_title", "openalex_title", title_max, title_penalty),
        (
            "scopus_name_1st_author",
            "openalex_name_1st_author",
            author_max,
            author_penalty,
        ),
        (
            "scopus_source_title",
            "openalex_source_title",
            source_max,
            source_penalty,
        ),
    )
    title_scores, author_scores, source_scores = (
        calc_distance_score_matrix(db1[left], db2[right], maximum, penalty)
        for left, right, maximum, penalty in text_criteria
    )
    year_penalties = calc_year_score_matrix(
        db1["scopus_publication_year"], db2["openalex_publication_year"]
    )

    combined = title_scores + author_scores + source_scores - year_penalties
    reaches_cutoff = combined >= score_cutoff
    final_score_matrix = combined * reaches_cutoff

    matches = extract_score_matches(
        final_score_matrix, db1["scopus_id"], db2["openalex_id"]
    )
    return matches, final_score_matrix

In [5]:
# Check: did the score-based step match publications that carry different DOIs?
# --> test for possible false positive matches


def _first_doi_by_id(frame, id_column):
    """Map every id in frame[id_column] to the DOI of its first row."""
    doi_by_id = {}
    for record_id, doi in zip(frame[id_column], frame["doi"]):
        doi_by_id.setdefault(record_id, doi)
    return doi_by_id


def _normalized_doi(doi):
    """Return the DOI as a case-folded string, or None when it is missing."""
    if pd.isna(doi):
        return None
    return str(doi).strip().casefold()


def get_scopus_id_for_false_positive_matches(data_matches, data_scopus, data_openalex):
    """
    Check found matches and identify falsely matched works.

    A match counts as false positive when both records are found in the
    given frames, both carry a DOI and the two DOIs differ. DOIs are compared
    case-insensitively; a missing DOI on either side is not treated as a
    conflict.

    Parameters
    ----------
    data_matches : list of dict
        Found matches, contains scopus_id from db1, score and openalex_id from db2
    data_scopus : pandas.DataFrame
        Collection of works from Scopus (columns "scopus_id" and "doi").
    data_openalex : pandas.DataFrame
        Collection of works from OpenAlex (columns "openalex_id" and "doi").

    Returns
    -------
    index_list_to_be_removed : list
        Positions in data_matches which contain falsely matched works.
    """
    data_matches_df = pd.DataFrame.from_dict(data_matches)
    if data_matches_df.empty:
        return []

    # Build the id -> DOI lookups once instead of filtering both frames for
    # every single match.
    scopus_dois = _first_doi_by_id(data_scopus, "scopus_id")
    openalex_dois = _first_doi_by_id(data_openalex, "openalex_id")

    index_list_to_be_removed = []
    for index, scopus_id, openalex_id in zip(
        data_matches_df.index,
        data_matches_df["scopus_id"],
        data_matches_df["openalex_id"],
    ):
        # Records missing from either frame cannot be checked.
        if scopus_id not in scopus_dois or openalex_id not in openalex_dois:
            continue
        scopus_doi = _normalized_doi(scopus_dois[scopus_id])
        openalex_doi = _normalized_doi(openalex_dois[openalex_id])
        # NaN != NaN is True, so missing DOIs must be excluded explicitly.
        if scopus_doi is None or openalex_doi is None:
            continue
        if scopus_doi != openalex_doi:
            index_list_to_be_removed.append(index)
    return index_list_to_be_removed


def remove_matches_with_different_doi(data_matches, data_scopus, data_openalex):
    """
    Drop the matches that were identified as false positives.

    Parameters
    ----------
    data_matches : list of dict
        Found matches, contains scopus_id from db1, score and openalex_id from db2
    data_scopus : pandas.DataFrame
        Collection of works from Scopus.
    data_openalex : pandas.DataFrame
        Collection of works from OpenAlex.

    Returns
    -------
    data_matches_update : list of dict
        The matches, in their original order, without the entries reported by
        get_scopus_id_for_false_positive_matches.
    """
    rejected_positions = set(
        get_scopus_id_for_false_positive_matches(
            data_matches, data_scopus, data_openalex
        )
    )
    data_matches_update = []
    for position, match in enumerate(data_matches):
        if position in rejected_positions:
            continue
        data_matches_update.append(match)
    return data_matches_update

## Zeitscheibe 1: 2010 - 2013<a class="anchor" id="2010"></a>


In [6]:
# Load the cleaned Scopus and OpenAlex exports of the 2010 - 2013 time slice
data_scopus_2010 = pd.read_csv("../data/processed/data_scopus_2010_cleaned.csv")
data_openalex_2010 = pd.read_csv("../data/processed/data_openalex_2010_cleaned.csv")

In [7]:
data_scopus_2010.columns

Index(['eid', 'doi', 'pii', 'pubmed_id', 'title', 'subtype',
       'subtypeDescription', 'creator', 'afid', 'affilname',
       'affiliation_city', 'affiliation_country', 'author_count',
       'author_names', 'author_ids', 'author_afids', 'coverDate',
       'coverDisplayDate', 'publicationName', 'issn', 'source_id', 'eIssn',
       'aggregationType', 'volume', 'issueIdentifier', 'article_number',
       'pageRange', 'description', 'authkeywords', 'citedby_count',
       'openaccess', 'freetoread', 'freetoreadLabel', 'fund_acr', 'fund_no',
       'fund_sponsor', 'name_1st_author', 'publication_year'],
      dtype='object')

In [8]:
# Prefix the columns used for the comparison with their source so that both
# frames can be compared and merged without name clashes
scopus_2010_column_names = {
    "eid": "scopus_id",
    "title": "scopus_title",
    "name_1st_author": "scopus_name_1st_author",
    "publicationName": "scopus_source_title",
    "volume": "scopus_volume",
    "publication_year": "scopus_publication_year",
}
openalex_2010_column_names = {
    "id": "openalex_id",
    "title": "openalex_title",
    "name_1st_author": "openalex_name_1st_author",
    "source_title": "openalex_source_title",
    "volume": "openalex_volume",
    "publication_year": "openalex_publication_year",
}

data_scopus_2010.rename(columns=scopus_2010_column_names, inplace=True)
data_openalex_2010.rename(columns=openalex_2010_column_names, inplace=True)

In [None]:
data_scopus_2010.head()

In [10]:
data_openalex_2010.columns

Index(['openalex_id', 'doi', 'openalex_title', 'type', 'publication_date',
       'openalex_publication_year', 'openalex_volume', 'issue', 'first_page',
       'last_page', 'authors', 'keywords', 'abstract', 'openalex_source_title',
       'source_issn', 'source_type', 'cited_by_count', 'referenced_works',
       'openalex_name_1st_author'],
      dtype='object')

In [11]:
data_openalex_2010.index[-1]

5000

In [12]:
data_openalex_2010.head()

Unnamed: 0,openalex_id,doi,openalex_title,type,publication_date,openalex_publication_year,openalex_volume,issue,first_page,last_page,authors,keywords,abstract,openalex_source_title,source_issn,source_type,cited_by_count,referenced_works,openalex_name_1st_author
0,https://openalex.org/W2035931858,10.1063/1.4757907,crystallinity of inorganic films grown by atom...,article,2013-01-08,2013,113,2,,,"Ville Miikkulainen, Markku Leskelä, Mikko Rita...",Deposition,Atomic layer deposition (ALD) is gaining atten...,journal of applied physics,"['0021-8979', '1089-7550', '1520-8850']",journal,1339,"['https://openalex.org/W101290300', 'https://o...",Miikkulainen
1,https://openalex.org/W2025831662,10.1021/ja4050828,vapor-phase metalation by atomic layer deposit...,article,2013-07-05,2013,135,28,10294.0,10297.0,"Joseph E. Mondloch, Wojciech Bury, David Faire...","Metalation, Deposition",Metal–organic frameworks (MOFs) have received ...,journal of the american chemical society,"['0002-7863', '1520-5126', '1943-2984']",journal,899,"['https://openalex.org/W1578420299', 'https://...",Mondloch
2,https://openalex.org/W2149248689,10.1116/1.3609974,plasma-assisted atomic layer deposition: basic...,article,2011-08-18,2011,29,5,,,"Harald B. Profijt, Stephen E. Potts, M. C. M. ...","Microelectronics, Plasma Processing, Deposition",Plasma-assisted atomic layer deposition (ALD) ...,journal of vacuum science & technology a vacuu...,"['0734-2101', '1520-8559']",journal,811,"['https://openalex.org/W1508775127', 'https://...",Profijt
3,https://openalex.org/W2054254706,10.1126/science.1212906,coking- and sintering-resistant palladium cata...,article,2012-03-08,2012,335,6073,1205.0,1208.0,"Junling Lu, Baosong Fu, Mayfair C. Kung, Guomi...","Thermogravimetric analysis, Deposition",We showed that alumina (Al(2)O(3)) overcoating...,science,"['0036-8075', '1095-9203']",journal,795,"['https://openalex.org/W1547883012', 'https://...",Lu
4,https://openalex.org/W2151747661,10.1038/srep01775,single-atom catalysis using pt/graphene achiev...,article,2013-05-03,2013,3,1,,,"Shuhui Sun, Gaixia Zhang, Nicolas Gauquelin, N...","Nanosheet, Deposition",Platinum-nanoparticle-based catalysts are wide...,scientific reports,['2045-2322'],journal,837,"['https://openalex.org/W1495146461', 'https://...",Sun


### Matching<a class="anchor" id="matching_2010"></a>

Das Vorgehen zum Abgleich der Datenbanken bzw. zum Matching der Datenbankeinträge orientiert sich an [Vieira & Leta, 2024](https://doi.org/10.1007/s11192-024-05065-5), die die dokumentbasierte Matching-Strategie der R-Bibliothek *biblioverlap* beschreiben. 

In [13]:
# Create a UUID for the unique identification of the records.
# Scopus donates the UUID; once a match is found, the OpenAlex record is
# linked to it by receiving the same UUID.

data_scopus_2010["uuid"] = [uuid.uuid4() for _ in data_scopus_2010.index]
data_openalex_2010["uuid"] = pd.Series(dtype="object")

In [14]:
data_scopus_2010["uuid"]

0       d676dcb2-2d94-47e2-ab80-f857fee22536
1       8f5df863-ccfe-424b-b8ea-e00551c3d542
2       3b34101c-b6fc-4373-b90f-db3f7d6b92d0
3       23e52f82-eabe-41f9-9b0b-100990e512ea
4       5df2cf85-2690-45db-8739-db3e1deaccea
                        ...                 
4001    ec1d84f6-8952-4063-bed1-66685ef741db
4002    298f65c1-9b8b-448b-83d5-ef18f6d511b3
4003    34525b1f-64ac-48bb-8eba-efc43a7ef3ea
4004    f3622463-312a-4339-8b52-5a15f5916773
4005    998e588c-98bc-4e9f-a6c2-3ba9743fd596
Name: uuid, Length: 4006, dtype: object

In [15]:
data_openalex_2010["uuid"]

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
       ... 
4996    NaN
4997    NaN
4998    NaN
4999    NaN
5000    NaN
Name: uuid, Length: 5001, dtype: object

In [16]:
# In addition, a boolean flag marks the records that have already been matched
data_scopus_2010["matched"] = False
data_openalex_2010["matched"] = False

In [17]:
# Step 1: matching by DOI
# First, the Scopus records with a DOI are compared with the OpenAlex records
# with a DOI.

# Keep only the records that have a DOI and renumber their rows from 0
scopus_2010_has_doi = data_scopus_2010["doi"].notnull()
openalex_2010_has_doi = data_openalex_2010["doi"].notnull()

data_scopus_2010_mitdoi = data_scopus_2010[scopus_2010_has_doi].reset_index(
    drop=True
)
data_openalex_2010_mitdoi = data_openalex_2010[openalex_2010_has_doi].reset_index(
    drop=True
)

print(len(data_scopus_2010_mitdoi))
print(len(data_openalex_2010_mitdoi))

3837
4375


In [18]:
# Link the records whose DOI occurs in both sources.
# A DOI -> UUID lookup replaces the nested row-by-row comparison. When a DOI
# occurs several times in Scopus, the UUID of its last row is used, which is
# what the nested loop produced as well.
scopus_uuid_by_doi = dict(
    zip(data_scopus_2010_mitdoi["doi"], data_scopus_2010_mitdoi["uuid"])
)
scopus_has_partner = data_scopus_2010_mitdoi["doi"].isin(
    set(data_openalex_2010_mitdoi["doi"])
)
openalex_has_partner = data_openalex_2010_mitdoi["doi"].isin(set(scopus_uuid_by_doi))

# mark rows as matched
data_scopus_2010_mitdoi.loc[scopus_has_partner, "matched"] = True
data_openalex_2010_mitdoi.loc[openalex_has_partner, "matched"] = True

# write UUID to OpenAlex
data_openalex_2010_mitdoi.loc[openalex_has_partner, "uuid"] = (
    data_openalex_2010_mitdoi.loc[openalex_has_partner, "doi"].map(scopus_uuid_by_doi)
)

In [19]:
# Collect the DOI-matched rows of both sources and report their number
data_scopus_2010_doi_matched = data_scopus_2010_mitdoi[
    data_scopus_2010_mitdoi["matched"]
]
data_openalex_2010_doi_matched = data_openalex_2010_mitdoi[
    data_openalex_2010_mitdoi["matched"]
]
num_scopus = data_scopus_2010_doi_matched.shape[0]
num_openalex = data_openalex_2010_doi_matched.shape[0]

if num_scopus == num_openalex:
    print(f"Es wurden {num_scopus} übereinstimmende DOIs gefunden.")
else:
    # Differing counts mean that a DOI occurs more than once in one source;
    # do not let that pass silently.
    print(
        f"Achtung: Die Anzahl der per DOI gematchten Einträge weicht ab "
        f"(Scopus: {num_scopus}, OpenAlex: {num_openalex}). "
        f"Bitte auf doppelte DOIs prüfen."
    )

Es wurden 2872 übereinstimmende DOIs gefunden.


Für alle weiteren Datensätze wird in einem zweiten Schritt ein Matching anhand der Metadaten erzielt.   
Die Entwicklung des genutzten Scores erfolgt in einem separaten Notebook ('05a_Matching_score.ipynb'). Hierfür wurden die per DOI gematchten Daten für die Zeitscheibe 2010 - 2013 genutzt.

In [None]:
# Export the DOI-matched pairs as reference data for the score development.
# Both frames were renamed before the DOI matching, so the columns have to be
# selected by their prefixed names (the original names "eid", "title", "id",
# ... no longer exist and would raise a KeyError).

# scopus data
data_doi_matched_scopus = data_scopus_2010_doi_matched[
    [
        "scopus_id",
        "doi",
        "scopus_title",
        "scopus_name_1st_author",
        "scopus_source_title",
        "scopus_volume",
        "scopus_publication_year",
    ]
].copy()

# openalex data
data_doi_matched_openalex = data_openalex_2010_doi_matched[
    [
        "openalex_id",
        "doi",
        "openalex_title",
        "openalex_name_1st_author",
        "openalex_source_title",
        "openalex_volume",
        "openalex_publication_year",
    ]
].copy()

# join
data_doi_matched = pd.merge(
    data_doi_matched_scopus, data_doi_matched_openalex, on="doi"
)

# save data
data_doi_matched.to_csv("../data/processed/data_doi_matched.csv", index=False)

In [20]:
# Build the sets that still have to be matched: the records without a DOI
# plus the records with a DOI that found no partner in step 1

scopus_2010_doi_open = ~data_scopus_2010_mitdoi["matched"]
openalex_2010_doi_open = ~data_openalex_2010_mitdoi["matched"]
data_scopus_2010_doi_unmatched = data_scopus_2010_mitdoi[scopus_2010_doi_open]
data_openalex_2010_doi_unmatched = data_openalex_2010_mitdoi[openalex_2010_doi_open]

scopus_2010_lacks_doi = data_scopus_2010["doi"].isnull()
openalex_2010_lacks_doi = data_openalex_2010["doi"].isnull()
data_scopus_2010_nodoi = data_scopus_2010[scopus_2010_lacks_doi]
data_openalex_2010_nodoi = data_openalex_2010[openalex_2010_lacks_doi]

# ignore_index renumbers the rows of the combined frames from 0
data_scopus_2010_unmatched = pd.concat(
    [data_scopus_2010_doi_unmatched, data_scopus_2010_nodoi], ignore_index=True
)
data_openalex_2010_unmatched = pd.concat(
    [data_openalex_2010_doi_unmatched, data_openalex_2010_nodoi], ignore_index=True
)

In [21]:
# Report per source the total number of records and how many of them
# have / do not have a DOI
print(
    f"Für Scopus sind {data_scopus_2010.shape[0]} Einträge vorhanden. Hiervon haben {data_scopus_2010_mitdoi.shape[0]} eine DOI und {data_scopus_2010_nodoi.shape[0]} keine DOI."
)
print(
    f"Für OpenAlex sind {data_openalex_2010.shape[0]} Einträge vorhanden. Hiervon haben {data_openalex_2010_mitdoi.shape[0]} eine DOI und {data_openalex_2010_nodoi.shape[0]} keine DOI."
)

Für Scopus sind 4006 Einträge vorhanden. Hiervon haben 3837 eine DOI und 169 keine DOI.
Für OpenAlex sind 5001 Einträge vorhanden. Hiervon haben 4375 eine DOI und 626 keine DOI.


In [22]:
# Report, per metadata field, how many records of each source lack a value.
# Each entry: (label, Scopus column, OpenAlex column, text printed after the
# OpenAlex count). The trailing text only controls the spacing of the report.
missing_value_checks_2010 = [
    ("Titel", "scopus_title", "openalex_title", " \n"),
    ("Publikationsjahr", "scopus_publication_year", "openalex_publication_year", " \n"),
    ("Autorennamen", "scopus_name_1st_author", "openalex_name_1st_author", " \n"),
    # title of the source (e.g. journal)
    ("Quellentitel", "scopus_source_title", "openalex_source_title", " \n"),
    ("Ausgabe (Volume)", "scopus_volume", "openalex_volume", "\n"),
    ("ISSN", "issn", "source_issn", "\n"),
    ("Seitenangaben", "pageRange", "first_page", "\n"),
    ("Abstract", "description", "abstract", ""),
]

for label, scopus_column, openalex_column, trailer in missing_value_checks_2010:
    print(f"Scopus ohne {label}: {data_scopus_2010[scopus_column].isnull().sum()}")
    print(
        f"OpenAlex ohne {label}: {data_openalex_2010[openalex_column].isnull().sum()}{trailer}"
    )

Scopus ohne Titel: 0
OpenAlex ohne Titel: 0 

Scopus ohne Publikationsjahr: 0
OpenAlex ohne Publikationsjahr: 0 

Scopus ohne Autorennamen: 5
OpenAlex ohne Autorennamen: 8 

Scopus ohne Quellentitel: 0
OpenAlex ohne Quellentitel: 641 

Scopus ohne Ausgabe (Volume): 428
OpenAlex ohne Ausgabe (Volume): 964

Scopus ohne ISSN: 391
OpenAlex ohne ISSN: 960

Scopus ohne Seitenangaben: 1233
OpenAlex ohne Seitenangaben: 1316

Scopus ohne Abstract: 61
OpenAlex ohne Abstract: 703


In [23]:
# Size of the two sets that go into the score-based matching ...
for frame in (data_scopus_2010_unmatched, data_openalex_2010_unmatched):
    print(frame.shape[0])

# ... and how many of their rows are already flagged as matched
for frame in (data_scopus_2010_unmatched, data_openalex_2010_unmatched):
    already_matched = frame[frame["matched"] == True]
    print(already_matched.shape[0])

1134
2129
0
0


In [None]:
data_scopus_2010_unmatched

In [25]:
# Score-based matching of all records that were not matched by DOI.
# With these weights the best attainable score is 0.5 + 0.3 + 0.3 = 1.1
# (identical title, source title and first author, same publication year);
# pairs scoring below the cutoff of 1.0 are discarded.
matches_2010, score_matrix_2010 = score_matching(
    data_scopus_2010_unmatched,
    data_openalex_2010_unmatched,
    title_penalty=0.1,
    title_max=0.5,
    source_penalty=0.1,
    source_max=0.3,
    author_penalty=0.1,
    author_max=0.3,
    score_cutoff=1.0,
)

In [26]:
# Remove matches whose DOIs differ (presumably false positives).
# Only the frames restricted to records with a DOI are passed in, so a match
# is only checked when both of its records are found there.
matches_2010 = remove_matches_with_different_doi(
    matches_2010, data_scopus_2010_mitdoi, data_openalex_2010_mitdoi
)

In [27]:
matches_2010_df = pd.DataFrame.from_dict(matches_2010)
matches_2010_df

Unnamed: 0,scopus_id,score,openalex_id
0,2-s2.0-84887892210,1.09375,https://openalex.org/W2587422537
1,2-s2.0-84887868619,1.038793,https://openalex.org/W3036867614
2,2-s2.0-84887112409,1.041448,https://openalex.org/W590265867
3,2-s2.0-84889053031,1.044003,https://openalex.org/W343490430
4,2-s2.0-84883141524,1.062424,https://openalex.org/W278034884
5,2-s2.0-84880995336,1.006,https://openalex.org/W2417444939
6,2-s2.0-84878700951,1.1,https://openalex.org/W1188059325
7,2-s2.0-84878679783,1.1,https://openalex.org/W2143765521
8,2-s2.0-84880858566,1.048907,https://openalex.org/W592904155
9,2-s2.0-78651500481,1.008333,https://openalex.org/W3144443442


In [28]:
matches_2010_df[matches_2010_df.duplicated("scopus_id", keep=False)]

Unnamed: 0,scopus_id,score,openalex_id


In [29]:
matches_2010_df[matches_2010_df.duplicated("openalex_id", keep=False)]

Unnamed: 0,scopus_id,score,openalex_id


In [30]:
# Copy the results of the score-based matching back to the dataframes.
# Rows are addressed by index *label* throughout (.at); the former code fed
# labels into .iloc, which is only correct while the index is exactly 0..n-1.
data_openalex_2010_unmatched["score"] = None
for match in matches_2010:
    # label of the first row carrying the matched id in each frame
    scopus_label = data_scopus_2010_unmatched[
        data_scopus_2010_unmatched["scopus_id"] == match["scopus_id"]
    ].index[0]
    openalex_label = data_openalex_2010_unmatched[
        data_openalex_2010_unmatched["openalex_id"] == match["openalex_id"]
    ].index[0]

    # set entries as matched
    data_scopus_2010_unmatched.at[scopus_label, "matched"] = True
    data_openalex_2010_unmatched.at[openalex_label, "matched"] = True

    # write UUID to OpenAlex
    data_openalex_2010_unmatched.at[openalex_label, "uuid"] = (
        data_scopus_2010_unmatched.at[scopus_label, "uuid"]
    )

    # write score to openalex data
    data_openalex_2010_unmatched.at[openalex_label, "score"] = match["score"]

In [31]:
data_scopus_2010_unmatched[data_scopus_2010_unmatched["matched"] == True].shape[0]

52

In [32]:
data_scopus_2010_unmatched[data_scopus_2010_unmatched["matched"] == False].shape[0]

1082

In [33]:
data_openalex_2010_unmatched[data_openalex_2010_unmatched["matched"] == False].shape[0]

2077

In [34]:
# Collect the score-matched rows of both sources and report their number
data_scopus_2010_score_matched = data_scopus_2010_unmatched[
    data_scopus_2010_unmatched["matched"]
]
data_openalex_2010_score_matched = data_openalex_2010_unmatched[
    data_openalex_2010_unmatched["matched"]
]
num_scopus = data_scopus_2010_score_matched.shape[0]
num_openalex = data_openalex_2010_score_matched.shape[0]

if num_scopus == num_openalex:
    # These matches come from the metadata score, not from DOIs.
    print(f"Es wurden {num_scopus} Übereinstimmungen anhand des Scores gefunden.")
else:
    # Differing counts mean that several records of one source were assigned
    # to the same record of the other source.
    print(
        f"Achtung: Die Anzahl der per Score gematchten Einträge weicht ab "
        f"(Scopus: {num_scopus}, OpenAlex: {num_openalex}). "
        f"Bitte auf Mehrfachzuordnungen prüfen."
    )

Es wurden 52 übereinstimmende DOIs gefunden.


In [35]:
# Sanity check: list score-matched OpenAlex rows that share a UUID, i.e.
# several OpenAlex records that were linked to the same Scopus record
data_openalex_matched = data_openalex_2010_unmatched[
    data_openalex_2010_unmatched["matched"] == True
]
data_openalex_matched[data_openalex_matched.duplicated("uuid", keep=False)]

Unnamed: 0,openalex_id,doi,openalex_title,type,publication_date,openalex_publication_year,openalex_volume,issue,first_page,last_page,...,abstract,openalex_source_title,source_issn,source_type,cited_by_count,referenced_works,openalex_name_1st_author,uuid,matched,score


In [36]:
# Combine the results of both matching steps

data_scopus_2010_matched = pd.concat(
    [data_scopus_2010_doi_matched, data_scopus_2010_score_matched]
)
data_openalex_2010_matched = pd.concat(
    [data_openalex_2010_doi_matched, data_openalex_2010_score_matched]
)

# Join both sources on the shared UUID, keep the Scopus DOI and the OpenAlex
# "matched" flag under their plain names, and drop the helper columns (the
# UUID is not stored).
# NOTE(review): "doi_y" is discarded, so a DOI that only OpenAlex provides is
# not carried over for score-based matches.
data_2010_matched = (
    pd.merge(data_scopus_2010_matched, data_openalex_2010_matched, on="uuid")
    .rename(columns={"matched_y": "matched", "doi_x": "doi"})
    .drop(columns=["uuid", "matched_x", "doi_y"])
)

In [37]:
data_2010_matched.columns

Index(['scopus_id', 'doi', 'pii', 'pubmed_id', 'scopus_title', 'subtype',
       'subtypeDescription', 'creator', 'afid', 'affilname',
       'affiliation_city', 'affiliation_country', 'author_count',
       'author_names', 'author_ids', 'author_afids', 'coverDate',
       'coverDisplayDate', 'scopus_source_title', 'issn', 'source_id', 'eIssn',
       'aggregationType', 'scopus_volume', 'issueIdentifier', 'article_number',
       'pageRange', 'description', 'authkeywords', 'citedby_count',
       'openaccess', 'freetoread', 'freetoreadLabel', 'fund_acr', 'fund_no',
       'fund_sponsor', 'scopus_name_1st_author', 'scopus_publication_year',
       'openalex_id', 'openalex_title', 'type', 'publication_date',
       'openalex_publication_year', 'openalex_volume', 'issue', 'first_page',
       'last_page', 'authors', 'keywords', 'abstract', 'openalex_source_title',
       'source_issn', 'source_type', 'cited_by_count', 'referenced_works',
       'openalex_name_1st_author', 'matched', 'scor

In [38]:
# Save the matched pairs and, per source, the records without a partner
data_2010_matched.to_csv("../data/diff/data_2010_matched.csv")

only_scopus_2010 = data_scopus_2010_unmatched[~data_scopus_2010_unmatched["matched"]]
only_scopus_2010.to_csv("../data/diff/data_2010_only_scopus.csv")

only_openalex_2010 = data_openalex_2010_unmatched[
    ~data_openalex_2010_unmatched["matched"]
]
only_openalex_2010.to_csv("../data/diff/data_2010_only_openalex.csv")

In [39]:
# Data for publication (only DOI, EID and ID)
df_2010_matched = data_2010_matched[
    [
        "doi",
        "scopus_id",
        "openalex_id",
    ]
]
# The "*_unmatched" frames also contain the rows matched by score, which are
# already part of df_2010_matched. Keep only the rows that are still
# unmatched so that no record is exported twice.
df_2010_only_scopus = data_scopus_2010_unmatched.loc[
    ~data_scopus_2010_unmatched["matched"],
    [
        "doi",
        "scopus_id",
    ],
]
df_2010_only_openalex = data_openalex_2010_unmatched.loc[
    ~data_openalex_2010_unmatched["matched"],
    [
        "doi",
        "openalex_id",
    ],
]

df_2010_matched_all = pd.concat(
    [df_2010_matched, df_2010_only_scopus, df_2010_only_openalex]
)

In [40]:
# check: correct df size
# NOTE(review): df_2010_matched_all is the row-wise concat of exactly these
# three frames, so this equality holds by construction; it cannot detect
# records that occur in more than one of the parts.
df_2010_matched_all.shape[0] == (
    df_2010_matched.shape[0]
    + df_2010_only_scopus.shape[0]
    + df_2010_only_openalex.shape[0]
)

True

In [41]:
# Save the combined matching results of the 2010 - 2013 time slice
df_2010_matched_all.to_csv("../data/diff/data_2010_matching_results.csv")

## Zeitscheibe 2: 2020 - 2023<a class="anchor" id="2020"></a>

In [42]:
# Load the cleaned Scopus and OpenAlex exports of the 2020 - 2023 time slice
data_scopus_2020 = pd.read_csv("../data/processed/data_scopus_2020_cleaned.csv")
data_openalex_2020 = pd.read_csv("../data/processed/data_openalex_2020_cleaned.csv")

In [43]:
# Prefix the columns used for the comparison with their source so that both
# frames can be compared and merged without name clashes
scopus_2020_column_names = {
    "eid": "scopus_id",
    "title": "scopus_title",
    "name_1st_author": "scopus_name_1st_author",
    "publicationName": "scopus_source_title",
    "volume": "scopus_volume",
    "publication_year": "scopus_publication_year",
}
openalex_2020_column_names = {
    "id": "openalex_id",
    "title": "openalex_title",
    "name_1st_author": "openalex_name_1st_author",
    "source_title": "openalex_source_title",
    "volume": "openalex_volume",
    "publication_year": "openalex_publication_year",
}

data_scopus_2020.rename(columns=scopus_2020_column_names, inplace=True)
data_openalex_2020.rename(columns=openalex_2020_column_names, inplace=True)

In [None]:
data_scopus_2020.head()

In [45]:
data_openalex_2020.head()

Unnamed: 0,openalex_id,doi,openalex_title,type,publication_date,openalex_publication_year,openalex_volume,issue,first_page,last_page,authors,keywords,abstract,openalex_source_title,source_issn,source_type,cited_by_count,referenced_works,openalex_name_1st_author
0,https://openalex.org/W3001010696,10.1063/1.5133390,understanding chemical and physical mechanisms...,article,2020-01-22,2020,152,4,,,"Nathaniel E. Richey, Camila de Paula, Stacey F...","Chemisorption, Deposition, Atomic layer epitaxy",Atomic layer deposition (ALD) is a powerful to...,the journal of chemical physics,"['0021-9606', '1089-7690', '1520-9032']",journal,229,"['https://openalex.org/W1905740964', 'https://...",Richey
1,https://openalex.org/W3168021923,10.1021/acscatal.1c01200,single-atom catalysts designed and prepared by...,article,2021-06-02,2021,11,12,7018.0,7059.0,"Javier Fonseca, Junling Lu","Atomic units, Characterization, Deposition, At...",The atomic layer deposition (ALD) technique al...,acs catalysis,['2155-5435'],journal,206,"['https://openalex.org/W1162924051', 'https://...",Fonseca
2,https://openalex.org/W3022712516,10.1039/d0ee00385a,applications of atomic layer deposition and ch...,article,2020-01-01,2020,13,7,1997.0,2023.0,"James A. Raiford, Solomon T. Oyakhire, Stacey ...","Deposition, Perovskite solar cell",A review on the versatility of atomic layer de...,energy & environmental science,"['1754-5692', '1754-5706']",journal,155,"['https://openalex.org/W1184977109', 'https://...",Raiford
3,https://openalex.org/W4212902471,10.1038/s41928-022-00718-w,scaled indium oxide transistors fabricated usi...,article,2022-02-21,2022,5,3,164.0,170.0,"Mengwei Si, Zehao Lin, Zhizhong Chen, Xing Sun...",Transconductance,,nature electronics,['2520-1131'],journal,179,"['https://openalex.org/W1990230218', 'https://...",Si
4,https://openalex.org/W3006809768,10.1021/acs.chemmater.9b04647,inherently selective atomic layer deposition a...,article,2020-02-26,2020,32,6,2195.0,2207.0,"Kun Cao, Jiaming Cai, Rong Chen","Dangling bond, Passivation",The chemical approaches enabling selective ato...,chemistry of materials,"['0897-4756', '1520-5002']",journal,108,"['https://openalex.org/W1463728261', 'https://...",Cao


### Matching<a class="anchor" id="matching_2020"></a>

In [46]:
# Create a UUID for the unique identification of the records.
# Scopus donates the UUID; once a match is found, the OpenAlex record is
# linked to it by receiving the same UUID.

data_scopus_2020["uuid"] = [uuid.uuid4() for _ in data_scopus_2020.index]
data_openalex_2020["uuid"] = pd.Series(dtype="object")

# In addition, a boolean flag marks the records that have already been matched
for frame_2020 in (data_scopus_2020, data_openalex_2020):
    frame_2020["matched"] = False

In [47]:
# Split both sources into records with and without a DOI
scopus_2020_lacks_doi = data_scopus_2020["doi"].isnull()
data_scopus_2020_nodoi = data_scopus_2020[scopus_2020_lacks_doi]
data_scopus_2020_mitdoi = data_scopus_2020[~scopus_2020_lacks_doi]

openalex_2020_lacks_doi = data_openalex_2020["doi"].isnull()
data_openalex_2020_nodoi = data_openalex_2020[openalex_2020_lacks_doi]
data_openalex_2020_mitdoi = data_openalex_2020[~openalex_2020_lacks_doi]

print(
    f"Für Scopus wurden {data_scopus_2020.shape[0]} Einträge gefunden. Hiervon haben {data_scopus_2020_mitdoi.shape[0]} eine DOI und {data_scopus_2020_nodoi.shape[0]} keine DOI."
)
print(
    f"Für OpenAlex wurden {data_openalex_2020.shape[0]} Einträge gefunden. Hiervon haben {data_openalex_2020_mitdoi.shape[0]} eine DOI und {data_openalex_2020_nodoi.shape[0]} keine DOI."
)

Für Scopus wurden 6476 Einträge gefunden. Hiervon haben 6388 eine DOI und 88 keine DOI.
Für OpenAlex wurden 6282 Einträge gefunden. Hiervon haben 6118 eine DOI und 164 keine DOI.


In [48]:
# Report, per metadata field, how many records lack a value in each source.
# Each entry: (label, Scopus column, OpenAlex column, text appended to the
# OpenAlex line). The trailers differ between fields and are kept exactly so
# the printed report is unchanged.
missing_value_checks_2020 = [
    ("Titel", "scopus_title", "openalex_title", " \n"),
    (
        "Publikationsjahr",
        "scopus_publication_year",
        "openalex_publication_year",
        " \n",
    ),
    ("Autorennamen", "scopus_name_1st_author", "openalex_name_1st_author", " \n"),
    # title of the source (e.g. journal)
    ("Quellentitel", "scopus_source_title", "openalex_source_title", " \n"),
    ("Ausgabe (Volume)", "scopus_volume", "openalex_volume", "\n"),
    ("ISSN", "issn", "source_issn", "\n"),
    # page information: Scopus stores a page range, OpenAlex a first page
    ("Seitenangaben", "pageRange", "first_page", "\n"),
    ("Abstract", "description", "abstract", ""),
]

for field_label, scopus_col, openalex_col, trailer in missing_value_checks_2020:
    missing_scopus = int(data_scopus_2020[scopus_col].isnull().sum())
    missing_openalex = int(data_openalex_2020[openalex_col].isnull().sum())
    print(f"Scopus ohne {field_label}: {missing_scopus}")
    print(f"OpenAlex ohne {field_label}: {missing_openalex}{trailer}")

Scopus ohne Titel: 0
OpenAlex ohne Titel: 0 

Scopus ohne Publikationsjahr: 0
OpenAlex ohne Publikationsjahr: 0 

Scopus ohne Autorennamen: 7
OpenAlex ohne Autorennamen: 15 

Scopus ohne Quellentitel: 0
OpenAlex ohne Quellentitel: 465 

Scopus ohne Ausgabe (Volume): 345
OpenAlex ohne Ausgabe (Volume): 763

Scopus ohne ISSN: 1891
OpenAlex ohne ISSN: 716

Scopus ohne Seitenangaben: 3530
OpenAlex ohne Seitenangaben: 1684

Scopus ohne Abstract: 18
OpenAlex ohne Abstract: 750


In [49]:
# Step 1: matching by DOI.
# First the Scopus records with a DOI are compared with the OpenAlex records
# with a DOI, so only those subsets are considered here.

# Renumber the rows from 0 so that labels and positions coincide.
data_scopus_2020_mitdoi = data_scopus_2020_mitdoi.reset_index(drop=True)
data_openalex_2020_mitdoi = data_openalex_2020_mitdoi.reset_index(drop=True)

print(len(data_scopus_2020_mitdoi), len(data_openalex_2020_mitdoi), sep="\n")

6388
6118


In [50]:
# Exact DOI matching between Scopus and OpenAlex.
# A lookup replaces the former nested iterrows() loops, which compared every
# Scopus row with every OpenAlex row (rows_scopus * rows_openalex Python-level
# comparisons) and made this cell needlessly slow.

# DOI -> UUID of the Scopus record carrying it. If a DOI occurs several times in
# Scopus, the last occurrence wins, exactly as with the nested loops.
scopus_uuid_by_doi = dict(
    zip(data_scopus_2020_mitdoi["doi"], data_scopus_2020_mitdoi["uuid"])
)

# A record counts as matched when its DOI also occurs in the other source.
scopus_doi_hit = data_scopus_2020_mitdoi["doi"].isin(data_openalex_2020_mitdoi["doi"])
openalex_doi_hit = data_openalex_2020_mitdoi["doi"].isin(
    data_scopus_2020_mitdoi["doi"]
)

# mark rows as matched
data_scopus_2020_mitdoi.loc[scopus_doi_hit, "matched"] = True
data_openalex_2020_mitdoi.loc[openalex_doi_hit, "matched"] = True

# write UUID to OpenAlex
data_openalex_2020_mitdoi.loc[openalex_doi_hit, "uuid"] = data_openalex_2020_mitdoi.loc[
    openalex_doi_hit, "doi"
].map(scopus_uuid_by_doi)

# No. of matched items
data_scopus_2020_doi_matched = data_scopus_2020_mitdoi[
    data_scopus_2020_mitdoi["matched"] == True
]
data_openalex_2020_doi_matched = data_openalex_2020_mitdoi[
    data_openalex_2020_mitdoi["matched"] == True
]
num_scopus = data_scopus_2020_doi_matched.shape[0]
num_openalex = data_openalex_2020_doi_matched.shape[0]

if num_scopus == num_openalex:
    print(f"Es wurden {num_scopus} übereinstimmende DOIs gefunden.")
else:
    # Differing counts mean a DOI occurs more than once in one of the sources;
    # report it instead of staying silent.
    print(
        f"Achtung: {num_scopus} Scopus-Einträge, aber {num_openalex} OpenAlex-Einträge wurden per DOI gematcht (mehrfach vorkommende DOIs?)."
    )

Es wurden 4237 übereinstimmende DOIs gefunden.


In [51]:
# Step 2: matching by score.
# Assemble the records that still need matching: those with a DOI that found
# no partner in step 1, plus all records without a DOI.

data_scopus_2020_doi_unmatched = data_scopus_2020_mitdoi.loc[
    data_scopus_2020_mitdoi["matched"].eq(False)
]
data_openalex_2020_doi_unmatched = data_openalex_2020_mitdoi.loc[
    data_openalex_2020_mitdoi["matched"].eq(False)
]

data_scopus_2020_nodoi = data_scopus_2020.loc[data_scopus_2020["doi"].isna()]
data_openalex_2020_nodoi = data_openalex_2020.loc[data_openalex_2020["doi"].isna()]

# ignore_index=True renumbers the combined rows from 0.
scopus_parts_2020 = [data_scopus_2020_doi_unmatched, data_scopus_2020_nodoi]
openalex_parts_2020 = [data_openalex_2020_doi_unmatched, data_openalex_2020_nodoi]
data_scopus_2020_unmatched = pd.concat(scopus_parts_2020, ignore_index=True)
data_openalex_2020_unmatched = pd.concat(openalex_parts_2020, ignore_index=True)

In [52]:
# Score-based matching of the records left over after the DOI step.
# NOTE(review): the *_max / *_penalty pairs presumably parametrise the per-field
# distance scores and score_cutoff the acceptance threshold; confirm against
# the definition of score_matching.
scoring_options_2020 = {
    "title_penalty": 0.1,
    "title_max": 0.5,
    "source_penalty": 0.1,
    "source_max": 0.3,
    "author_penalty": 0.1,
    "author_max": 0.3,
    "score_cutoff": 1.0,
}
matches_2020, score_matrix_2020 = score_matching(
    data_scopus_2020_unmatched,
    data_openalex_2020_unmatched,
    **scoring_options_2020,
)

In [53]:
# Remove matches whose two records carry different DOIs (presumably false
# positives). The DOI-bearing subsets of both sources are passed along with the
# matches; the result replaces matches_2020.
matches_2020 = remove_matches_with_different_doi(
    matches_2020, data_scopus_2020_mitdoi, data_openalex_2020_mitdoi
)

In [54]:
# Tabulate the score matches (one row per match) and preview the first rows.
matches_2020_df = pd.DataFrame(matches_2020)
matches_2020_df.head(5)

Unnamed: 0,scopus_id,score,openalex_id
0,2-s2.0-85091847825,1.0,https://openalex.org/W3036865856
1,2-s2.0-85213956793,1.003191,https://openalex.org/W4400943923
2,2-s2.0-85176363689,1.005455,https://openalex.org/W4385365359
3,2-s2.0-85174120522,1.004054,https://openalex.org/W4386762134
4,2-s2.0-85128729486,1.01087,https://openalex.org/W4310748616


In [55]:
# Number of score-based matches found.
len(matches_2020_df)

26

In [56]:
# Show every match whose Scopus ID occurs more than once.
matches_2020_df.loc[matches_2020_df["scopus_id"].duplicated(keep=False)]

Unnamed: 0,scopus_id,score,openalex_id


In [57]:
# Show every match whose OpenAlex ID occurs more than once.
matches_2020_df.loc[matches_2020_df["openalex_id"].duplicated(keep=False)]

Unnamed: 0,scopus_id,score,openalex_id
17,2-s2.0-85120485661,1.05577,https://openalex.org/W3194456814
24,2-s2.0-85119440571,1.028205,https://openalex.org/W3194456814


In [58]:
# An OpenAlex record may be matched to more than one Scopus record. For every
# OpenAlex ID keep only the match with the highest score (the first one on a
# tie). Unlike deleting a hard-coded list position, this is safe to re-run and
# keeps working when the input data, and therefore the match order, changes.
best_match_per_openalex_id = {}
for match in matches_2020:
    best_so_far = best_match_per_openalex_id.get(match["openalex_id"])
    if best_so_far is None or match["score"] > best_so_far["score"]:
        best_match_per_openalex_id[match["openalex_id"]] = match

# Slice assignment updates the existing list object in place.
matches_2020[:] = [
    match
    for match in matches_2020
    if best_match_per_openalex_id[match["openalex_id"]] is match
]

In [59]:
# Transfer the score-based matches back into the two "unmatched" frames.
data_openalex_2020_unmatched["score"] = None

# The column positions do not change inside the loop, so resolve them once.
scopus_matched_col = data_scopus_2020_unmatched.columns.get_loc("matched")
scopus_uuid_col = data_scopus_2020_unmatched.columns.get_loc("uuid")
openalex_matched_col = data_openalex_2020_unmatched.columns.get_loc("matched")
openalex_uuid_col = data_openalex_2020_unmatched.columns.get_loc("uuid")
openalex_score_col = data_openalex_2020_unmatched.columns.get_loc("score")

for match in matches_2020:
    # Index of the first row carrying the matched ID in each frame. The values
    # are used positionally below.
    index_scopus = data_scopus_2020_unmatched.index[
        data_scopus_2020_unmatched["scopus_id"] == match["scopus_id"]
    ][0]
    index_openalex = data_openalex_2020_unmatched.index[
        data_openalex_2020_unmatched["openalex_id"] == match["openalex_id"]
    ][0]

    # flag both records as matched
    data_scopus_2020_unmatched.iloc[index_scopus, scopus_matched_col] = True
    data_openalex_2020_unmatched.iloc[index_openalex, openalex_matched_col] = True

    # link the OpenAlex record to its Scopus partner via the Scopus UUID
    uuid_scopus = data_scopus_2020_unmatched.iloc[index_scopus, scopus_uuid_col]
    data_openalex_2020_unmatched.iloc[index_openalex, openalex_uuid_col] = uuid_scopus

    # keep the matching score alongside the OpenAlex record
    data_openalex_2020_unmatched.iloc[index_openalex, openalex_score_col] = match[
        "score"
    ]

In [60]:
# Scopus records that are still unmatched after the score step.
int(data_scopus_2020_unmatched["matched"].eq(False).sum())

2214

In [61]:
# Scopus records that were matched in the score step.
int(data_scopus_2020_unmatched["matched"].eq(True).sum())

25

In [62]:
# OpenAlex records that were matched in the score step.
int(data_openalex_2020_unmatched["matched"].eq(True).sum())

25

In [63]:
# Number of records matched in the score step, per source.
data_scopus_2020_score_matched = data_scopus_2020_unmatched[
    data_scopus_2020_unmatched["matched"] == True
]
data_openalex_2020_score_matched = data_openalex_2020_unmatched[
    data_openalex_2020_unmatched["matched"] == True
]
num_scopus = data_scopus_2020_score_matched.shape[0]
num_openalex = data_openalex_2020_score_matched.shape[0]

print(num_scopus)
print(num_openalex)

if num_scopus == num_openalex:
    # These matches come from the score step, not from DOI comparison, so the
    # message must not call them matching DOIs.
    print(f"Es wurden {num_scopus} Übereinstimmungen anhand des Scores gefunden.")

25
25
Es wurden 25 übereinstimmende DOIs gefunden.


In [64]:
# Combine the results of both matching steps (DOI and score) per source.
data_scopus_2020_matched = pd.concat(
    [data_scopus_2020_doi_matched, data_scopus_2020_score_matched]
)
data_openalex_2020_matched = pd.concat(
    [data_openalex_2020_doi_matched, data_openalex_2020_score_matched]
)

# Join the two sources on the shared UUID, then tidy the duplicated columns:
# keep the Scopus DOI and the OpenAlex "matched" flag under their plain names
# and drop the helper columns (the UUID is not stored in the result).
data_2020_matched = (
    data_scopus_2020_matched.merge(data_openalex_2020_matched, on="uuid")
    .rename(columns={"doi_x": "doi", "matched_y": "matched"})
    .drop(columns=["uuid", "matched_x", "doi_y"])
)

In [65]:
# Inspect the column names of the merged (matched) result.
data_2020_matched.columns

Index(['scopus_id', 'doi', 'pii', 'pubmed_id', 'scopus_title', 'subtype',
       'subtypeDescription', 'creator', 'afid', 'affilname',
       'affiliation_city', 'affiliation_country', 'author_count',
       'author_names', 'author_ids', 'author_afids', 'coverDate',
       'coverDisplayDate', 'scopus_source_title', 'issn', 'source_id', 'eIssn',
       'aggregationType', 'scopus_volume', 'issueIdentifier', 'article_number',
       'pageRange', 'description', 'authkeywords', 'citedby_count',
       'openaccess', 'freetoread', 'freetoreadLabel', 'fund_acr', 'fund_no',
       'fund_sponsor', 'scopus_name_1st_author', 'scopus_publication_year',
       'openalex_id', 'openalex_title', 'type', 'publication_date',
       'openalex_publication_year', 'openalex_volume', 'issue', 'first_page',
       'last_page', 'authors', 'keywords', 'abstract', 'openalex_source_title',
       'source_issn', 'source_type', 'cited_by_count', 'referenced_works',
       'openalex_name_1st_author', 'matched', 'score'],
      dtype='object')

In [66]:
# Save the data: matched records plus the records found in only one source.
scopus_only_2020 = data_scopus_2020_unmatched.loc[
    data_scopus_2020_unmatched["matched"].eq(False)
]
openalex_only_2020 = data_openalex_2020_unmatched.loc[
    data_openalex_2020_unmatched["matched"].eq(False)
]

data_2020_matched.to_csv("../data/diff/data_2020_matched.csv")
scopus_only_2020.to_csv("../data/diff/data_2020_only_scopus.csv")
openalex_only_2020.to_csv("../data/diff/data_2020_only_openalex.csv")

In [67]:
# Inspect the column names of the Scopus frame used in the score step.
data_scopus_2020_unmatched.columns

Index(['scopus_id', 'doi', 'pii', 'pubmed_id', 'scopus_title', 'subtype',
       'subtypeDescription', 'creator', 'afid', 'affilname',
       'affiliation_city', 'affiliation_country', 'author_count',
       'author_names', 'author_ids', 'author_afids', 'coverDate',
       'coverDisplayDate', 'scopus_source_title', 'issn', 'source_id', 'eIssn',
       'aggregationType', 'scopus_volume', 'issueIdentifier', 'article_number',
       'pageRange', 'description', 'authkeywords', 'citedby_count',
       'openaccess', 'freetoread', 'freetoreadLabel', 'fund_acr', 'fund_no',
       'fund_sponsor', 'scopus_name_1st_author', 'scopus_publication_year',
       'uuid', 'matched'],
      dtype='object')

In [68]:
# Data for publication (DOI, Scopus EID and OpenAlex ID only).
df_2020_matched = data_2020_matched[
    [
        "doi",
        "scopus_id",
        "openalex_id",
    ]
]

# The "*_unmatched" frames still contain the records that were matched in the
# score step (flagged matched == True). Keep only the rows that remain
# unmatched; otherwise those records would be listed again as "only Scopus" and
# "only OpenAlex" in addition to their row in df_2020_matched.
df_2020_only_scopus = data_scopus_2020_unmatched.loc[
    ~data_scopus_2020_unmatched["matched"].astype(bool),
    [
        "doi",
        "scopus_id",
    ],
]
df_2020_only_openalex = data_openalex_2020_unmatched.loc[
    ~data_openalex_2020_unmatched["matched"].astype(bool),
    [
        "doi",
        "openalex_id",
    ],
]

df_2020_matched_all = pd.concat(
    [df_2020_matched, df_2020_only_scopus, df_2020_only_openalex]
)

In [69]:
# Sanity check: the combined frame has as many rows as its three parts together.
len(df_2020_matched_all) == sum(
    len(part)
    for part in (df_2020_matched, df_2020_only_scopus, df_2020_only_openalex)
)

True

In [None]:
# Preview the first ten rows of the combined publication table.
df_2020_matched_all.head(10)

In [71]:
# Save the combined table (matched records followed by the Scopus and OpenAlex
# parts) as CSV.
df_2020_matched_all.to_csv("../data/diff/data_2020_matching_results.csv")