# DBLP Scholar: Record Linkage attacks

This Notebook implements a few sample record linkage attacks and calculates the accuracy, precision and recall.

In [1]:
import recordlinkage
from recordlinkage.preprocessing import clean, phonetic
from recordlinkage.index import BlockIndex, SortedNeighbourhood
from recordlinkage import Compare
from recordlinkage.compare import String
from recordlinkage import KMeansClassifier
# import utility functions for dealing with datasets
from utils import read_data, clean_attributes

# set debug flag:
debug = True

## 1. Loading the data

The data is loaded from the filesystem

In [2]:
# read DBLP and Google Scholar dataset
dataDBLP, dataScholar, links = read_data(
    'DBLP1', 'Scholar', 'DBLP-Scholar_perfectMapping', debug)

In [3]:
# show the dataframes
if debug:
    display(dataDBLP)
    display(dataScholar)
    display(links)

Unnamed: 0,id,title,authors,venue,year
0,conf/vldb/RusinkiewiczKTWM95,Towards a Cooperative Transaction Model - The ...,"M Rusinkiewicz, W Klas, T Tesch, J W�sch, P Muth",VLDB,1995
1,journals/sigmod/EisenbergM02,SQL/XML is Making Good Progress,"A Eisenberg, J Melton",SIGMOD Record,2002
2,conf/vldb/AmmannJR95,Using Formal Methods to Reason about Semantics...,"P Ammann, S Jajodia, I Ray",VLDB,1995
3,journals/sigmod/Liu02,Editor's Notes,L Liu,SIGMOD Record,2002
4,journals/sigmod/Hammer02,Report on the ACM Fourth International Worksho...,,,2002
...,...,...,...,...,...
994,conf/sigmod/HaasH99,Ripple Joins for Online Aggregation,"P Haas, J Hellerstein",SIGMOD Conference,1999
995,journals/sigmod/GeppertD94,Constructing the Next 100 Database Management ...,"A Geppert, K Dittrich",SIGMOD Record,1994
996,conf/sigmod/AcharyaGPR99a,Join Synopses for Approximate Query Answering,"S Acharya, P Gibbons, V Poosala, S Ramaswamy",SIGMOD Conference,1999
997,journals/sigmod/Yang94,A Hypertext Query Language for Images,L Yang,SIGMOD Record,1994


Unnamed: 0,id,title,authors,venue,year
0,aKcZKwvwbQwJ,11578 Sorrento Valley Road,QD Inc,"San Diego,",
1,ixKfiTHoaDoJ,Initiation of crazes in polystyrene,"AS Argon, JG Hannoosh","Phil. Mag,",
2,3BxllB4wwcIJ,Immunogold labelling is a quantitative method ...,"GH Hansen, LL Wetterberg, H SjÃ¶strÃ¶m, O NorÃ©n","The Histochemical Journal,",1992.0
3,d2WWxwKMex4J,The Burden of Infectious Disease Among Inmates...,"TM Hammett, P Harmon, W Rhodes",see,
4,cZCX-AQpjccJ,The Role of Faculty Advising in Science and En...,JR Cogdell,"NEW DIRECTIONS FOR TEACHING AND LEARNING,",1995.0
...,...,...,...,...,...
994,81tq2S0IGYcJ,Have you hugged an editor today?,R Raskin,"PC Magazine,",
995,mzFzhewxu1UJ,Anti-self dual Lagrangians II: Unbounded non s...,"N Ghoussoub, L Tzou",,
996,tq4bpWYOcEEJ,Customer handling intermediate serverâ??an arc...,G Mathieson,"BT Technology Journal,",1997.0
997,SDS7uWO7Mj4J,Modell des langwelligen Strahlungsaustauschs u...,C Felsmann,"Technical University of Dresden,",


MultiIndex([(958, 295),
            (211, 202),
            (431, 735),
            (287, 533),
            (412, 843),
            (986, 907),
            (254, 850),
            (254, 808),
            ( 35, 729),
            (428, 166),
            (141, 370),
            (852, 406),
            (248, 191),
            (611, 625),
            (417, 435),
            (229, 811),
            (245,  25)],
           )

## 2. Cleaning and Pre-Processing

As a first step, the data is cleaned and pre-processed.

In [4]:
# cleaning: bring all to lowercase, remove unwanted tokens
dataDBLP = clean_attributes(dataDBLP, ['title', 'authors', 'venue'])
dataScholar = clean_attributes(dataScholar, ['title', 'authors', 'venue'])
# show the dataframes
if debug:
    display(dataDBLP)
    display(dataScholar)

Unnamed: 0,id,title,authors,venue,year,title_clean,authors_clean,venue_clean
0,conf/vldb/RusinkiewiczKTWM95,Towards a Cooperative Transaction Model - The ...,"M Rusinkiewicz, W Klas, T Tesch, J W�sch, P Muth",VLDB,1995,towards a cooperative transaction model the co...,m rusinkiewicz w klas t tesch j wsch p muth,vldb
1,journals/sigmod/EisenbergM02,SQL/XML is Making Good Progress,"A Eisenberg, J Melton",SIGMOD Record,2002,sqlxml is making good progress,a eisenberg j melton,sigmod record
2,conf/vldb/AmmannJR95,Using Formal Methods to Reason about Semantics...,"P Ammann, S Jajodia, I Ray",VLDB,1995,using formal methods to reason about semantics...,p ammann s jajodia i ray,vldb
3,journals/sigmod/Liu02,Editor's Notes,L Liu,SIGMOD Record,2002,editors notes,l liu,sigmod record
4,journals/sigmod/Hammer02,Report on the ACM Fourth International Worksho...,,,2002,report on the acm fourth international worksho...,,
...,...,...,...,...,...,...,...,...
994,conf/sigmod/HaasH99,Ripple Joins for Online Aggregation,"P Haas, J Hellerstein",SIGMOD Conference,1999,ripple joins for online aggregation,p haas j hellerstein,sigmod conference
995,journals/sigmod/GeppertD94,Constructing the Next 100 Database Management ...,"A Geppert, K Dittrich",SIGMOD Record,1994,constructing the next 100 database management ...,a geppert k dittrich,sigmod record
996,conf/sigmod/AcharyaGPR99a,Join Synopses for Approximate Query Answering,"S Acharya, P Gibbons, V Poosala, S Ramaswamy",SIGMOD Conference,1999,join synopses for approximate query answering,s acharya p gibbons v poosala s ramaswamy,sigmod conference
997,journals/sigmod/Yang94,A Hypertext Query Language for Images,L Yang,SIGMOD Record,1994,a hypertext query language for images,l yang,sigmod record


Unnamed: 0,id,title,authors,venue,year,title_clean,authors_clean,venue_clean
0,aKcZKwvwbQwJ,11578 Sorrento Valley Road,QD Inc,"San Diego,",,11578 sorrento valley road,qd inc,san diego
1,ixKfiTHoaDoJ,Initiation of crazes in polystyrene,"AS Argon, JG Hannoosh","Phil. Mag,",,initiation of crazes in polystyrene,as argon jg hannoosh,phil mag
2,3BxllB4wwcIJ,Immunogold labelling is a quantitative method ...,"GH Hansen, LL Wetterberg, H SjÃ¶strÃ¶m, O NorÃ©n","The Histochemical Journal,",1992.0,immunogold labelling is a quantitative method ...,gh hansen ll wetterberg h sjstrm o norn,the histochemical journal
3,d2WWxwKMex4J,The Burden of Infectious Disease Among Inmates...,"TM Hammett, P Harmon, W Rhodes",see,,the burden of infectious disease among inmates...,tm hammett p harmon w rhodes,see
4,cZCX-AQpjccJ,The Role of Faculty Advising in Science and En...,JR Cogdell,"NEW DIRECTIONS FOR TEACHING AND LEARNING,",1995.0,the role of faculty advising in science and en...,jr cogdell,new directions for teaching and learning
...,...,...,...,...,...,...,...,...
994,81tq2S0IGYcJ,Have you hugged an editor today?,R Raskin,"PC Magazine,",,have you hugged an editor today,r raskin,pc magazine
995,mzFzhewxu1UJ,Anti-self dual Lagrangians II: Unbounded non s...,"N Ghoussoub, L Tzou",,,anti self dual lagrangians ii unbounded non se...,n ghoussoub l tzou,
996,tq4bpWYOcEEJ,Customer handling intermediate serverâ??an arc...,G Mathieson,"BT Technology Journal,",1997.0,customer handling intermediate serveran archit...,g mathieson,bt technology journal
997,SDS7uWO7Mj4J,Modell des langwelligen Strahlungsaustauschs u...,C Felsmann,"Technical University of Dresden,",,modell des langwelligen strahlungsaustauschs u...,c felsmann,technical university of dresden


## 3. Indexing

In the indexing step, the candidates for matches are identified.
One option is a "FullIndex", where each record is compared against each other record.
This can however take quite some time, therefore a "BlockIndex" or a "SortedNeighborhood"
are recommended, against an attribute which promises extremely high recall in finding
matches.

In [5]:
# using indexer on 'year'
indexer = SortedNeighbourhood('year')
pairs = indexer.index(dataDBLP, dataScholar)
if debug:
    print(f"Number of candidates: {len(pairs)}")

Number of candidates: 74045


## 4. Comparing
In the comparing step, the pairs are compared against matching attributes.

Especially for the full dataset this takes some significant time (also depending on the index)

In [6]:
comp = Compare()
comp.add(String('title_clean', 'title_clean'))
comp.add(String('authors_clean', 'authors_clean'))
comp.add(String('venue_clean', 'venue_clean'))
result = comp.compute(pairs, dataDBLP, dataScholar)

## 5. Classifying
Now we have for each pair a set of features, specifying where and how good they
match.
In this step we want to classify pairs into either match or non-match.

In [7]:
# simple classifier: add the values and use a threshold of 2
matches = result[result[0]+result[1]+result[2]>2].index

In [8]:
if debug:
    display(matches)

MultiIndex([(852, 406),
            ( 35, 729),
            (229, 811),
            (245,  25),
            (141, 370),
            (412, 843),
            (417, 435),
            (958, 295),
            (986, 907)],
           )

## 6. Evaluation

We use again the recordlinkage package for calculating evaluation values of the results.

In [9]:
precision = recordlinkage.precision(links, matches)
recall = recordlinkage.recall(links, matches)
accuracy = recordlinkage.accuracy(links, matches)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Accuracy: {accuracy}")
display(recordlinkage.confusion_matrix(links, matches))

TypeError: int() argument must be a string, a bytes-like object or a number, not 'NoneType'

In [8]:
display(result)

Unnamed: 0,level_0,level_1,title_matched,author_matched
0,0,4,0.0,0.0
1,0,15,0.0,0.0
2,0,75,0.0,0.0
3,0,99,0.0,0.0
4,0,141,0.0,0.0
...,...,...,...,...
23848,767,835,0.0,0.0
23849,767,840,0.0,0.0
23850,767,908,0.0,0.0
23851,767,912,0.0,0.0
