In [2]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

In [16]:

# import csvs
dblp = pd.DataFrame()
for i in range(1, 5):
    csv = pd.read_csv(f'../application/data/dblp-{i}.csv')
    dblp = pd.concat([dblp, csv]).reset_index(drop=True)

test_hidden = pd.read_csv('../application/data/test_hidden.csv')
train = pd.read_csv('../application/data/train.csv')
validation_hidden = pd.read_csv('../application/data/validation_hidden.csv')

## Making Predictions

We take out partition 8 to have a validation set with labels

In [17]:
train_data = train[train['partition']!=8]
val_data = train[train['partition']==8]

We swap the title and author when the title contains '|' or is too short

In [39]:
text_features = dblp[['pkey','pauthor', 'ptitle', 'pyear']]
mask = text_features['ptitle'].str.contains('|') |((text_features['pauthor'].str.contains('|')==False)
                         &(text_features['pauthor'].str.len()>text_features['ptitle'].str.len()))
text_features.loc[mask,['ptitle','pauthor']] = text_features.loc[mask,['pauthor','ptitle']].values
full_train = pd.merge(train, text_features, left_on='key1', right_on='pkey', suffixes=('', '_x'))
full_train = pd.merge(full_train, text_features, left_on='key2', right_on='pkey', suffixes=('_x', '_y'))


We use Jacard similarity to compare the titles and authors

In [40]:
def get_jaccard_sim(str1, str2): 
    a = set(str1.replace('|',' ').split()) 
    b = set(str2.replace('|',' ').split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))



In [41]:
full_train['similarity'] = full_train.apply(lambda x: get_jaccard_sim(x['pauthor_x'], x['pauthor_y']), axis=1)
full_train['similarity_title'] = full_train.apply(lambda x: get_jaccard_sim(x['ptitle_x'], x['ptitle_y']), axis=1)

In [42]:
prediction = full_train['similarity'] + full_train['similarity_title'] > 0.2

In [43]:
from sklearn.metrics import accuracy_score
accuracy_score(prediction, full_train['label'])

0.7348218765679879

In [34]:
full_train[full_train['label']==True].tail(20)


Unnamed: 0.1,Unnamed: 0,key1,key2,label,partition,pkey_x,pauthor_x,ptitle_x,pyear_x,pkey_y,pauthor_y,ptitle_y,pyear_y,similarity,similarity_title
7952,9974,journals/nar/KogelnikLBNW96,journals/nar/KogelnikLBNW98,True,7,journals/nar/KogelnikLBNW96,Andreas M. Kogelnik|Marie T. Lott|Michael D. B...,MITOMAP: a human mitochondrial genome database.,-1996,journals/nar/KogelnikLBNW98,Andreas M. Kogelnik|Marie T. Lott|Michael D. B...,MITOMAP: a human mitochondrial genome database...,-1998,1.0,0.625
7953,9975,conf/escape/ShiY07,journals/tcs/ShiY08,True,3,conf/escape/ShiY07,On-Line Bin Packing with Arbitrary Release Times.,Yongqiang Shi|Deshi Ye,2007,journals/tcs/ShiY08,Online bin packing with arbitrary release times.,Yongqiang Shi|Deshi Ye,2008,0.076923,1.0
7954,9976,journals/soco/ChovanecNS10,journals/soco/ChovanecD03,True,2,journals/soco/ChovanecNS10,Ferdinand Chovanec|Olga Nánásiová|Alexander P....,Preface of the guest editors.,-2010,journals/soco/ChovanecD03,Ferdinand Chovanec|Anatolij Dvurecenskij,Preface of the Guest Editors.,-2003,0.222222,0.428571
7955,9977,conf/podc/FernandezR07,conf/opodis/AntaR07,True,4,conf/podc/FernandezR07,Antonio Fernández|Michel Raynal,From an intermittent rotating star to a leader.,2007,conf/opodis/AntaR07,Antonio Fernández Anta|Michel Raynal,From an Intermittent Rotating Star to a Leader.,2007,0.8,0.333333
7956,9978,journals/ijcia/ChiaT01,conf/ijcai/TanC01,True,3,journals/ijcia/ChiaT01,Henry Wai Kit Chia|Chew Lim Tan,Neural Logic Network Learning Using Genetic Pr...,2001,conf/ijcai/TanC01,Chew Lim Tan|Henry Wai Kit Chia,Neural Logic Network Learning using Genetic Pr...,2001,1.0,0.75
7957,9979,journals/monet/MerinoMSSK05,conf/wmash/MatsunagaMSK03,True,7,journals/monet/MerinoMSSK05,Ana Sanz Merino|Yasuhiko Matsunaga|Manish Shah...,Secure Authentication System for Public WLAN R...,-2005,conf/wmash/MatsunagaMSK03,Yasuhiko Matsunaga|Ana Sanz Merino|Takashi Suz...,Secure authentication system for public WLAN r...,-2003,0.833333,0.272727
7958,9982,journals/corr/cmp-lg-9706020,journals/jair/WiebeOOM98,True,6,journals/corr/cmp-lg-9706020,An Empirical Approach to Temporal Reference Re...,Janyce Wiebe|Thomas P. O'Hara|Kenneth J. McKee...,1997,journals/jair/WiebeOOM98,An Empirical Approach to Temporal Reference Re...,Janyce Wiebe|Thomas P. O'Hara|Thorsten Öhrströ...,1998,0.75,1.0
7959,9983,conf/icra/ChakrabortyPAM06,journals/trob/ChakrabortyPAM08,True,3,conf/icra/ChakrabortyPAM06,Nilanjan Chakraborty|Jufeng Peng|Srinivas Akel...,Proximity Queries between Convex Objects: an I...,-2006,journals/trob/ChakrabortyPAM08,Nilanjan Chakraborty|Jufeng Peng|Srinivas Akel...,Proximity Queries Between Convex Objects: An I...,-2008,0.7,0.714286
7960,9984,conf/cdc/ZymnisBG08,conf/icarcv/ZymnisBG08,True,6,conf/cdc/ZymnisBG08,Argyrios Zymnis|Stephen P. Boyd|Dimitry M. Gor...,Mixed state estimation for a linear Gaussian M...,2008,conf/icarcv/ZymnisBG08,Argyris Zymnis|Stephen P. Boyd|Dimitry M. Gori...,Mixed state estimation for a linear Gaussian M...,2008,0.777778,1.0
7961,9986,journals/tip/MatungkaZE09,conf/icip/MatungkaZE08,True,7,journals/tip/MatungkaZE09,Rittavee Matungka|Yuan F. Zheng|Robert L. Ewing,Image Registration Using Adaptive Polar Transf...,-2009,conf/icip/MatungkaZE08,Rittavee Matungka|Yuan F. Zheng|Robert L. Ewing,Image registration using Adaptive Polar Transf...,-2008,1.0,0.5


In [44]:
text_features.head(20)

Unnamed: 0,pkey,pauthor,ptitle,pyear
0,conf/dft/SemiaoRVSTT07,Improving the Tolerance of Pipeline Based Circ...,Jorge Semião|Juan J. Rodríguez-Andina|Fabian V...,-2007
1,conf/dagstuhl/Caire07,A Normative Multi-Agent Systems Approach to th...,Patrice Caire,-2007
2,journals/tit/BT07,Self-Orthogonality of q-Ary Images of qm-Ary C...,Sundeep B|Andrew Thangaraj,2007
3,conf/icdcsw/Pardo-Castellote03,OMG Data-Distribution Service: Architectural O...,Gerardo Pardo-Castellote,-2003
4,journals/corr/abs-0911-4329,Structural Consistency: Enabling XML Keyword S...,Ki-Hoon Lee|Kyu-Young Whang|Wook-Shin Han|Min-...,2009
5,conf/cdc/LiuHT09,Input-to-state stability for a class of hybrid...,Bin Liu|David J. Hill|Kok Lay Teo,-2009
6,journals/mcs/SunKZC03,Haiwei Sun|Ning Kang|Jun Zhang|Eric S. Carlson,A fourth-order compact difference scheme on fa...,-2003
7,conf/icic/LeeKKK06,Performance Improvement of Intelligent UWB-IR ...,Sang-Heon Lee|Nam-Sung Kim|Heau-Jo Kang|Soon-G...,-2006
8,conf/icns/Capo-ChichiGF09,IEEE 802.15.4 Performance on a Hierarchical Hy...,Eugene Pamba Capo-Chichi|Hervé Guyennet|Jean-M...,-2009
9,journals/jche/WangH09,Scaffolding preservice teachers' WebQuest desi...,Feng Wang|Michael J. Hannafin,-2009


In [38]:
dblp[['pauthor', 'ptitle']].head(20)

Unnamed: 0,pauthor,ptitle
0,Jorge Semião|Juan J. Rodríguez-Andina|Fabian V...,Improving the Tolerance of Pipeline Based Circ...
1,Patrice Caire,A Normative Multi-Agent Systems Approach to th...
2,Sundeep B|Andrew Thangaraj,Self-Orthogonality of q-Ary Images of qm-Ary C...
3,Gerardo Pardo-Castellote,OMG Data-Distribution Service: Architectural O...
4,Ki-Hoon Lee|Kyu-Young Whang|Wook-Shin Han|Min-...,Structural Consistency: Enabling XML Keyword S...
5,Bin Liu|David J. Hill|Kok Lay Teo,Input-to-state stability for a class of hybrid...
6,A fourth-order compact difference scheme on fa...,Haiwei Sun|Ning Kang|Jun Zhang|Eric S. Carlson
7,Sang-Heon Lee|Nam-Sung Kim|Heau-Jo Kang|Soon-G...,Performance Improvement of Intelligent UWB-IR ...
8,Eugene Pamba Capo-Chichi|Hervé Guyennet|Jean-M...,IEEE 802.15.4 Performance on a Hierarchical Hy...
9,Feng Wang|Michael J. Hannafin,Scaffolding preservice teachers' WebQuest desi...
