In [25]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
# import csvs
dblp = pd.DataFrame()
for i in range(1, 5):
    csv = pd.read_csv(f'../application/data/dblp-{i}.csv')
    dblp = pd.concat([dblp, csv]).reset_index(drop=True)

test_hidden = pd.read_csv('../application/data/test_hidden.csv')
train = pd.read_csv('../application/data/train.csv')
validation_hidden = pd.read_csv('../application/data/validation_hidden.csv')

## Making Predictions

We take out partition 8 to have a validation set with labels

In [27]:
train_data = train[train['partition']!=8]
val_data = train[train['partition']==8]

In [28]:

from googletrans import Translator
translator = Translator()
def translate(c):
  return translator.translate(c, dest='en').text

We swap the title and author when the title contains '|' or is too short

In [29]:
text_features = dblp[['pkey','pauthor', 'ptitle', 'pyear']]
mask = text_features['ptitle'].str.contains('|', regex=False) | ((~text_features['pauthor'].str.contains('|', regex=False))
                                                    & (text_features['pauthor'].str.len() > text_features['ptitle'].str.len()))
text_features.loc[mask, ['ptitle', 'pauthor']] = text_features.loc[mask, ['pauthor','ptitle']].values
full_train = pd.merge(train, text_features, left_on='key1', right_on='pkey', suffixes=('', '_x'))
full_train = pd.merge(full_train, text_features, left_on='key2', right_on='pkey', suffixes=('_x', '_y'))


We use Jacard similarity to compare the titles and authors

In [30]:
def get_jaccard_sim(str1, str2): 
    a = set(str1.replace('|',' ').replace('-', ' ').split()) 
    b = set(str2.replace('|',' ').replace('-', ' ').split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))



In [31]:
full_train['similarity'] = full_train.apply(lambda x: get_jaccard_sim(x['pauthor_x'], x['pauthor_y']), axis=1)
full_train['similarity_title'] = full_train.apply(lambda x: get_jaccard_sim(x['ptitle_x'], x['ptitle_y']), axis=1)

In [32]:
full_train

Unnamed: 0.1,Unnamed: 0,key1,key2,label,partition,pkey_x,pauthor_x,ptitle_x,pyear_x,pkey_y,pauthor_y,ptitle_y,pyear_y,similarity,similarity_title
0,0,conf/prib/AhmedF07,journals/jcc/PatraS09,False,7,conf/prib/AhmedF07,Said Hassan Ahmed|Tor Flå,Estimation of Evolutionary Average Hydrophobic...,2007,journals/jcc/PatraS09,Jagdish Chandra Patra|Onkar Singh,Artificial neural networks-based approach to d...,2009,0.000000,0.000000
1,1,conf/vlsid/ChenCC95,journals/tcad/LuoCWCCW08,True,4,conf/vlsid/ChenCC95,Yung-Yuan Chen|Ching-Hwa Cheng|Jwu-E Chen,An efficient switching network fault diagnosis...,1995,journals/tcad/LuoCWCCW08,Pei-Wen Luo|Jwu-E Chen|Chin-Long Wey|Liang-Chi...,Impact of Capacitance Correlation on Yield Enh...,2008,0.263158,0.000000
2,2,conf/prozess/Sun88,conf/isnn/SunZLCS07,True,8,conf/prozess/Sun88,Z. Sun,Anwendung graphischer Darstellungen im Rahmen ...,-1988,conf/isnn/SunZLCS07,Z. Sun|M. J. Zhang|Xiao H. Liao|Wenchuan Cai|Y...,Neuro-Adaptive Formation Control of Multi-Mobi...,-2007,0.166667,0.000000
3,3,conf/pricai/BeaumontTSM04,conf/icip/SattarAS08,False,5,conf/pricai/BeaumontTSM04,Matthew Beaumont|John Thornton|Abdul Sattar|Mi...,Solving Over-Constrained Temporal Reasoning Pr...,2004,conf/icip/SattarAS08,Abdul Sattar 0003|Yasser Aidarous|Renaud Séguier,GAGM-AAM: A genetic optimization with Gaussian...,2008,0.142857,0.000000
4,774,conf/aiide/BourneS05,conf/icip/SattarAS08,False,1,conf/aiide/BourneS05,Owen Bourne|Abdul Sattar,Applying Constraint Weighting to Autonomous Ca...,2005,conf/icip/SattarAS08,Abdul Sattar 0003|Yasser Aidarous|Renaud Séguier,GAGM-AAM: A genetic optimization with Gaussian...,2008,0.222222,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7967,9993,journals/corr/abs-1011-1127,journals/corr/abs-1011-1119,True,4,journals/corr/abs-1011-1127,Oleg Chertov|Dan Tavrov,Group Anonymity: Problems and Solutions,2010,journals/corr/abs-1011-1119,Oleg Chertov|Dan Tavrov,Group Anonymity,2010,1.000000,0.166667
7968,9994,journals/corr/abs-cs-0607015,journals/isci/Valle-LisboaM07,True,5,journals/corr/abs-cs-0607015,Juan C. Valle-Lisboa|Eduardo Mizraji,The uncovering of hidden structures by Latent ...,2006,journals/isci/Valle-LisboaM07,Juan C. Valle-Lisboa|Eduardo Mizraji,The uncovering of hidden structures by Latent ...,2007,1.000000,0.800000
7969,9995,conf/splst/KalmanHG03,journals/infsof/KalmanHG06,True,7,conf/splst/KalmanHG03,Miklós Kálmán|Ferenc Havasi|Tibor Gyimóthy,Compacting XML Documents.,2003,journals/infsof/KalmanHG06,Miklós Kálmán|Ferenc Havasi|Tibor Gyimóthy,Compacting XML documents.,2006,1.000000,0.500000
7970,9997,conf/esws/DavidK0RZZ10,journals/corr/abs-1004-3390,True,3,conf/esws/DavidK0RZZ10,Catalin David|Michael Kohlhase|Christoph Lange...,Publishing Math Lecture Notes as Linked Data.,2010,journals/corr/abs-1004-3390,Catalin David|Michael Kohlhase|Christoph Lange...,Publishing Math Lecture Notes as Linked Data,2010,1.000000,0.750000


In [33]:
prediction = full_train['similarity'] + full_train['similarity_title'] > 0.2

In [23]:
from sklearn.metrics import accuracy_score
half = int(len(prediction)/2)
accuracy_score(prediction, full_train['label'])

0.7438534872052183

In [11]:
full_train.iloc[7968]


Unnamed: 0                                                       9994
key1                                     journals/corr/abs-cs-0607015
key2                                    journals/isci/Valle-LisboaM07
label                                                            True
partition                                                           5
pkey_x                                   journals/corr/abs-cs-0607015
pauthor_x                        Juan C. Valle-Lisboa|Eduardo Mizraji
ptitle_x            The uncovering of hidden structures by Latent ...
pyear_x                                                          2006
pkey_y                                  journals/isci/Valle-LisboaM07
pauthor_y                        Juan C. Valle-Lisboa|Eduardo Mizraji
ptitle_y            The uncovering of hidden structures by Latent ...
pyear_y                                                          2007
similarity                                                        1.0
similarity_title    

In [12]:
text_features.head(50)

Unnamed: 0,pkey,pauthor,ptitle,pyear
0,conf/dft/SemiaoRVSTT07,Jorge Semião|Juan J. Rodríguez-Andina|Fabian V...,Improving the Tolerance of Pipeline Based Circ...,-2007
1,conf/dagstuhl/Caire07,Patrice Caire,A Normative Multi-Agent Systems Approach to th...,-2007
2,journals/tit/BT07,Sundeep B|Andrew Thangaraj,Self-Orthogonality of q-Ary Images of qm-Ary C...,2007
3,conf/icdcsw/Pardo-Castellote03,Gerardo Pardo-Castellote,OMG Data-Distribution Service: Architectural O...,-2003
4,journals/corr/abs-0911-4329,Ki-Hoon Lee|Kyu-Young Whang|Wook-Shin Han|Min-...,Structural Consistency: Enabling XML Keyword S...,2009
5,conf/cdc/LiuHT09,Bin Liu|David J. Hill|Kok Lay Teo,Input-to-state stability for a class of hybrid...,-2009
6,journals/mcs/SunKZC03,Haiwei Sun|Ning Kang|Jun Zhang|Eric S. Carlson,A fourth-order compact difference scheme on fa...,-2003
7,conf/icic/LeeKKK06,Sang-Heon Lee|Nam-Sung Kim|Heau-Jo Kang|Soon-G...,Performance Improvement of Intelligent UWB-IR ...,-2006
8,conf/icns/Capo-ChichiGF09,Eugene Pamba Capo-Chichi|Hervé Guyennet|Jean-M...,IEEE 802.15.4 Performance on a Hierarchical Hy...,-2009
9,journals/jche/WangH09,Feng Wang|Michael J. Hannafin,Scaffolding preservice teachers' WebQuest desi...,-2009


In [13]:
dblp[['pauthor', 'ptitle']].head(20)

Unnamed: 0,pauthor,ptitle
0,Jorge Semião|Juan J. Rodríguez-Andina|Fabian V...,Improving the Tolerance of Pipeline Based Circ...
1,Patrice Caire,A Normative Multi-Agent Systems Approach to th...
2,Sundeep B|Andrew Thangaraj,Self-Orthogonality of q-Ary Images of qm-Ary C...
3,Gerardo Pardo-Castellote,OMG Data-Distribution Service: Architectural O...
4,Ki-Hoon Lee|Kyu-Young Whang|Wook-Shin Han|Min-...,Structural Consistency: Enabling XML Keyword S...
5,Bin Liu|David J. Hill|Kok Lay Teo,Input-to-state stability for a class of hybrid...
6,A fourth-order compact difference scheme on fa...,Haiwei Sun|Ning Kang|Jun Zhang|Eric S. Carlson
7,Sang-Heon Lee|Nam-Sung Kim|Heau-Jo Kang|Soon-G...,Performance Improvement of Intelligent UWB-IR ...
8,Eugene Pamba Capo-Chichi|Hervé Guyennet|Jean-M...,IEEE 802.15.4 Performance on a Hierarchical Hy...
9,Feng Wang|Michael J. Hannafin,Scaffolding preservice teachers' WebQuest desi...


In [24]:
db = pd.read_csv('data/db/db.csv')

ParserError: Error tokenizing data. C error: Expected 1 fields in line 35, saw 3


In [15]:
text_features.head(60)

Unnamed: 0,pkey,pauthor,ptitle,pyear
0,conf/dft/SemiaoRVSTT07,Jorge Semião|Juan J. Rodríguez-Andina|Fabian V...,Improving the Tolerance of Pipeline Based Circ...,-2007
1,conf/dagstuhl/Caire07,Patrice Caire,A Normative Multi-Agent Systems Approach to th...,-2007
2,journals/tit/BT07,Sundeep B|Andrew Thangaraj,Self-Orthogonality of q-Ary Images of qm-Ary C...,2007
3,conf/icdcsw/Pardo-Castellote03,Gerardo Pardo-Castellote,OMG Data-Distribution Service: Architectural O...,-2003
4,journals/corr/abs-0911-4329,Ki-Hoon Lee|Kyu-Young Whang|Wook-Shin Han|Min-...,Structural Consistency: Enabling XML Keyword S...,2009
5,conf/cdc/LiuHT09,Bin Liu|David J. Hill|Kok Lay Teo,Input-to-state stability for a class of hybrid...,-2009
6,journals/mcs/SunKZC03,Haiwei Sun|Ning Kang|Jun Zhang|Eric S. Carlson,A fourth-order compact difference scheme on fa...,-2003
7,conf/icic/LeeKKK06,Sang-Heon Lee|Nam-Sung Kim|Heau-Jo Kang|Soon-G...,Performance Improvement of Intelligent UWB-IR ...,-2006
8,conf/icns/Capo-ChichiGF09,Eugene Pamba Capo-Chichi|Hervé Guyennet|Jean-M...,IEEE 802.15.4 Performance on a Hierarchical Hy...,-2009
9,journals/jche/WangH09,Feng Wang|Michael J. Hannafin,Scaffolding preservice teachers' WebQuest desi...,-2009


In [16]:
val = pd.read_csv('data/validation_hidden.csv')

In [17]:
val[val['key1']]

KeyError: "None of [Index(['conf/icdim/KebailiA07', 'conf/spaa/AndersonBCKSSSTWZ05',\n       'journals/comgeo/MustafaR10', 'journals/tog/RenWSZLSSBPG06',\n       'conf/lctrts/TanM05', 'conf/webi/RamanujamGKST09', 'series/asc/Kim07',\n       'conf/advis/KimKYKO04', 'journals/tip/JiangJM06',\n       'journals/jcisd/MattioniKJCDP03',\n       ...\n       'journals/ai/KellyP10', 'conf/hicss/HainesL04', 'journals/toms/Hill81c',\n       'conf/vlsid/ZhouLW07', 'conf/csreaESA/WarrenAWH08', 'conf/csse/CuiWC08',\n       'conf/micro/EssinkADGKV91', 'conf/agp/PuigsegurA98',\n       'journals/vldb/MehtaGPN98', 'journals/toms/Hill81b'],\n      dtype='object', length=994)] are in the [columns]"