In [33]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

In [34]:
# import csvs
dblp = pd.DataFrame()
for i in range(1, 5):
    csv = pd.read_csv(f'../application/data/dblp-{i}.csv')
    dblp = pd.concat([dblp, csv]).reset_index(drop=True)

test_hidden = pd.read_csv('../application/data/test_hidden.csv')
train = pd.read_csv('../application/data/train.csv')
validation_hidden = pd.read_csv('../application/data/validation_hidden.csv')

## Making Predictions

We take out partition 8 to have a validation set with labels

In [35]:
train_data = train[train['partition']!=8]
val_data = train[train['partition']==8]

We swap the title and author when the title contains '|' or is too short

In [46]:
text_features = dblp[['pkey','pauthor', 'ptitle', 'pyear']]
mask = text_features['ptitle'].str.contains('|', regex=False) | ((~text_features['pauthor'].str.contains('|', regex=False))
                                                    & (text_features['pauthor'].str.len() > text_features['ptitle'].str.len()))
text_features.loc[mask, ['ptitle', 'pauthor']] = text_features.loc[mask, ['pauthor','ptitle']].values
full_train = pd.merge(train, text_features, left_on='key1', right_on='pkey', suffixes=('', '_x'))
full_train = pd.merge(full_train, text_features, left_on='key2', right_on='pkey', suffixes=('_x', '_y'))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


We use Jacard similarity to compare the titles and authors

In [47]:
def get_jaccard_sim(str1, str2): 
    a = set(str1.replace('|',' ').split()) 
    b = set(str2.replace('|',' ').split())
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))



In [48]:
full_train['similarity'] = full_train.apply(lambda x: get_jaccard_sim(x['pauthor_x'], x['pauthor_y']), axis=1)
full_train['similarity_title'] = full_train.apply(lambda x: get_jaccard_sim(x['ptitle_x'], x['ptitle_y']), axis=1)

In [65]:
full_train

Unnamed: 0.1,Unnamed: 0,key1,key2,label,partition,pkey_x,pauthor_x,ptitle_x,pyear_x,pkey_y,pauthor_y,ptitle_y,pyear_y,similarity,similarity_title
0,0,conf/prib/AhmedF07,journals/jcc/PatraS09,False,7,conf/prib/AhmedF07,Said Hassan Ahmed|Tor Flå,Estimation of Evolutionary Average Hydrophobic...,2007,journals/jcc/PatraS09,Jagdish Chandra Patra|Onkar Singh,Artificial neural networks-based approach to d...,2009,0.000000,0.000000
1,1,conf/vlsid/ChenCC95,journals/tcad/LuoCWCCW08,True,4,conf/vlsid/ChenCC95,Yung-Yuan Chen|Ching-Hwa Cheng|Jwu-E Chen,An efficient switching network fault diagnosis...,1995,journals/tcad/LuoCWCCW08,Pei-Wen Luo|Jwu-E Chen|Chin-Long Wey|Liang-Chi...,Impact of Capacitance Correlation on Yield Enh...,2008,0.230769,0.000000
2,2,conf/prozess/Sun88,conf/isnn/SunZLCS07,True,8,conf/prozess/Sun88,Z. Sun,Anwendung graphischer Darstellungen im Rahmen ...,-1988,conf/isnn/SunZLCS07,Z. Sun|M. J. Zhang|Xiao H. Liao|Wenchuan Cai|Y...,Neuro-Adaptive Formation Control of Multi-Mobi...,-2007,0.166667,0.000000
3,3,conf/pricai/BeaumontTSM04,conf/icip/SattarAS08,False,5,conf/pricai/BeaumontTSM04,Matthew Beaumont|John Thornton|Abdul Sattar|Mi...,Solving Over-Constrained Temporal Reasoning Pr...,2004,conf/icip/SattarAS08,Abdul Sattar 0003|Yasser Aidarous|Renaud Séguier,GAGM-AAM: A genetic optimization with Gaussian...,2008,0.142857,0.000000
4,774,conf/aiide/BourneS05,conf/icip/SattarAS08,False,1,conf/aiide/BourneS05,Owen Bourne|Abdul Sattar,Applying Constraint Weighting to Autonomous Ca...,2005,conf/icip/SattarAS08,Abdul Sattar 0003|Yasser Aidarous|Renaud Séguier,GAGM-AAM: A genetic optimization with Gaussian...,2008,0.222222,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7967,9993,journals/corr/abs-1011-1127,journals/corr/abs-1011-1119,True,4,journals/corr/abs-1011-1127,Oleg Chertov|Dan Tavrov,Group Anonymity: Problems and Solutions,2010,journals/corr/abs-1011-1119,Oleg Chertov|Dan Tavrov,Group Anonymity,2010,1.000000,0.166667
7968,9994,journals/corr/abs-cs-0607015,journals/isci/Valle-LisboaM07,True,5,journals/corr/abs-cs-0607015,Juan C. Valle-Lisboa|Eduardo Mizraji,The uncovering of hidden structures by Latent ...,2006,journals/isci/Valle-LisboaM07,Juan C. Valle-Lisboa|Eduardo Mizraji,The uncovering of hidden structures by Latent ...,2007,1.000000,0.800000
7969,9995,conf/splst/KalmanHG03,journals/infsof/KalmanHG06,True,7,conf/splst/KalmanHG03,Miklós Kálmán|Ferenc Havasi|Tibor Gyimóthy,Compacting XML Documents.,2003,journals/infsof/KalmanHG06,Miklós Kálmán|Ferenc Havasi|Tibor Gyimóthy,Compacting XML documents.,2006,1.000000,0.500000
7970,9997,conf/esws/DavidK0RZZ10,journals/corr/abs-1004-3390,True,3,conf/esws/DavidK0RZZ10,Catalin David|Michael Kohlhase|Christoph Lange...,Publishing Math Lecture Notes as Linked Data.,2010,journals/corr/abs-1004-3390,Catalin David|Michael Kohlhase|Christoph Lange...,Publishing Math Lecture Notes as Linked Data,2010,1.000000,0.750000


In [51]:
prediction = full_train['similarity'] + full_train['similarity_title'] > 0.2

In [52]:
from sklearn.metrics import accuracy_score
accuracy_score(prediction, full_train['label'])

0.7350727546412443

In [9]:
full_train[full_train['label']==True].tail(20)


Unnamed: 0.1,Unnamed: 0,key1,key2,label,partition,pkey_x,pauthor_x,ptitle_x,pyear_x,pkey_y,pauthor_y,ptitle_y,pyear_y,similarity,similarity_title
7952,9974,journals/nar/KogelnikLBNW96,journals/nar/KogelnikLBNW98,True,7,journals/nar/KogelnikLBNW96,MITOMAP: a human mitochondrial genome database.,Andreas M. Kogelnik|Marie T. Lott|Michael D. B...,-1996,journals/nar/KogelnikLBNW98,MITOMAP: a human mitochondrial genome database...,Andreas M. Kogelnik|Marie T. Lott|Michael D. B...,-1998,0.625,1.0
7953,9975,conf/escape/ShiY07,journals/tcs/ShiY08,True,3,conf/escape/ShiY07,Yongqiang Shi|Deshi Ye,On-Line Bin Packing with Arbitrary Release Times.,2007,journals/tcs/ShiY08,Yongqiang Shi|Deshi Ye,Online bin packing with arbitrary release times.,2008,1.0,0.076923
7954,9976,journals/soco/ChovanecNS10,journals/soco/ChovanecD03,True,2,journals/soco/ChovanecNS10,Preface of the guest editors.,Ferdinand Chovanec|Olga Nánásiová|Alexander P....,-2010,journals/soco/ChovanecD03,Preface of the Guest Editors.,Ferdinand Chovanec|Anatolij Dvurecenskij,-2003,0.428571,0.222222
7955,9977,conf/podc/FernandezR07,conf/opodis/AntaR07,True,4,conf/podc/FernandezR07,From an intermittent rotating star to a leader.,Antonio Fernández|Michel Raynal,2007,conf/opodis/AntaR07,From an Intermittent Rotating Star to a Leader.,Antonio Fernández Anta|Michel Raynal,2007,0.333333,0.8
7956,9978,journals/ijcia/ChiaT01,conf/ijcai/TanC01,True,3,journals/ijcia/ChiaT01,Neural Logic Network Learning Using Genetic Pr...,Henry Wai Kit Chia|Chew Lim Tan,2001,conf/ijcai/TanC01,Neural Logic Network Learning using Genetic Pr...,Chew Lim Tan|Henry Wai Kit Chia,2001,0.75,1.0
7957,9979,journals/monet/MerinoMSSK05,conf/wmash/MatsunagaMSK03,True,7,journals/monet/MerinoMSSK05,Secure Authentication System for Public WLAN R...,Ana Sanz Merino|Yasuhiko Matsunaga|Manish Shah...,-2005,conf/wmash/MatsunagaMSK03,Secure authentication system for public WLAN r...,Yasuhiko Matsunaga|Ana Sanz Merino|Takashi Suz...,-2003,0.272727,0.833333
7958,9982,journals/corr/cmp-lg-9706020,journals/jair/WiebeOOM98,True,6,journals/corr/cmp-lg-9706020,Janyce Wiebe|Thomas P. O'Hara|Kenneth J. McKee...,An Empirical Approach to Temporal Reference Re...,1997,journals/jair/WiebeOOM98,Janyce Wiebe|Thomas P. O'Hara|Thorsten Öhrströ...,An Empirical Approach to Temporal Reference Re...,1998,1.0,0.75
7959,9983,conf/icra/ChakrabortyPAM06,journals/trob/ChakrabortyPAM08,True,3,conf/icra/ChakrabortyPAM06,Proximity Queries between Convex Objects: an I...,Nilanjan Chakraborty|Jufeng Peng|Srinivas Akel...,-2006,journals/trob/ChakrabortyPAM08,Proximity Queries Between Convex Objects: An I...,Nilanjan Chakraborty|Jufeng Peng|Srinivas Akel...,-2008,0.714286,0.7
7960,9984,conf/cdc/ZymnisBG08,conf/icarcv/ZymnisBG08,True,6,conf/cdc/ZymnisBG08,Mixed state estimation for a linear Gaussian M...,Argyrios Zymnis|Stephen P. Boyd|Dimitry M. Gor...,2008,conf/icarcv/ZymnisBG08,Mixed state estimation for a linear Gaussian M...,Argyris Zymnis|Stephen P. Boyd|Dimitry M. Gori...,2008,1.0,0.777778
7961,9986,journals/tip/MatungkaZE09,conf/icip/MatungkaZE08,True,7,journals/tip/MatungkaZE09,Image Registration Using Adaptive Polar Transf...,Rittavee Matungka|Yuan F. Zheng|Robert L. Ewing,-2009,conf/icip/MatungkaZE08,Image registration using Adaptive Polar Transf...,Rittavee Matungka|Yuan F. Zheng|Robert L. Ewing,-2008,0.5,1.0


In [44]:
text_features.head(20)

Unnamed: 0,pkey,pauthor,ptitle,pyear
0,conf/dft/SemiaoRVSTT07,Improving the Tolerance of Pipeline Based Circ...,Jorge Semião|Juan J. Rodríguez-Andina|Fabian V...,-2007
1,conf/dagstuhl/Caire07,A Normative Multi-Agent Systems Approach to th...,Patrice Caire,-2007
2,journals/tit/BT07,Self-Orthogonality of q-Ary Images of qm-Ary C...,Sundeep B|Andrew Thangaraj,2007
3,conf/icdcsw/Pardo-Castellote03,OMG Data-Distribution Service: Architectural O...,Gerardo Pardo-Castellote,-2003
4,journals/corr/abs-0911-4329,Structural Consistency: Enabling XML Keyword S...,Ki-Hoon Lee|Kyu-Young Whang|Wook-Shin Han|Min-...,2009
5,conf/cdc/LiuHT09,Input-to-state stability for a class of hybrid...,Bin Liu|David J. Hill|Kok Lay Teo,-2009
6,journals/mcs/SunKZC03,Haiwei Sun|Ning Kang|Jun Zhang|Eric S. Carlson,A fourth-order compact difference scheme on fa...,-2003
7,conf/icic/LeeKKK06,Performance Improvement of Intelligent UWB-IR ...,Sang-Heon Lee|Nam-Sung Kim|Heau-Jo Kang|Soon-G...,-2006
8,conf/icns/Capo-ChichiGF09,IEEE 802.15.4 Performance on a Hierarchical Hy...,Eugene Pamba Capo-Chichi|Hervé Guyennet|Jean-M...,-2009
9,journals/jche/WangH09,Scaffolding preservice teachers' WebQuest desi...,Feng Wang|Michael J. Hannafin,-2009


In [38]:
dblp[['pauthor', 'ptitle']].head(20)

Unnamed: 0,pauthor,ptitle
0,Jorge Semião|Juan J. Rodríguez-Andina|Fabian V...,Improving the Tolerance of Pipeline Based Circ...
1,Patrice Caire,A Normative Multi-Agent Systems Approach to th...
2,Sundeep B|Andrew Thangaraj,Self-Orthogonality of q-Ary Images of qm-Ary C...
3,Gerardo Pardo-Castellote,OMG Data-Distribution Service: Architectural O...
4,Ki-Hoon Lee|Kyu-Young Whang|Wook-Shin Han|Min-...,Structural Consistency: Enabling XML Keyword S...
5,Bin Liu|David J. Hill|Kok Lay Teo,Input-to-state stability for a class of hybrid...
6,A fourth-order compact difference scheme on fa...,Haiwei Sun|Ning Kang|Jun Zhang|Eric S. Carlson
7,Sang-Heon Lee|Nam-Sung Kim|Heau-Jo Kang|Soon-G...,Performance Improvement of Intelligent UWB-IR ...
8,Eugene Pamba Capo-Chichi|Hervé Guyennet|Jean-M...,IEEE 802.15.4 Performance on a Hierarchical Hy...
9,Feng Wang|Michael J. Hannafin,Scaffolding preservice teachers' WebQuest desi...
