In [1]:
! pip install suffix_trees langdetect




In [1]:
import re
from suffix_trees import STree          # used to determine longest similar string
import unicodedata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.set_option("display.max_columns", None)


In [2]:
# import jsons
def json2dict(path):
    x = pd.read_json(path)
    return dict(zip(x['id'], x['name']))


pbooktitle = json2dict('data/pbooktitle.json')
pbooktitlefull = json2dict('data/pbooktitlefull.json')
pjournal = json2dict('data/pjournal.json')
pjournalfull = json2dict('data/pjournalfull.json')
ptype = json2dict('data/ptype.json')

# import csvs
dblp = pd.DataFrame()
for i in range(1, 5):
    csv = pd.read_csv(f'data/dblp-{i}.csv')
    dblp = pd.concat([dblp, csv]).reset_index(drop=True)

test_hidden = pd.read_csv('data/test_hidden.csv')
train = pd.read_csv('data/train.csv')
validation_hidden = pd.read_csv('data/validation_hidden.csv')


# Data Cleaning

In [3]:
def remove_accents(s):
    '''
    https://docs.python.org/3/library/unicodedata.html#module-unicodedata
    '''
    s = str(s)      # cannot assume string
    norm_form = unicodedata.normalize('NFKD', s)
    return "".join([char for char in norm_form if not unicodedata.combining(char)])


def overlap_str(string1, string2):
    '''
    determine longest containing substring beteween strings
    '''
    return STree.STree([string1, string2]).lcs()


def sort_authors(s):
    '''
    sort authors by first name
    '''

    if '|' in s:
        authors = s.split(r'[|]')
        authors = [a.strip().title() for a in authors if a!='']
        authors = sorted(authors)
        s = '|'.join(authors)

    return s
    


In [4]:
df = dblp.copy()

# correct for negative publication years
df['pyear'] = df['pyear'].abs()

# remove accents for columns
df['pauthor'] = df['pauthor'].apply(remove_accents)
df['ptitle'] = df['ptitle'].apply(remove_accents)

# remove accents and joint dictionarys by key
ptype = {k: remove_accents(v) for k, v in ptype.items()}
pjournal = {k: remove_accents(v) for k, v in pjournal.items()}
pjournalfull = {k: remove_accents(v) for k, v in pjournalfull.items()}
pbooktitle = {k: remove_accents(v) for k, v in pbooktitle.items()}
pbooktitlefull = {k: remove_accents(v) for k, v in pbooktitlefull.items()}

# replace id keys with values
id_cols = ['ptype_id', 'pjournal_id', 'pjournalfull_id',
           'pbooktitle_id', 'pbooktitlefull_id']
for col in id_cols:
    df[col].replace(eval(col[:-3]), inplace=True)


In [5]:
# split pkey for further inspection
df['pkey_length'] = df['pkey'].str.count("/") + 1
df['pkey_split'] = df['pkey'].str.split('/')
df[['pkey_type', 'pkey_author']] = [[split[0], split[-1]]
                                    for split in df['pkey_split']]


In [6]:
# clean ptitle-pauthor switch
mask = df['ptitle'].str.contains('|', regex=False) | ((~df['pauthor'].str.contains('|', regex=False))
                                                                 & (df['pauthor'].str.len() > df['ptitle'].str.len()))
df.loc[mask, ['pauthor', 'ptitle']] = df.loc[mask, ['ptitle', 'pauthor']].values


In [7]:
# determine longest containing string to identify correct pauthor values
def get_lcs(str1,str2) -> str: return STree.STree([str1, str2]).lcs()

df['title_lcs_pkeyAuthor'] = df.apply(lambda x: get_lcs(x['ptitle'],x['pkey_author']), axis=1)
df['author_lcs_pkeyAuthor'] = df.apply(lambda x: get_lcs(x['pauthor'], x['pkey_author']), axis=1)


In [13]:
title_bettermatch = df['title_lcs_pkeyAuthor'].str.len(
) > df['author_lcs_pkeyAuthor'].str.len()
cols = ['pauthor', 'ptitle', 'pkey_author',
        'author_lcs_pkeyAuthor', 'title_lcs_pkeyAuthor']
df[ title_bettermatch  ][cols].sort_values(by='title_lcs_pkeyAuthor', key=lambda x: x.str.len(), ascending=False).head(30)

Unnamed: 0,pauthor,ptitle,pkey_author,author_lcs_pkeyAuthor,title_lcs_pkeyAuthor
10683,Chart.,Hans Hinterberger,Hinterberger09,r,Hinterberger
8450,Introduction.,Maris G. Martinsons,Martinsons05,ti,Martinsons
8339,PageRank Algorithm.,Monika Rauch Henzinger,Henzinger08,ge,Henzinger
4006,Editorial.,Alan Gilchrist,Gilchrist08,ri,Gilchrist
12684,Geometric BIC.,Kenichi Kanatani,Kanatani10,t,Kanatani
6488,Editorial.,Michael Prietula,Prietula00,ri,Prietula
10731,Editorial.,Warren Harrison,Harrison98,ri,Harrison
16545,Group morphology.,J. B. T. M. Roerdink,Roerdink00,r,Roerdink
12747,Book review.,Rene Henrion,Henrion07,o,Henrion
12634,Book review.,Joaquin Abellan,Abellan06,e,Abellan


In [89]:
# re-order authors -- may help with entity recognition?
df['pauthor'] = df['pauthor'].apply(sort_authors)


In [None]:
df


Unnamed: 0.1,Unnamed: 0,pauthor,peditor,ptitle,pyear,paddress,ppublisher,pseries,pid,pkey,ptype_id,pjournal_id,pbooktitle_id,pjournalfull_id,pbooktitlefull_id,partition,pkey_length,pkey_split,pkey_type,pkey_author,match_titleKey,match_authorKey,ptitle_lg
0,4,Jorge Semiao|Juan J. Rodriguez-Andina|Fabian V...,,Improving the Tolerance of Pipeline Based Circ...,2007,,,,180843,conf/dft/SemiaoRVSTT07,inproceedings,,DFT,,,1,3,"[conf, dft, SemiaoRVSTT07]",conf,SemiaoRVSTT07,ia,Semiao,en
1,7,Patrice Caire,,A Normative Multi-Agent Systems Approach to th...,2007,,,,162991,conf/dagstuhl/Caire07,inproceedings,,Normative Multi-agent Systems,,European Grid Conference,1,3,"[conf, dagstuhl, Caire07]",conf,Caire07,r,Caire,en
2,10,Sundeep B|Andrew Thangaraj,,Self-Orthogonality of q-Ary Images of qm-Ary C...,2007,,,,2261406,journals/tit/BT07,article,IEEE Transactions on Information Theory,WiMob,International Journal of Ambient Computing and...,ACM Symposium on Parallel Algorithms and Archi...,1,3,"[journals, tit, BT07]",journals,BT07,,B,en
3,18,Gerardo Pardo-Castellote,,OMG Data-Distribution Service: Architectural O...,2003,,,,349720,conf/icdcsw/Pardo-Castellote03,inproceedings,,ICDCS Workshops,,International Agent Technology Conference,1,3,"[conf, icdcsw, Pardo-Castellote03]",conf,Pardo-Castellote03,te,Pardo-Castellote,en
4,19,Ki-Hoon Lee|Kyu-Young Whang|Wook-Shin Han|Min-...,,Ki-Hoon Lee|Kyu-Young Whang|Wook-Shin Han|Min-...,2009,,,,1922328,journals/corr/abs-0911-4329,article,CoRR,,International Journal of Wireless Information ...,Messung,1,3,"[journals, corr, abs-0911-4329]",journals,abs-0911-4329,ab,-,tl
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17160,9975,Online bin packing with arbitrary release times.,,Online bin packing with arbitrary release times.,2008,,,,2245299,journals/tcs/ShiY08,article,Theor. Comput. Sci.,,ACM Transactions on Computation Theory (TOCT),Messung,4,3,"[journals, tcs, ShiY08]",journals,ShiY08,Shi,i,en
17161,9978,Chew Lim Tan|Henry Wai Kit Chia,,Neural Logic Network Learning using Genetic Pr...,2001,,,,460233,conf/ijcai/TanC01,inproceedings,,IJCAI,,inroads (ACM SIGCSE Bulletin)|ACM SIGCSE Bulletin,4,3,"[conf, ijcai, TanC01]",conf,TanC01,a,Tan,en
17162,9981,Sue Newell|Jacky Swan|Joseph Weiss,,Project Management: Minitrack Introduction.,2004,,,,289610,conf/hicss/NewellSW04,inproceedings,,HICSS,,Grid and Cooperative Computing,4,3,"[conf, hicss, NewellSW04]",conf,NewellSW04,e,Newell,en
17163,9988,Martijn Hendriks|Barend Van Den Nieuwelaar|Fri...,,Model checker aided design of a controller for...,2006,,,,2224414,journals/sttt/HendriksNV06,article,STTT,,374,Messung,4,3,"[journals, sttt, HendriksNV06]",journals,HendriksNV06,d,Hendriks,en


# Inspect ptype_id and pkey_type

**ptype_id**
- *article* = A paper that has been published in conference proceedings. The usage of conference and inproceedings is the same
- *book* = A thesis written for the PhD level degree.
- *incollection* = A titled section of a book. Such as a short story within the larger collection of short stories that make up the book
- *inproceedings* = A paper that has been published in conference proceedings. The usage of conference and inproceedings is the same
- *phdthesis* = A thesis written for the PhD level degree.

**pkey_type**
- *conf* = 
- *journals* = 
- *reference* = 
- *series* = could be series of books
- *tr* = technical report -> An institutionally published report such as a report from a school, a government organization, an organization, or a company. This entry type is also frequently used for white papers and working papers.
- *phd* = should likely be a phdthesis



In [None]:
df.groupby(by=['ptype_id','pkey_type']).size().reset_index()

Unnamed: 0,ptype_id,pkey_type,0
0,article,conf,490
1,article,journals,4477
2,article,reference,2
3,article,series,1
4,article,tr,9
5,book,books,4
6,book,conf,2
7,book,journals,1
8,book,series,4
9,incollection,books,21


In [None]:

df.groupby(by=['pkey_type', 'ptype_id']).size().reset_index()



Unnamed: 0,pkey_type,ptype_id,0
0,books,book,4
1,books,incollection,21
2,books,inproceedings,3
3,conf,article,490
4,conf,book,2
5,conf,incollection,126
6,conf,inproceedings,9229
7,conf,phdthesis,1107
8,journals,article,4477
9,journals,book,1
