In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import Levenshtein as Lv
import time

import numpy as np
import json
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Analysis of AutoPhrase results

In [2]:
# Input CSVs with the AutoPhrase results, given relative to the notebook's working directory.
# One file holds phrases de-duplicated within each year, the other phrases de-duplicated overall.
fp_uniquebyyear = '../results/dblp-v10-phrases-uniquebyyear.csv'
fp_unique = '../results/dblp-v10-phrases-unique.csv'

In [3]:
# Start from year 1968 - earlier years had too little training data
# We only kept multi-word phrases with quality above 0.6 and single-word phrases above 0.8

In [4]:
# Phrases that are unique across the whole corpus (no duplicates), from 1968 onwards.
un_all = pd.read_csv(fp_unique, index_col=0).loc[lambda frame: frame['Year'] >= 1968]
un_all

Unnamed: 0,Phrase Quality,Phrase,Year
9,0.890167,time sharing,1968
10,0.610000,real time,1970
11,0.964167,pattern recognition,1972
12,0.866167,data base,1972
13,0.850167,programming languages,1972
...,...,...,...
39336,0.603183,great attention,2017
39337,0.602875,limited training,2017
39338,0.602173,public datasets,2017
39339,0.602145,existing works,2017


In [5]:
# Phrases that are unique within each year (the same phrase may recur in other years),
# from 1968 onwards.
uby = pd.read_csv(fp_uniquebyyear, index_col=0).loc[lambda frame: frame['Year'] >= 1968]
uby

Unnamed: 0,Phrase Quality,Phrase,Year
20,0.946833,context free,1968
21,0.890167,time sharing,1968
22,0.909000,context free,1969
23,0.896000,time sharing,1969
24,0.993000,context free,1970
...,...,...,...
238718,0.600501,practical implementation,2017
238719,0.600363,next generation,2017
238720,0.600185,deep convolutional neural,2017
238721,0.600087,network nodes,2017


In [6]:
# Restrict to phrases that occur in more than one row of the per-year table, so that
# trends over time can be examined. keep=False flags every occurrence, not just repeats.
uby_dups = uby.loc[uby['Phrase'].duplicated(keep=False)].copy()
uby_dups

Unnamed: 0,Phrase Quality,Phrase,Year
20,0.946833,context free,1968
21,0.890167,time sharing,1968
22,0.909000,context free,1969
23,0.896000,time sharing,1969
24,0.993000,context free,1970
...,...,...,...
238718,0.600501,practical implementation,2017
238719,0.600363,next generation,2017
238720,0.600185,deep convolutional neural,2017
238721,0.600087,network nodes,2017


In [7]:
# The 30 most frequently recurring phrases (value_counts orders by count, descending).
uby_dups['Phrase'].value_counts().head(30)

context free               46
high level                 42
programming language       42
natural language           41
pattern recognition        41
programming languages      41
data structures            41
sufficient conditions      39
data structure             39
problem solving            39
np complete                38
general purpose            38
artificial intelligence    38
signal processing          38
linear programming         38
lower bounds               38
large scale                37
dynamic programming        37
image processing           36
information retrieval      36
data flow                  36
lower bound                36
software development       35
user interface             35
knowledge base             35
data types                 35
database systems           35
high speed                 35
software engineering       35
worst case                 35
Name: Phrase, dtype: int64

In [8]:
# All rows for one example phrase, to inspect how its quality score moves over the years.
uby_dups.loc[uby_dups['Phrase'].eq('image processing')]

Unnamed: 0,Phrase Quality,Phrase,Year
140,0.978969,image processing,1981
172,0.984815,image processing,1982
216,0.984825,image processing,1983
289,0.969924,image processing,1984
367,0.969975,image processing,1985
455,0.976549,image processing,1986
542,0.980447,image processing,1987
692,0.983969,image processing,1988
950,0.965337,image processing,1989
1244,0.972183,image processing,1990


In [9]:
# Can look at phrases that first appear in a year, then look at future years in which they appear
# NOTE(review): `data` is never read in the cells visible here - presumably a placeholder
# for the first-appearance analysis described above; confirm before relying on it.
data = {}

# Phrase matching/similarity for input papers (title + abstract)

In [10]:
# Given an input paper (title + abstract), extract the phrases within it and return the
# similar phrases found in the AutoPhrase results.
# Can use Levenshtein distance to find similar strings, or just use direct phrase matching

In [11]:
# CSV of sample input papers used to try out phrase matching (relative path).
test_fp = '../data/arxiv/csv/2016.csv'

In [12]:
# Load the sample papers and preview the first rows; the Title and Abstract columns
# are the ones used below.
test_data = pd.read_csv(test_fp)
test_data.head()

Unnamed: 0,Title,Abstract
0,Sequential Short-Text Classification with Recu...,Recent approaches based on artificial neural n...
1,Multiresolution Recurrent Neural Networks An A...,We introduce the multiresolution recurrent neu...
2,Document Image Coding and Clustering for Scrip...,The paper introduces a new method for discrimi...
3,Tutorial on Answering Questions about Images w...,Together with the development of more accurate...
4,Building Machines That Learn and Think Like Pe...,Recent progress in artificial intelligence AI ...


In [13]:
# Title and abstract of the first paper (index label 0), joined by a single space.
' '.join((test_data.loc[0, 'Title'], test_data.loc[0, 'Abstract']))

'Sequential Short-Text Classification with Recurrent and Convolutional  Neural Networks Recent approaches based on artificial neural networks ANNs have shownpromising results for short-text classification However many short textsoccur in sequences eg sentences in a document or utterances in a dialogand most existing ANN-based systems do not leverage the preceding short textswhen classifying a subsequent one In this work we present a model based onrecurrent neural networks and convolutional neural networks that incorporatesthe preceding short texts Our model achieves state-of-the-art results on threedifferent datasets for dialog act prediction'

In [14]:
# Show the per-year phrase table again for reference.
uby

Unnamed: 0,Phrase Quality,Phrase,Year
20,0.946833,context free,1968
21,0.890167,time sharing,1968
22,0.909000,context free,1969
23,0.896000,time sharing,1969
24,0.993000,context free,1970
...,...,...,...
238718,0.600501,practical implementation,2017
238719,0.600363,next generation,2017
238720,0.600185,deep convolutional neural,2017
238721,0.600087,network nodes,2017


In [15]:
# Example query phrase to look up among the AutoPhrase phrases.
input_phrase = 'convolutional neural networks'

In [16]:
# Cheap pre-filter: flag rows whose phrase starts with the same first letter as the
# query, so the edit-distance search only has to score a fraction of the phrases.
#
# The vectorised `.str.startswith` replaces the row-wise `apply(axis=1)`:
#   * the letter comes from `input_phrase` instead of a hard-coded 'c';
#   * `na=False` maps missing / non-string phrases to False, as the isinstance check did;
#   * an empty-string phrase yields False instead of raising IndexError on `[0]`.
# `input_phrase[:1]` (not `[0]`) keeps an empty query from raising; it then matches every row.
valid_ix = uby['Phrase'].str.startswith(input_phrase[:1], na=False)
valid_ix

20         True
21        False
22         True
23        False
24         True
          ...  
238718    False
238719    False
238720    False
238721    False
238722    False
Length: 238703, dtype: bool

In [17]:
# Distinct phrases among the rows selected by the first-letter mask.
unique_phrases = uby.loc[valid_ix, 'Phrase'].unique()
unique_phrases

array(['context free', 'computer science', 'computer graphics', ...,
       'complete characterization', 'co occurrence matrix',
       'computational approach'], dtype=object)

In [18]:
# Linear scan for the phrase with the smallest Levenshtein distance to the query.
# Only a strictly smaller distance replaces the current best, so the earliest
# phrase wins when several are equally close.
candidate, dist = '', float('inf')
for phrase in unique_phrases:
    diff = Lv.distance(input_phrase, phrase)
    if diff >= dist:
        continue
    candidate, dist = phrase, diff

In [19]:
# Closest phrase found by the scan.
candidate

'convolutional neural networks'

In [20]:
# Edit distance between the query and the closest phrase (0 means an exact match).
dist

0

In [21]:
# Year-by-year rows for the best match found by the scan. Using `candidate` rather
# than a re-typed string literal keeps this lookup in sync when `input_phrase` changes.
uby[uby['Phrase'] == candidate]

Unnamed: 0,Phrase Quality,Phrase,Year
136900,0.865809,convolutional neural networks,2012
152925,0.915629,convolutional neural networks,2013
172010,0.937014,convolutional neural networks,2014
190695,0.931728,convolutional neural networks,2015
212473,0.917273,convolutional neural networks,2016
233574,0.904261,convolutional neural networks,2017


In [22]:
# Same approach, but keeping every (distance, phrase) pair this time.
# Built with a comprehension so the loop no longer rebinds the notebook-level `dist`,
# which holds the best-match distance from the earlier scan; re-running the cell that
# displays `dist` after this one used to show the distance to the *last* phrase instead.
candidates = [(Lv.distance(input_phrase, phrase), phrase) for phrase in unique_phrases]

In [23]:
# In-place sort. Tuples compare element by element, so the order is by distance
# first and alphabetically by phrase among equal distances.
candidates.sort()

In [24]:
# First ten (distance, phrase) pairs.
candidates[:10]

[(0, 'convolutional neural networks'),
 (1, 'convolutional neural network'),
 (3, 'convolution neural network'),
 (4, 'convolutional neural network cnn'),
 (4, 'convolutional neural networks cnn'),
 (5, 'convolutional neural networks cnns'),
 (7, 'convolutional networks'),
 (8, 'convolutional network'),
 (9, 'cellular neural networks'),
 (9, 'chaotic neural networks')]

# Model generation

In [25]:
# Logistic-regression classifier created with scikit-learn's default settings.
model = LogisticRegression()

In [26]:
# Split the recurring phrases into train/test sets (feature: phrase text, target: year).
# A fixed random_state makes the shuffle - and therefore everything fitted on it -
# identical on every Restart & Run All; without it each run produced a different split.
X_train, X_test, y_train, y_test = train_test_split(
    uby_dups['Phrase'], uby_dups['Year'], random_state=42
)

In [27]:
# NOTE(review): fitting is left disabled - presumably because X_train holds raw phrase
# strings that would need to be encoded numerically first; confirm before enabling.
#model.fit(X_train, y_train)

In [28]:
# Can try using correlation between phrases
# Don't have to use a model, just calculate the correlation of input phrases against phrases by year

In [29]:
# Using string similarity
# https://towardsdatascience.com/calculating-string-similarity-in-python-276e18a7d33a
