In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import Levenshtein as Lv
import time
import re
from glob import glob

import numpy as np
import json
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Loading in AutoPhrase and phrasal segmentation results dataframes

In [None]:
# AutoPhrase results dataframe
fp_phrases = '../results/dblp-v10-grouped/phrases.csv'
phrases = pd.read_csv(fp_phrases, index_col=0)

In [None]:
# Processed phrasal segmentation results dataframe
infolder = '../results/dblp-v10-grouped'
subfolders = glob(infolder + '/*.csv')
subfolders = list(filter(lambda x: 'segmented' in x, subfolders))
seg = pd.DataFrame(columns=['Phrases', 'Year', 'Num Phrases'])
for fp in subfolders:
    df = pd.read_csv(fp, index_col=0)
    df = df.dropna()
    df['Num Phrases'] = df.apply(lambda x: len(x['Phrases'].split(',')), axis=1)
    #df = df.drop('Phrases', axis=1)
    seg = seg.append(df, ignore_index=True)
seg = seg.dropna()

In [None]:
# Maybe add code for creating counts dictionary if it can be used with the model

## Integrating phrase quality with phrasal segmentation results

In [16]:
def find_qualities(x):
    """
    Helper function to process the segmentation.csv files
    Only keeps quality phrases (multi >= 0.5, single >= 0.8)
    Obtains the phrase quality of each phrase by matching with phrases.csv
    """
    x = x['Phrases'].split(',')
    out_phrases = []
    out_quality = []
    for phrase in x:
        # Phrase will not show up in the phrases df if the quality is too low
        # We only kept multi >= 0.5 and single >= 0.8
        # NOTE: Potential issue with phrase having dashes in seg when they don't in phrases
        #       (i.e. user-controlled vs. user controlled)
        match = phrases[phrases['Phrase'] == phrase]
        if len(match) == 0:
            continue
        else:
            out_phrases.append(phrase)
            out_quality.append(match['Phrase Quality'].values[0])
    return out_phrases, out_quality

In [17]:
test = seg.copy()
test = seg.loc[len(seg)-10:len(seg)]
test

Unnamed: 0,Phrases,Year,Num Phrases
2548121,"variational autoencoders,data,probabilistic,da...",2015-2017,25
2548122,"today,advanced,computational science,data,dist...",2015-2017,26
2548123,"paper,numerical solution,one-dimensional,semi-...",2015-2017,30
2548124,"recent years,gnss,navigation,satellites,constr...",2015-2017,55
2548125,"evolution,liquid,gas,level-set,ls,volume of fl...",2015-2017,34
2548126,"research,fi,indoor,location-based service lbs,...",2015-2017,44
2548127,"icts,eliminating,gender,entrepreneurship,gover...",2015-2017,18
2548128,"infinite horizon,inventory,model,general,reven...",2015-2017,12
2548129,"infrared,technology,deep-space,radiation,earth...",2015-2017,51
2548130,"net,power output,exergy,organic,rankine cycle,...",2015-2017,34


In [18]:
test['Phrases'], test['Phrase Qualities'] = zip(*test.apply(find_qualities, axis=1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Phrases'], test['Phrase Qualities'] = zip(*test.apply(find_qualities, axis=1))


In [19]:
test

Unnamed: 0,Phrases,Year,Num Phrases,Phrase Qualities
2548121,"[variational autoencoders, probabilistic, grap...",2015-2017,25,"[0.9449002689, 0.8166917289, 0.7755210485, 0.5..."
2548122,"[today, advanced, computational science, distr...",2015-2017,26,"[0.8001977554, 0.8170605506, 0.8585270157, 0.9..."
2548123,"[numerical solution, schrodinger equation, unb...",2015-2017,30,"[0.5490036162, 0.7999191231, 0.7916130062, 0.8..."
2548124,"[recent years, gnss, navigation, constrained e...",2015-2017,55,"[0.5832008349, 0.851142233, 0.8022377586, 0.63..."
2548125,"[liquid, gas, ls, volume of fluid, sharp, phys...",2015-2017,34,"[0.8092998956, 0.8071359871, 0.8232093176, 0.6..."
2548126,"[research, fi, indoor, cascaded, indoor, inert...",2015-2017,44,"[0.8071345493, 0.8448244418, 0.83323151, 0.814..."
2548127,"[icts, eliminating, gender, entrepreneurship, ...",2015-2017,18,"[0.8633365439, 0.8008664255, 0.8056553868, 0.8..."
2548128,"[infinite horizon, inventory, discount rate, b...",2015-2017,12,"[0.9018355738, 0.8087514964, 0.8258210996, 0.8..."
2548129,"[infrared, radiation, earth, pushing, power co...",2015-2017,51,"[0.8256317027, 0.8054114773, 0.810824659, 0.81..."
2548130,"[net, power output, exergy, organic, working f...",2015-2017,34,"[0.8324536295, 0.5708393502, 0.8773958416, 0.8..."


In [20]:
test.loc[2548129]['Phrases']

['infrared',
 'radiation',
 'earth',
 'pushing',
 'power consumption',
 'operating conditions',
 'khz',
 'adc',
 'mw',
 'instrumentation',
 'mirror',
 'low sensitivity',
 'modulator',
 'physical implementation',
 'm 1',
 'poly',
 'metal',
 'cmos',
 'db',
 'sndr',
 'clock',
 'ir',
 'imager',
 'per minute',
 'effective number of bits enob',
 'mw',
 'khz',
 'ir',
 'instrumentation']

In [21]:
test.loc[2548129]['Phrase Qualities']

[0.8256317027,
 0.8054114773,
 0.810824659,
 0.8148357817,
 0.9322819448,
 0.6685492507,
 0.8225274735,
 0.8244446791,
 0.8231960364,
 0.8145576518,
 0.8339330792,
 0.6613013316,
 0.8281263484,
 0.5607881058,
 0.7981997513,
 0.8184902657,
 0.8007140811,
 0.8217445899,
 0.8271253755,
 0.8425095531,
 0.8073903977,
 0.8065003229,
 0.8192171443,
 0.5306616221,
 0.7624752751,
 0.8231960364,
 0.8225274735,
 0.8065003229,
 0.8145576518]

In [22]:
phrases[phrases['Phrase'] == 'user controlled']

Unnamed: 0,Phrase Quality,Phrase,Year,Num Words
23440,0.557913,user controlled,1995-1999,2
45754,0.716653,user controlled,2000-2004,2
93921,0.792673,user controlled,2005-2009,2
195600,0.748656,user controlled,2010-2014,2
281156,0.715213,user controlled,2015-2017,2


In [23]:
re.sub(r'[^A-Za-z0-9- ]+', '', 'user-controlled')

'user-controlled'

In [24]:
# Need to replace dashes (and potentially other chars) with a space
re.sub(r'[-]+', ' ', 'user-controlled')

'user controlled'

# Analysis of AutoPhrase results

In [25]:
fp_uniquebyyear = '../results/dblp-v10/phrases.csv'
#fp_unique = '../results/dblp-v10-phrases-unique.csv'

In [26]:
# Start from year 1968 - years before had too little training data
# We only kept multi-word phrases above 0.6 and single-word above 0.8

In [27]:
# Contains the phrases unique overall (no duplicates)
# un_all = pd.read_csv(fp_unique, index_col=0)
# un_all = un_all[un_all['Year'] >= 1968]
# un_all

In [28]:
# Contains the phrases unique by year (there can be duplicates across years)
uby = pd.read_csv(fp_uniquebyyear, index_col=0)
uby = uby[uby['Year'] >= 1968]
uby = uby.dropna()
uby

Unnamed: 0,Phrase Quality,Phrase,Year
20,0.946833,context free,1968
21,0.890167,time sharing,1968
22,0.909000,context free,1969
23,0.896000,time sharing,1969
24,0.993000,context free,1970
...,...,...,...
238718,0.600501,practical implementation,2017
238719,0.600363,next generation,2017
238720,0.600185,deep convolutional neural,2017
238721,0.600087,network nodes,2017


In [29]:
# Only keeps the phrases that show up multiple times across years (so we can look for trends)
uby_dups = uby.copy()
uby_dups = uby_dups[uby_dups['Phrase'].duplicated(keep=False)]
uby_dups

Unnamed: 0,Phrase Quality,Phrase,Year
20,0.946833,context free,1968
21,0.890167,time sharing,1968
22,0.909000,context free,1969
23,0.896000,time sharing,1969
24,0.993000,context free,1970
...,...,...,...
238718,0.600501,practical implementation,2017
238719,0.600363,next generation,2017
238720,0.600185,deep convolutional neural,2017
238721,0.600087,network nodes,2017


In [30]:
uby_dups['Phrase'].value_counts()[:10]

context free             46
high level               42
programming language     42
natural language         41
pattern recognition      41
programming languages    41
data structures          41
sufficient conditions    39
data structure           39
problem solving          39
Name: Phrase, dtype: int64

In [31]:
uby_dups[uby_dups['Phrase'] == 'image processing']['Year'].values

array([1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
       2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2016, 2017], dtype=int64)

# Phrase matching/similarity for input papers (title + abstract)

In [32]:
# Given an input paper (title + abstract), extract the phrases within it and return the
# similar phrases found in the AutoPhrase results.
# Can use Levenshtein distance to find similar strings, or just use direct phrase matching

In [33]:
test_fp = '../data/arxiv/csv/2016.csv'

In [34]:
test_data = pd.read_csv(test_fp)
test_data.head()

Unnamed: 0,Title,Abstract
0,Sequential Short-Text Classification with Recu...,Recent approaches based on artificial neural n...
1,Multiresolution Recurrent Neural Networks An A...,We introduce the multiresolution recurrent neu...
2,Document Image Coding and Clustering for Scrip...,The paper introduces a new method for discrimi...
3,Tutorial on Answering Questions about Images w...,Together with the development of more accurate...
4,Building Machines That Learn and Think Like Pe...,Recent progress in artificial intelligence AI ...


In [35]:
test_data['Title'][0] + ' ' + test_data['Abstract'][0]

'Sequential Short-Text Classification with Recurrent and Convolutional  Neural Networks Recent approaches based on artificial neural networks ANNs have shownpromising results for short-text classification However many short textsoccur in sequences eg sentences in a document or utterances in a dialogand most existing ANN-based systems do not leverage the preceding short textswhen classifying a subsequent one In this work we present a model based onrecurrent neural networks and convolutional neural networks that incorporatesthe preceding short texts Our model achieves state-of-the-art results on threedifferent datasets for dialog act prediction'

In [36]:
uby

Unnamed: 0,Phrase Quality,Phrase,Year
20,0.946833,context free,1968
21,0.890167,time sharing,1968
22,0.909000,context free,1969
23,0.896000,time sharing,1969
24,0.993000,context free,1970
...,...,...,...
238718,0.600501,practical implementation,2017
238719,0.600363,next generation,2017
238720,0.600185,deep convolutional neural,2017
238721,0.600087,network nodes,2017


In [37]:
input_phrase = 'convolutional neural networks'

In [38]:
valid_ix = uby.apply(lambda x: x['Phrase'][0] == 'c' if isinstance(x['Phrase'], str) else False, axis=1)
valid_ix

20         True
21        False
22         True
23        False
24         True
          ...  
238718    False
238719    False
238720    False
238721    False
238722    False
Length: 238701, dtype: bool

In [39]:
unique_phrases = uby[valid_ix]['Phrase'].unique()
unique_phrases

array(['context free', 'computer science', 'computer graphics', ...,
       'complete characterization', 'co occurrence matrix',
       'computational approach'], dtype=object)

In [40]:
candidate = ''
dist = float('inf')
for phrase in unique_phrases:
    diff = Lv.distance(input_phrase, phrase)
    if diff < dist:
        candidate = phrase
        dist = diff

In [41]:
candidate

'convolutional neural networks'

In [42]:
dist

0

In [43]:
uby[uby['Phrase'] == 'convolutional neural networks']

Unnamed: 0,Phrase Quality,Phrase,Year
136900,0.865809,convolutional neural networks,2012
152925,0.915629,convolutional neural networks,2013
172010,0.937014,convolutional neural networks,2014
190695,0.931728,convolutional neural networks,2015
212473,0.917273,convolutional neural networks,2016
233574,0.904261,convolutional neural networks,2017


In [44]:
# Same approach, but keeping track of all candidates this time

In [45]:
candidates = []
for phrase in unique_phrases:
    dist = Lv.distance(input_phrase, phrase)
    candidates.append((dist, phrase))

In [46]:
candidates.sort()

In [47]:
candidates[:10]

[(0, 'convolutional neural networks'),
 (1, 'convolutional neural network'),
 (3, 'convolution neural network'),
 (4, 'convolutional neural network cnn'),
 (4, 'convolutional neural networks cnn'),
 (5, 'convolutional neural networks cnns'),
 (7, 'convolutional networks'),
 (8, 'convolutional network'),
 (9, 'cellular neural networks'),
 (9, 'chaotic neural networks')]

In [48]:
# Using df.apply so we can look at all phrases, not just phrases that start with the same letter

In [49]:
uby_test = uby.copy()
uby_test['Dist'] = uby_test.apply(lambda x: Lv.distance(input_phrase, x['Phrase']) if isinstance(x['Phrase'], str) else float('inf'), axis=1)
uby_test

Unnamed: 0,Phrase Quality,Phrase,Year,Dist
20,0.946833,context free,1968,22
21,0.890167,time sharing,1968,24
22,0.909000,context free,1969,22
23,0.896000,time sharing,1969,24
24,0.993000,context free,1970,22
...,...,...,...,...
238718,0.600501,practical implementation,2017,22
238719,0.600363,next generation,2017,21
238720,0.600185,deep convolutional neural,2017,14
238721,0.600087,network nodes,2017,21


In [50]:
uby_test.sort_values('Dist')[:10]

Unnamed: 0,Phrase Quality,Phrase,Year,Dist
212473,0.917273,convolutional neural networks,2016,0
233574,0.904261,convolutional neural networks,2017,0
152925,0.915629,convolutional neural networks,2013,0
190695,0.931728,convolutional neural networks,2015,0
136900,0.865809,convolutional neural networks,2012,0
172010,0.937014,convolutional neural networks,2014,0
190443,0.93688,convolutional neural network,2015,1
174069,0.898522,convolutional neural network,2014,1
212157,0.922546,convolutional neural network,2016,1
153942,0.900172,convolutional neural network,2013,1


In [51]:
# Testing on the unique overall df

In [52]:
# un_all_test = un_all.copy()
# un_all_test['Dist'] = un_all_test.apply(lambda x: Lv.distance(input_phrase, x['Phrase']) if isinstance(x['Phrase'], str) else float('inf'), axis=1)
# un_all_test.head()

In [53]:
#un_all_test.sort_values('Dist')[:20]

In [54]:
# Consolidating results
# Can start with the most common phrases and change phrases that are close enough (distance <= 5?)
uby_counts = uby.groupby('Phrase').size()

In [55]:
# num_years tells us how many years the phrase has shown up in
uby['num_years'] = uby.apply(lambda x: uby_counts[x['Phrase']], axis=1)

In [56]:
uby.head()

Unnamed: 0,Phrase Quality,Phrase,Year,num_years
20,0.946833,context free,1968,46
21,0.890167,time sharing,1968,2
22,0.909,context free,1969,46
23,0.896,time sharing,1969,2
24,0.993,context free,1970,46


# Model generation

In [57]:
# x: Phrase, Phrase Quality, num_years
# Phrase needs to be one hot encoded
# y: Only the year (may need to use the unique overall dataframe?)

# For phrase quality - standard scaler by year?
# For num_years - normalize overall

In [58]:
from sklearn.preprocessing import StandardScaler
std_pipe = Pipeline([('scale', StandardScaler())])
ohe_pipe = Pipeline([('one-hot', OneHotEncoder(handle_unknown='ignore'))])
ct = ColumnTransformer(transformers=[('ohe', ohe_pipe, ['Phrase']),
                                     ('scale', std_pipe, ['num_years']),
                                    ('keep', 'passthrough', ['Phrase Quality'])])
pl = Pipeline([('transform', ct), ('classifier', DecisionTreeClassifier())])

In [59]:
# Set random seed
X_train, X_test, y_train, y_test = train_test_split(uby[['Phrase', 'num_years', 'Phrase Quality']],
                                                    uby['Year'],
                                                    random_state=1)

In [60]:
_ = pl.fit(X_train, y_train)

In [61]:
# Mean accuracy - 8% accurate
pl.score(X_test, y_test)

0.08058515986326162

In [62]:
X_test['Predicted Year'] = pl.predict(X_test)
X_test['Year'] = y_test
X_test['Abs Year Diff'] = abs(X_test['Year'] - X_test['Predicted Year'])

In [63]:
X_test

Unnamed: 0,Phrase,num_years,Phrase Quality,Predicted Year,Year,Abs Year Diff
198146,band selection,11,0.831558,2009,2015,6
152976,mixed criticality,7,0.914890,2015,2013,2
200739,rna seq data,7,0.807498,2012,2015,3
238274,final result,1,0.642976,2016,2017,1
142017,presidential,4,0.806453,2015,2012,3
...,...,...,...,...,...,...
64293,treatment planning,12,0.792733,2010,2007,3
237679,default mode,10,0.696536,2009,2017,8
198475,user behaviour,12,0.828791,2012,2015,3
5578,temporal reasoning,17,0.905742,2009,1996,13


In [64]:
X_test['Abs Year Diff'].mean()

4.81349286145184

In [65]:
# THINGS TO TEST:
# Using the unique by year dataframe, then only keeping one instance of duplicate phrases
# but replace the year with the average (or median) of the years
# Normalizing the numeric features

# Try using phrasal segmentation model to run on a single paper title + abstract
# Or a single paper full paper text

# Try grouping by papers by a range of years (maybe 5)

In [66]:
X_train[X_train['Phrase']=='convolutional neural networks']

Unnamed: 0,Phrase,num_years,Phrase Quality
212473,convolutional neural networks,6,0.917273
152925,convolutional neural networks,6,0.915629
190695,convolutional neural networks,6,0.931728
136900,convolutional neural networks,6,0.865809


In [67]:
test = pd.DataFrame([[1.0, 'convolutional neural networks', 2005, 6]], columns=['Phrase Quality', 'Phrase', 'Year', 'num_years'])
test

Unnamed: 0,Phrase Quality,Phrase,Year,num_years
0,1.0,convolutional neural networks,2005,6


In [68]:
pl.predict(test[['Phrase', 'num_years', 'Phrase Quality']])

array([1991], dtype=int64)

In [69]:
uby[uby['Phrase'] == 'convolutional neural networks']

Unnamed: 0,Phrase Quality,Phrase,Year,num_years
136900,0.865809,convolutional neural networks,2012,6
152925,0.915629,convolutional neural networks,2013,6
172010,0.937014,convolutional neural networks,2014,6
190695,0.931728,convolutional neural networks,2015,6
212473,0.917273,convolutional neural networks,2016,6
233574,0.904261,convolutional neural networks,2017,6


In [70]:
test = pd.DataFrame([[1.0, 'artificial intelligence', 2005, 6]], columns=['Phrase Quality', 'Phrase', 'Year', 'num_years'])
test

Unnamed: 0,Phrase Quality,Phrase,Year,num_years
0,1.0,artificial intelligence,2005,6


In [71]:
pl.predict(test[['Phrase', 'num_years', 'Phrase Quality']])

array([1991], dtype=int64)

# Baseline model testing with grouped phrases

In [72]:
from sklearn.preprocessing import StandardScaler

In [73]:
df = pd.read_csv('../results/dblp-v10-grouped/phrases.csv', index_col=0)
df = df[4:]
df = df.dropna()

In [74]:
# num_years is slightly different since we grouped years now
phr_counts = df.groupby('Phrase').size()
df['num_years'] = df.apply(lambda x: phr_counts[x['Phrase']], axis=1)

In [75]:
df.head()

Unnamed: 0,Phrase Quality,Phrase,Year,Num Words,num_years
4,0.981,tunnel diode,1960-1964,2,1
5,0.9285,differential equations,1960-1964,2,12
6,0.897,high speed,1960-1964,2,12
7,0.884,data processing,1960-1964,2,12
8,0.55575,per cent,1960-1964,2,5


In [76]:
std_pipe = Pipeline([('scale', StandardScaler())])
ohe_pipe = Pipeline([('one-hot', OneHotEncoder(handle_unknown='ignore'))])
ct = ColumnTransformer(transformers=[('ohe', ohe_pipe, ['Phrase']),
                                     ('scale', std_pipe, ['num_years']),
                                    ('keep', 'passthrough', ['Phrase Quality'])])
pl = Pipeline([('transform', ct), ('classifier', DecisionTreeClassifier())])

In [77]:
X_train, X_test, y_train, y_test = train_test_split(df[['Phrase', 'num_years', 'Phrase Quality']],
                                                    df['Year'],
                                                    random_state=1)

In [78]:
_ = pl.fit(X_train, y_train)

In [79]:
# 42.5% accuracy
pl.score(X_test, y_test)

0.3394138008186257

In [80]:
X_test['Predicted Year'] = pl.predict(X_test)
X_test['Year'] = y_test

In [81]:
X_test

Unnamed: 0,Phrase,num_years,Phrase Quality,Predicted Year,Year
219344,ieee international workshop,1,0.624294,2010-2014,2010-2014
124040,risk based,4,0.581190,2015-2017,2005-2009
107111,operational profile,5,0.700898,1995-1999,2005-2009
195616,ultra high density,2,0.748598,2005-2009,2010-2014
166948,k2,2,0.850586,2015-2017,2010-2014
...,...,...,...,...,...
197445,objective image quality assessment,1,0.738843,2010-2014,2010-2014
234118,cellular automaton based,1,0.544212,2010-2014,2010-2014
25754,shared memory,8,0.940638,2000-2004,2000-2004
119251,intrusion detection system,4,0.616531,2010-2014,2005-2009


In [82]:
# How does the accuracy compare to just guessing the most common?
X_test['Year'].value_counts()

2010-2014    26820
2005-2009    19170
2015-2017    15194
2000-2004     8514
1995-1999     3855
1990-1994     1591
1985-1989      545
1980-1984      166
1975-1979       75
1970-1974       37
1965-1969       12
1960-1964        2
Name: Year, dtype: int64

In [83]:
(y_test == '2010-2014').mean()

0.3529829825877522

# Refined model

In [84]:
# Potential other classifiers to use
# K nearest neighbor
# Naive Bayes
# Linear Discriminant
# Support Vector Machines