In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import Levenshtein as Lv
import time
import re

import numpy as np
import json
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Looking at segmentation + AutoPhrase results for grouped years

In [2]:
grouped_fp = '../results/dblp-v10-grouped'
seg_fp = grouped_fp + '/2000-2004_segmented.csv'
auto_fp = grouped_fp + '/phrases.csv'

In [3]:
phrases = pd.read_csv(auto_fp, index_col=0)
#phrases = phrases[phrases['Year']=='2000-2004']
phrases

Unnamed: 0,Phrase Quality,Phrase,Year,Num Words
0,0.991500,operations research,1950-1959,2
1,0.650500,operations research society of america,1950-1959,5
2,0.573500,high speed,1950-1959,2
3,0.525500,operations research society,1950-1959,3
4,0.981000,tunnel diode,1960-1964,2
...,...,...,...,...
303923,0.500036,target sites,2015-2017,2
303924,0.500033,biological information,2015-2017,2
303925,0.500027,non cooperative game,2015-2017,3
303926,0.500012,coding technique,2015-2017,2


In [4]:
phrases['Year'].unique()

array(['1950-1959', '1960-1964', '1965-1969', '1970-1974', '1975-1979',
       '1980-1984', '1985-1989', '1990-1994', '1995-1999', '2000-2004',
       '2005-2009', '2010-2014', '2015-2017'], dtype=object)

In [5]:
seg = pd.read_csv(seg_fp, index_col=0)
seg = seg.dropna() # Need to drop null values to prevent issues with string processing
seg

Unnamed: 0,Phrases,Year
0,"hamiltonian,wave,periodic boundary conditions,...",2000-2004
1,"embedded systems,memory,paper,architecture,dat...",2000-2004
2,"language modeling,information,single,word orde...",2000-2004
3,"mirror,software,perl,user-controlled,mirror,we...",2000-2004
4,"covariance,regression,reproducing kernel hilbe...",2000-2004
...,...,...
309699,"prototype,walking robot,ship,legged robot,acti...",2000-2004
309700,"paper,parallel program,in grid,order,user inte...",2000-2004
309701,"search-based,cbr,ca,ca,search process,cbr,ca,g...",2000-2004
309702,"dram,production,tests,single,single,memory,tes...",2000-2004


In [6]:
seg.loc[0]['Phrases']

'hamiltonian,wave,periodic boundary conditions,paper,nonlinear schrodinger,symplectic,symplectic,data,multi-phase,spectral methods'

In [7]:
def find_qualities(x):
    x = x['Phrases'].split(',')
    out_phrases = []
    out_quality = []
    for phrase in x:
        # Phrase will not show up in the df if the quality is too low
        # We only kept multi >= 0.5 and single >= 0.8
        # NOTE: Potential issue with phrase having dashes in seg when they don't in phrases
        #       (i.e. user-controlled vs. user controlled)
        match = phrases[phrases['Phrase'] == phrase]
        if len(match) == 0:
            continue
        else:
            out_phrases.append(phrase)
            out_quality.append(match['Phrase Quality'].values[0])
    return out_phrases, out_quality

In [8]:
test = seg.copy()
test = seg.loc[1:10]

In [9]:
test['Phrases'], test['Phrase Qualities'] = zip(*test.apply(find_qualities, axis=1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Phrases'], test['Phrase Qualities'] = zip(*test.apply(find_qualities, axis=1))


In [10]:
test

Unnamed: 0,Phrases,Year,Phrase Qualities
1,"[embedded systems, control flow, program trans...",2000-2004,"[0.751535651, 0.9272791598, 0.8154624019, 0.68..."
2,"[language modeling, word order, language model...",2000-2004,"[0.7877440842, 0.8562616285, 0.7877440842, 0.8..."
3,"[mirror, perl, mirror, web cache, http, mirror...",2000-2004,"[0.8339330792, 0.824974653, 0.8339330792, 0.72..."
4,"[covariance, regression, reproducing kernel hi...",2000-2004,"[0.8073973255, 0.8220048591, 0.9075834826, 0.6..."
5,"[modular exponentiation, cornerstone, rsa, mod...",2000-2004,"[0.8909553133, 0.8050896539, 0.807428333, 0.74..."
6,"[question answering, ir, semantic, semantic, n...",2000-2004,"[0.9793318108, 0.8065003229, 0.8092162159, 0.8..."
7,"[todays web, web services, computers, markup, ...",2000-2004,"[0.5229487085, 0.8034459036, 0.8001758076, 0.8..."
8,"[mediaeval, logic, aristotelian, negation, neg...",2000-2004,"[0.8598218899, 0.8114334306, 0.8674139965, 0.8..."
9,"[intelligent systems, previous attempts, searc...",2000-2004,"[0.7989019338, 0.5292607943, 0.6592930534, 0.8..."
10,"[noise robust, speech recognition, aurora, pre...",2000-2004,"[0.6364751042, 0.9559212685, 0.8590855236, 0.7..."


In [11]:
test.loc[1]['Phrases']

['embedded systems',
 'control flow',
 'program transformations',
 'embedded applications',
 'library',
 'code size']

In [12]:
test.loc[1]['Phrase Qualities']

[0.751535651,
 0.9272791598,
 0.8154624019,
 0.6875766847,
 0.8212738409,
 0.7391568024]

In [13]:
seg

Unnamed: 0,Phrases,Year
0,"hamiltonian,wave,periodic boundary conditions,...",2000-2004
1,"[embedded systems, control flow, program trans...",2000-2004
2,"[language modeling, word order, language model...",2000-2004
3,"[mirror, perl, mirror, web cache, http, mirror...",2000-2004
4,"[covariance, regression, reproducing kernel hi...",2000-2004
...,...,...
309699,"prototype,walking robot,ship,legged robot,acti...",2000-2004
309700,"paper,parallel program,in grid,order,user inte...",2000-2004
309701,"search-based,cbr,ca,ca,search process,cbr,ca,g...",2000-2004
309702,"dram,production,tests,single,single,memory,tes...",2000-2004


In [14]:
phrases[phrases['Phrase'] == 'user controlled']

Unnamed: 0,Phrase Quality,Phrase,Year,Num Words
23440,0.557913,user controlled,1995-1999,2
45754,0.716653,user controlled,2000-2004,2
93921,0.792673,user controlled,2005-2009,2
195600,0.748656,user controlled,2010-2014,2
281156,0.715213,user controlled,2015-2017,2


In [15]:
re.sub(r'[^A-Za-z0-9- ]+', '', 'user-controlled')

'user-controlled'

In [16]:
re.sub(r'[-]+', ' ', 'user-controlled')

'user controlled'

# Analysis of AutoPhrase results

In [17]:
fp_uniquebyyear = '../results/dblp-v10-phrases-uniquebyyear.csv'
fp_unique = '../results/dblp-v10-phrases-unique.csv'

In [18]:
# Start from year 1968 - years before had too little training data
# We only kept multi-word phrases above 0.6 and single-word above 0.8

In [19]:
# Contains the phrases unique overall (no duplicates)
un_all = pd.read_csv(fp_unique, index_col=0)
un_all = un_all[un_all['Year'] >= 1968]
un_all

FileNotFoundError: [Errno 2] No such file or directory: '../results/dblp-v10-phrases-unique.csv'

In [None]:
# Contains the phrases unique by year (there can be duplicates across years)
uby = pd.read_csv(fp_uniquebyyear, index_col=0)
uby = uby[uby['Year'] >= 1968]
uby = uby.dropna()
uby

In [None]:
# Only keeps the phrases that show up multiple times across years (so we can look for trends)
uby_dups = uby.copy()
uby_dups = uby_dups[uby_dups['Phrase'].duplicated(keep=False)]
uby_dups

In [None]:
uby_dups['Phrase'].value_counts()[:10]

In [None]:
uby_dups[uby_dups['Phrase'] == 'image processing']['Year'].values

# Phrase matching/similarity for input papers (title + abstract)

In [None]:
# Given an input paper (title + abstract), extract the phrases within it and return the
# similar phrases found in the AutoPhrase results.
# Can use Levenshtein distance to find similar strings, or just use direct phrase matching

In [None]:
test_fp = '../data/arxiv/csv/2016.csv'

In [None]:
test_data = pd.read_csv(test_fp)
test_data.head()

In [None]:
test_data['Title'][0] + ' ' + test_data['Abstract'][0]

In [None]:
uby

In [None]:
input_phrase = 'convolutional neural networks'

In [None]:
valid_ix = uby.apply(lambda x: x['Phrase'][0] == 'c' if isinstance(x['Phrase'], str) else False, axis=1)
valid_ix

In [None]:
unique_phrases = uby[valid_ix]['Phrase'].unique()
unique_phrases

In [None]:
candidate = ''
dist = float('inf')
for phrase in unique_phrases:
    diff = Lv.distance(input_phrase, phrase)
    if diff < dist:
        candidate = phrase
        dist = diff

In [None]:
candidate

In [None]:
dist

In [None]:
uby[uby['Phrase'] == 'convolutional neural networks']

In [None]:
# Same approach, but keeping track of all candidates this time

In [None]:
candidates = []
for phrase in unique_phrases:
    dist = Lv.distance(input_phrase, phrase)
    candidates.append((dist, phrase))

In [None]:
candidates.sort()

In [None]:
candidates[:10]

In [None]:
# Using df.apply so we can look at all phrases, not just phrases that start with the same letter

In [None]:
uby_test = uby.copy()
uby_test['Dist'] = uby_test.apply(lambda x: Lv.distance(input_phrase, x['Phrase']) if isinstance(x['Phrase'], str) else float('inf'), axis=1)
uby_test

In [None]:
uby_test.sort_values('Dist')[:10]

In [None]:
# Testing on the unique overall df

In [None]:
un_all_test = un_all.copy()
un_all_test['Dist'] = un_all_test.apply(lambda x: Lv.distance(input_phrase, x['Phrase']) if isinstance(x['Phrase'], str) else float('inf'), axis=1)
un_all_test.head()

In [None]:
un_all_test.sort_values('Dist')[:20]

In [None]:
# Consolidating results
# Can start with the most common phrases and change phrases that are close enough (distance <= 5?)
counts = uby.groupby('Phrase').size()

In [None]:
# num_years tells us how many years the phrase has shown up in
uby['num_years'] = uby.apply(lambda x: counts[x['Phrase']], axis=1)

In [None]:
uby.head()

# Model generation

In [None]:
# x: Phrase, Phrase Quality, num_years
# Phrase needs to be one hot encoded
# y: Only the year (may need to use the unique overall dataframe?)

# For phrase quality - standard scaler by year?
# For num_years - normalize overall

In [None]:
from sklearn.preprocessing import StandardScaler
std_pipe = Pipeline([('scale', StandardScaler())])
ohe_pipe = Pipeline([('one-hot', OneHotEncoder(handle_unknown='ignore'))])
ct = ColumnTransformer(transformers=[('ohe', ohe_pipe, ['Phrase']),
                                     ('scale', std_pipe, ['num_years']),
                                    ('keep', 'passthrough', ['Phrase Quality'])])
pl = Pipeline([('transform', ct), ('classifier', DecisionTreeClassifier())])

In [None]:
# Set random seed
X_train, X_test, y_train, y_test = train_test_split(uby[['Phrase', 'num_years', 'Phrase Quality']],
                                                    uby['Year'],
                                                    random_state=1)

In [None]:
_ = pl.fit(X_train, y_train)

In [None]:
# Mean accuracy - 8% accurate
pl.score(X_test, y_test)

In [None]:
X_test['Predicted Year'] = pl.predict(X_test)
X_test['Year'] = y_test
X_test['Abs Year Diff'] = abs(X_test['Year'] - X_test['Predicted Year'])

In [None]:
X_test

In [None]:
X_test['Abs Year Diff'].mean()

In [None]:
# THINGS TO TEST:
# Using the unique by year dataframe, then only keeping one instance of duplicate phrases
# but replace the year with the average (or median) of the years
# Normalizing the numeric features

# Try using phrasal segmentation model to run on a single paper title + abstract
# Or a single paper full paper text

# Try grouping by papers by a range of years (maybe 5)

In [None]:
X_train[X_train['Phrase']=='convolutional neural networks']

In [None]:
test = pd.DataFrame([[1.0, 'convolutional neural networks', 2005, 6]], columns=['Phrase Quality', 'Phrase', 'Year', 'num_years'])
test

In [None]:
pl.predict(test[['Phrase', 'num_years', 'Phrase Quality']])

In [None]:
uby[uby['Phrase'] == 'convolutional neural networks']

In [None]:
test = pd.DataFrame([[1.0, 'artificial intelligence', 2005, 6]], columns=['Phrase Quality', 'Phrase', 'Year', 'num_years'])
test

In [None]:
pl.predict(test[['Phrase', 'num_years', 'Phrase Quality']])

# Baseline model testing with grouped phrases

In [None]:
df = pd.read_csv('../results/dblp-v10-grouped/dblp-v10-grouped-phrases.csv', index_col=0)
df = df[4:]
df = df.dropna()

In [None]:
# num_years is slightly different since we grouped years now
counts = df.groupby('Phrase').size()
df['num_years'] = df.apply(lambda x: counts[x['Phrase']], axis=1)

In [None]:
df.head()

In [None]:
std_pipe = Pipeline([('scale', StandardScaler())])
ohe_pipe = Pipeline([('one-hot', OneHotEncoder(handle_unknown='ignore'))])
ct = ColumnTransformer(transformers=[('ohe', ohe_pipe, ['Phrase']),
                                     ('scale', std_pipe, ['num_years']),
                                    ('keep', 'passthrough', ['Phrase Quality'])])
pl = Pipeline([('transform', ct), ('classifier', DecisionTreeClassifier())])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df[['Phrase', 'num_years', 'Phrase Quality']],
                                                    df['Year'],
                                                    random_state=1)

In [None]:
_ = pl.fit(X_train, y_train)

In [None]:
# 42.5% accuracy
pl.score(X_test, y_test)

In [None]:
X_test['Predicted Year'] = pl.predict(X_test)
X_test['Year'] = y_test

In [None]:
X_test

In [None]:
# How does the accuracy compare to just guessing the most common?
X_test['Year'].value_counts()

In [None]:
(y_test == '2010-2014').mean()

# Refined model

In [None]:
# Potential other classifiers to use
# K nearest neighbor
# Naive Bayes
# Linear Discriminant
# Support Vector Machines