In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import Levenshtein as Lv
import time
import re
from glob import glob

import numpy as np
import json
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import Binarizer
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Loading in AutoPhrase and phrasal segmentation results dataframes

In [2]:
# AutoPhrase results dataframe
fp_phrases = '../results/dblp-v10-grouped/phrases.csv'
phrases = pd.read_csv(fp_phrases, index_col=0)

In [3]:
# Processed phrasal segmentation results dataframe
infolder = '../results/dblp-v10-grouped'
subfolders = glob(infolder + '/*.csv')
subfolders = list(filter(lambda x: 'segmented' in x, subfolders))
seg = pd.DataFrame(columns=['Phrases', 'Year', 'Num Phrases'])
for fp in subfolders:
    df = pd.read_csv(fp, index_col=0)
    df = df.dropna()
    df['Num Phrases'] = df.apply(lambda x: len(x['Phrases'].split(',')), axis=1)
    #df = df.drop('Phrases', axis=1)
    seg = seg.append(df, ignore_index=True)
seg = seg.dropna()

In [4]:
# Maybe add code for creating counts dictionary if it can be used with the model

## Integrating phrase quality with phrasal segmentation results

In [5]:
seg['Phrases'] = seg['Phrases'].map(lambda x: x.split(','))

In [6]:
seg

Unnamed: 0,Phrases,Year,Num Phrases
0,"[paper, wheatstone, bridge, tangent, triangle,...",1950-1959,14
1,"[numerical integration, differential equations...",1950-1959,6
2,[fur],1950-1959,1
3,"[computing, computing, amplifier, high, amplif...",1950-1959,8
4,"[operations research, journal, operations rese...",1950-1959,5
...,...,...,...
2548126,"[research, fi, indoor, location-based service ...",2015-2017,44
2548127,"[icts, eliminating, gender, entrepreneurship, ...",2015-2017,18
2548128,"[infinite horizon, inventory, model, general, ...",2015-2017,12
2548129,"[infrared, technology, deep-space, radiation, ...",2015-2017,51


In [7]:
def find_qualities(x):
    """
    Helper function to process the segmentation.csv files
    Only keeps quality phrases (multi >= 0.5, single >= 0.8)
    Obtains the phrase quality of each phrase by matching with phrases.csv
    """
    x = x['Phrases']
    out_phrases = []
    out_quality = []
    for phrase in x:
        # Phrase will not show up in the phrases df if the quality is too low
        # We only kept multi >= 0.5 and single >= 0.8
        # NOTE: Potential issue with phrase having dashes in seg when they don't in phrases
        #       (i.e. user-controlled vs. user controlled)
        match = phrases[phrases['Phrase'] == phrase]
        # NOTE: Doesn't do a year match for the phrase quality
        if len(match) == 0:
            continue
        else:
            out_phrases.append(phrase)
            out_quality.append(match['Phrase Quality'].values[0])
    return out_phrases, out_quality

In [8]:
def filter_phrases(x):
    """
    Helper function for the segmentation results to remove any low quality phrases
    """
    x = x['Phrases']
    out = []
    for phrase in x:
        match = phrases[phrases['Phrase'] == phrase]
        if len(match) == 0:
            continue
        else:
            out.append(phrase)
    return out

In [9]:
seg['Year'].value_counts()

2010-2014    872492
2005-2009    651846
2015-2017    430039
2000-2004    309663
1995-1999    144373
1990-1994     71394
1985-1989     32741
1980-1984     16836
1975-1979      9377
1970-1974      5541
1965-1969      2642
1960-1964       853
1950-1959       334
Name: Year, dtype: int64

In [10]:
seg.sample(frac=0.1)['Year'].value_counts()

2010-2014    87153
2005-2009    65341
2015-2017    43003
2000-2004    30905
1995-1999    14397
1990-1994     7152
1985-1989     3268
1980-1984     1656
1975-1979      981
1970-1974      580
1965-1969      251
1960-1964       94
1950-1959       32
Name: Year, dtype: int64

In [11]:
seg_normal = pd.DataFrame(columns=['Phrase', 'Year'])

In [12]:
#seg.apply(filter_phrases, axis=1)

In [13]:
#zip(*seg.apply(find_qualities, axis=1))

In [14]:
# The segmentation phrases have some phrases with dashes like "user-controlled"
phrases[phrases['Phrase'] == 'user controlled']

Unnamed: 0,Phrase Quality,Phrase,Year,Num Words
23440,0.557913,user controlled,1995-1999,2
45754,0.716653,user controlled,2000-2004,2
93921,0.792673,user controlled,2005-2009,2
195600,0.748656,user controlled,2010-2014,2
281156,0.715213,user controlled,2015-2017,2


In [15]:
re.sub(r'[^A-Za-z0-9- ]+', '', 'user-controlled')

'user-controlled'

In [16]:
# Need to replace dashes (and potentially other chars) with a space
re.sub(r'[-]+', ' ', 'user-controlled')

'user controlled'

# Analysis of AutoPhrase results

In [17]:
fp_uniquebyyear = '../results/dblp-v10/phrases.csv'
#fp_unique = '../results/dblp-v10-phrases-unique.csv'

In [18]:
# Start from year 1968 - years before had too little training data
# We only kept multi-word phrases above 0.6 and single-word above 0.8

In [19]:
# Contains the phrases unique overall (no duplicates)
# un_all = pd.read_csv(fp_unique, index_col=0)
# un_all = un_all[un_all['Year'] >= 1968]
# un_all

In [20]:
# Contains the phrases unique by year (there can be duplicates across years)
uby = pd.read_csv(fp_uniquebyyear, index_col=0)
uby = uby[uby['Year'] >= 1968]
uby = uby.dropna()
uby

Unnamed: 0,Phrase Quality,Phrase,Year
20,0.946833,context free,1968
21,0.890167,time sharing,1968
22,0.909000,context free,1969
23,0.896000,time sharing,1969
24,0.993000,context free,1970
...,...,...,...
238718,0.600501,practical implementation,2017
238719,0.600363,next generation,2017
238720,0.600185,deep convolutional neural,2017
238721,0.600087,network nodes,2017


In [21]:
# Only keeps the phrases that show up multiple times across years (so we can look for trends)
uby_dups = uby.copy()
uby_dups = uby_dups[uby_dups['Phrase'].duplicated(keep=False)]
uby_dups

Unnamed: 0,Phrase Quality,Phrase,Year
20,0.946833,context free,1968
21,0.890167,time sharing,1968
22,0.909000,context free,1969
23,0.896000,time sharing,1969
24,0.993000,context free,1970
...,...,...,...
238718,0.600501,practical implementation,2017
238719,0.600363,next generation,2017
238720,0.600185,deep convolutional neural,2017
238721,0.600087,network nodes,2017


In [22]:
uby_dups['Phrase'].value_counts()[:10]

context free             46
high level               42
programming language     42
natural language         41
pattern recognition      41
programming languages    41
data structures          41
sufficient conditions    39
data structure           39
problem solving          39
Name: Phrase, dtype: int64

In [23]:
uby_dups[uby_dups['Phrase'] == 'image processing']['Year'].values

array([1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991,
       1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003,
       2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014,
       2015, 2016, 2017], dtype=int64)

# Phrase matching/similarity for input papers (title + abstract)

In [24]:
# Given an input paper (title + abstract), extract the phrases within it and return the
# similar phrases found in the AutoPhrase results.
# Can use Levenshtein distance to find similar strings, or just use direct phrase matching

In [25]:
test_fp = '../data/arxiv/csv/2016.csv'

In [26]:
test_data = pd.read_csv(test_fp)
test_data.head()

Unnamed: 0,Title,Abstract
0,Sequential Short-Text Classification with Recu...,Recent approaches based on artificial neural n...
1,Multiresolution Recurrent Neural Networks An A...,We introduce the multiresolution recurrent neu...
2,Document Image Coding and Clustering for Scrip...,The paper introduces a new method for discrimi...
3,Tutorial on Answering Questions about Images w...,Together with the development of more accurate...
4,Building Machines That Learn and Think Like Pe...,Recent progress in artificial intelligence AI ...


In [27]:
test_data['Title'][0] + ' ' + test_data['Abstract'][0]

'Sequential Short-Text Classification with Recurrent and Convolutional  Neural Networks Recent approaches based on artificial neural networks ANNs have shownpromising results for short-text classification However many short textsoccur in sequences eg sentences in a document or utterances in a dialogand most existing ANN-based systems do not leverage the preceding short textswhen classifying a subsequent one In this work we present a model based onrecurrent neural networks and convolutional neural networks that incorporatesthe preceding short texts Our model achieves state-of-the-art results on threedifferent datasets for dialog act prediction'

In [28]:
uby

Unnamed: 0,Phrase Quality,Phrase,Year
20,0.946833,context free,1968
21,0.890167,time sharing,1968
22,0.909000,context free,1969
23,0.896000,time sharing,1969
24,0.993000,context free,1970
...,...,...,...
238718,0.600501,practical implementation,2017
238719,0.600363,next generation,2017
238720,0.600185,deep convolutional neural,2017
238721,0.600087,network nodes,2017


In [29]:
input_phrase = 'convolutional neural networks'

In [30]:
valid_ix = uby.apply(lambda x: x['Phrase'][0] == 'c' if isinstance(x['Phrase'], str) else False, axis=1)
valid_ix

20         True
21        False
22         True
23        False
24         True
          ...  
238718    False
238719    False
238720    False
238721    False
238722    False
Length: 238701, dtype: bool

In [31]:
unique_phrases = uby[valid_ix]['Phrase'].unique()
unique_phrases

array(['context free', 'computer science', 'computer graphics', ...,
       'complete characterization', 'co occurrence matrix',
       'computational approach'], dtype=object)

In [32]:
candidate = ''
dist = float('inf')
for phrase in unique_phrases:
    diff = Lv.distance(input_phrase, phrase)
    if diff < dist:
        candidate = phrase
        dist = diff

In [33]:
candidate

'convolutional neural networks'

In [34]:
dist

0

In [35]:
uby[uby['Phrase'] == 'convolutional neural networks']

Unnamed: 0,Phrase Quality,Phrase,Year
136900,0.865809,convolutional neural networks,2012
152925,0.915629,convolutional neural networks,2013
172010,0.937014,convolutional neural networks,2014
190695,0.931728,convolutional neural networks,2015
212473,0.917273,convolutional neural networks,2016
233574,0.904261,convolutional neural networks,2017


In [36]:
# Same approach, but keeping track of all candidates this time

In [37]:
candidates = []
for phrase in unique_phrases:
    dist = Lv.distance(input_phrase, phrase)
    candidates.append((dist, phrase))

In [38]:
candidates.sort()

In [39]:
candidates[:10]

[(0, 'convolutional neural networks'),
 (1, 'convolutional neural network'),
 (3, 'convolution neural network'),
 (4, 'convolutional neural network cnn'),
 (4, 'convolutional neural networks cnn'),
 (5, 'convolutional neural networks cnns'),
 (7, 'convolutional networks'),
 (8, 'convolutional network'),
 (9, 'cellular neural networks'),
 (9, 'chaotic neural networks')]

In [40]:
# Using df.apply so we can look at all phrases, not just phrases that start with the same letter

In [41]:
uby_test = uby.copy()
uby_test['Dist'] = uby_test.apply(lambda x: Lv.distance(input_phrase, x['Phrase']) if isinstance(x['Phrase'], str) else float('inf'), axis=1)
uby_test

Unnamed: 0,Phrase Quality,Phrase,Year,Dist
20,0.946833,context free,1968,22
21,0.890167,time sharing,1968,24
22,0.909000,context free,1969,22
23,0.896000,time sharing,1969,24
24,0.993000,context free,1970,22
...,...,...,...,...
238718,0.600501,practical implementation,2017,22
238719,0.600363,next generation,2017,21
238720,0.600185,deep convolutional neural,2017,14
238721,0.600087,network nodes,2017,21


In [42]:
uby_test.sort_values('Dist')[:10]

Unnamed: 0,Phrase Quality,Phrase,Year,Dist
212473,0.917273,convolutional neural networks,2016,0
233574,0.904261,convolutional neural networks,2017,0
152925,0.915629,convolutional neural networks,2013,0
190695,0.931728,convolutional neural networks,2015,0
136900,0.865809,convolutional neural networks,2012,0
172010,0.937014,convolutional neural networks,2014,0
190443,0.93688,convolutional neural network,2015,1
174069,0.898522,convolutional neural network,2014,1
212157,0.922546,convolutional neural network,2016,1
153942,0.900172,convolutional neural network,2013,1


In [43]:
# Testing on the unique overall df

In [44]:
# un_all_test = un_all.copy()
# un_all_test['Dist'] = un_all_test.apply(lambda x: Lv.distance(input_phrase, x['Phrase']) if isinstance(x['Phrase'], str) else float('inf'), axis=1)
# un_all_test.head()

In [45]:
#un_all_test.sort_values('Dist')[:20]

In [46]:
# Consolidating results
# Can start with the most common phrases and change phrases that are close enough (distance <= 5?)
uby_counts = uby.groupby('Phrase').size()

In [47]:
# num_years tells us how many years the phrase has shown up in
uby['num_years'] = uby.apply(lambda x: uby_counts[x['Phrase']], axis=1)

In [48]:
uby.head()

Unnamed: 0,Phrase Quality,Phrase,Year,num_years
20,0.946833,context free,1968,46
21,0.890167,time sharing,1968,2
22,0.909,context free,1969,46
23,0.896,time sharing,1969,2
24,0.993,context free,1970,46


# Model generation

In [49]:
# x: Phrase, Phrase Quality, num_years
# Phrase needs to be one hot encoded
# y: Only the year (may need to use the unique overall dataframe?)

# For phrase quality - standard scaler by year?
# For num_years - normalize overall

In [50]:
from sklearn.preprocessing import StandardScaler
std_pipe = Pipeline([('scale', StandardScaler())])
ohe_pipe = Pipeline([('one-hot', OneHotEncoder(handle_unknown='ignore'))])
ct = ColumnTransformer(transformers=[('ohe', ohe_pipe, ['Phrase']),
                                     ('scale', std_pipe, ['num_years']),
                                    ('keep', 'passthrough', ['Phrase Quality'])])
pl = Pipeline([('transform', ct), ('classifier', DecisionTreeClassifier())])

In [51]:
# Set random seed
X_train, X_test, y_train, y_test = train_test_split(uby[['Phrase', 'num_years', 'Phrase Quality']],
                                                    uby['Year'],
                                                    random_state=1)

In [52]:
# _ = pl.fit(X_train, y_train)

In [53]:
# Mean accuracy - 8% accurate
# pl.score(X_test, y_test)

In [54]:
# X_test['Predicted Year'] = pl.predict(X_test)
# X_test['Year'] = y_test
# X_test['Abs Year Diff'] = abs(X_test['Year'] - X_test['Predicted Year'])

In [55]:
#X_test['Abs Year Diff'].mean()

In [56]:
# THINGS TO TEST:
# Using the unique by year dataframe, then only keeping one instance of duplicate phrases
# but replace the year with the average (or median) of the years
# Normalizing the numeric features

# Try using phrasal segmentation model to run on a single paper title + abstract
# Or a single paper full paper text

# Try grouping by papers by a range of years (maybe 5)

In [57]:
#X_train[X_train['Phrase']=='convolutional neural networks']

In [58]:
# test = pd.DataFrame([[1.0, 'convolutional neural networks', 2005, 6]], columns=['Phrase Quality', 'Phrase', 'Year', 'num_years'])
# test

In [59]:
# pl.predict(test[['Phrase', 'num_years', 'Phrase Quality']])

In [60]:
uby[uby['Phrase'] == 'convolutional neural networks']

Unnamed: 0,Phrase Quality,Phrase,Year,num_years
136900,0.865809,convolutional neural networks,2012,6
152925,0.915629,convolutional neural networks,2013,6
172010,0.937014,convolutional neural networks,2014,6
190695,0.931728,convolutional neural networks,2015,6
212473,0.917273,convolutional neural networks,2016,6
233574,0.904261,convolutional neural networks,2017,6


# Baseline model testing with grouped phrases

In [61]:
from sklearn.preprocessing import StandardScaler

In [62]:
df = pd.read_csv('../results/dblp-v10-grouped/phrases.csv', index_col=0)
df = df[4:]
df = df.dropna()

In [63]:
# num_years is slightly different since we grouped years now
phr_counts = df.groupby('Phrase').size()
df['num_years'] = df.apply(lambda x: phr_counts[x['Phrase']], axis=1)

In [64]:
df.head()

Unnamed: 0,Phrase Quality,Phrase,Year,Num Words,num_years
4,0.981,tunnel diode,1960-1964,2,1
5,0.9285,differential equations,1960-1964,2,12
6,0.897,high speed,1960-1964,2,12
7,0.884,data processing,1960-1964,2,12
8,0.55575,per cent,1960-1964,2,5


In [65]:
std_pipe = Pipeline([('scale', StandardScaler())])
ohe_pipe = Pipeline([('one-hot', OneHotEncoder(handle_unknown='ignore'))])
ct = ColumnTransformer(transformers=[('ohe', ohe_pipe, ['Phrase']),
                                     ('scale', std_pipe, ['num_years']),
                                    ('keep', 'passthrough', ['Phrase Quality'])])
pl = Pipeline([('transform', ct), ('classifier', DecisionTreeClassifier())])

In [66]:
X_train, X_test, y_train, y_test = train_test_split(df[['Phrase', 'num_years', 'Phrase Quality']],
                                                    df['Year'],
                                                    random_state=1)

In [67]:
# _ = pl.fit(X_train, y_train)

In [68]:
# 42.5% accuracy
# pl.score(X_test, y_test)

In [69]:
# X_test['Predicted Year'] = pl.predict(X_test)
# X_test['Year'] = y_test

In [70]:
# How does the accuracy compare to just guessing the most common?
#X_test['Year'].value_counts()

In [71]:
#(y_test == '2010-2014').mean()

# Refined model - only using grouped phrases

In [72]:
# Potential other classifiers to use
# K nearest neighbor
# Naive Bayes
# Linear Discriminant
# Support Vector Machines

In [96]:
df = pd.read_csv('../results/dblp-v10-grouped/phrases.csv', index_col=0)

In [97]:
# num_years is slightly different since we grouped years now
phr_counts = df.groupby('Phrase').size()
df['num_years'] = df.apply(lambda x: phr_counts[x['Phrase']], axis=1)

In [98]:
df

Unnamed: 0,Phrase Quality,Phrase,Year,Num Words,num_years
0,0.991500,operations research,1950-1959,2,10
1,0.650500,operations research society of america,1950-1959,5,1
2,0.573500,high speed,1950-1959,2,13
3,0.525500,operations research society,1950-1959,3,1
4,0.981000,tunnel diode,1960-1964,2,1
...,...,...,...,...,...
303923,0.500036,target sites,2015-2017,2,3
303924,0.500033,biological information,2015-2017,2,4
303925,0.500027,non cooperative game,2015-2017,3,3
303926,0.500012,coding technique,2015-2017,2,6


In [99]:
df['Num Words'].value_counts()

2    204289
3     45174
1     39275
4     11557
5      2772
6       861
Name: Num Words, dtype: int64

In [100]:
X_train, X_test, y_train, y_test = train_test_split(df[['Phrase', 'num_years', 'Phrase Quality', 'Num Words']],
                                                    df['Year'])
                                                    #random_state=1)

In [101]:
from sklearn.ensemble import AdaBoostClassifier

In [116]:
std_pipe = Pipeline([('scale', StandardScaler())])
ohe_pipe = Pipeline([('one-hot', OneHotEncoder(handle_unknown='ignore'))])
ct = ColumnTransformer(transformers=[('ohe', ohe_pipe, ['Phrase']),
                                     ('scale', std_pipe, ['num_years', 'Num Words', 'Phrase Quality']),
                                    ('keep', 'passthrough', ['Phrase Quality'])])
pl = Pipeline([('transform', ct), ('classifier', AdaBoostClassifier(n_estimators=100, learning_rate=1.1))])

In [117]:
pl.fit(X_train, y_train)

Pipeline(steps=[('transform',
                 ColumnTransformer(transformers=[('ohe',
                                                  Pipeline(steps=[('one-hot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['Phrase']),
                                                 ('scale',
                                                  Pipeline(steps=[('scale',
                                                                   StandardScaler())]),
                                                  ['num_years', 'Num Words',
                                                   'Phrase Quality']),
                                                 ('keep', 'passthrough',
                                                  ['Phrase Quality'])])),
                ('classifier',
                 AdaBoostClassifier(learning_rate=1.1, n_estimators=100))])

In [118]:
# 0.339 with DecisionTreeClassifier
# RandomForest took too long to train
# 0.353 with AdaBoostClassifier
# 0.356 with n_estimators 80, learning_rate 1.1
pl.score(X_test, y_test)

0.3563080729646495

In [84]:
# Accuracy to beat: 0.353 (if we just guessed the most common year label)
(y_test == '2010-2014').mean()

0.3530704640572767

## Testing with new segmentation results - may not be very effective

In [85]:
# Processes the new segmentation csvs (high quality phrases, no duplicates)
infolder = '../results/test'
subfolders = glob(infolder + '/*.csv')
#subfolders = list(filter(lambda x: 'segmented' in x, subfolders))
data = pd.DataFrame(columns=['Phrases', 'Year Range'])
for fp in subfolders:
    df = pd.read_csv(fp, index_col=0)
    df = df.dropna()
    #df['Num Phrases'] = df.apply(lambda x: len(x['Phrases'].split(',')), axis=1)
    #df = df.drop('Phrases', axis=1)
    data = data.append(df, ignore_index=True)
data = data.dropna()
data['Phrases'] = data['Phrases'].map(lambda x: x.split(','))

In [86]:
data['Year Range'].value_counts()

2010-2014    872482
2005-2009    651795
2015-2017    429999
2000-2004    309058
1995-1999    143710
1990-1994     70637
1985-1989     31911
1980-1984     16063
1975-1979      8794
1970-1974      5133
1965-1969      2507
1960-1964       807
1950-1959       323
Name: Year Range, dtype: int64

In [87]:
data

Unnamed: 0,Phrases,Year Range
0,"[triangle, wheatstone, bridge, tangent]",1950-1959
1,"[computable, differential equations, numerical...",1950-1959
2,[fur],1950-1959
3,"[amplifiers, electronic, computing]",1950-1959
4,"[society, operations research, america, journal]",1950-1959
...,...,...
2543214,"[research, computational burden, inertial navi...",2015-2017
2543215,"[media, women, innovation, eliminating, gender...",2015-2017
2543216,"[batch, inventory, dynamic pricing, infinite h...",2015-2017
2543217,"[poly, imager, earth, khz, m 1, clock, cmos, m...",2015-2017


In [88]:
data[data['Year Range'] == '2010-2014']

Unnamed: 0,Phrases,Year Range
1240738,"[purpose, as, net, basic, an empirical study, ...",2010-2014
1240739,"[research, as, specialist, reconstruction, tes...",2010-2014
1240740,"[as, visual, analyzing, f, identifying, backgr...",2010-2014
1240741,"[optimal, as, recent years, pedestrian navigat...",2010-2014
1240742,"[as, master secret, composite order, doi, stan...",2010-2014
...,...,...
2113215,"[as, multi, comparison, china, understanding, ...",2010-2014
2113216,"[as, simultaneous, reactive, wide spectrum, ne...",2010-2014
2113217,"[schwartz, as, condorcet, visualization, proto...",2010-2014
2113218,"[axiomatization, expected regret, decision mak...",2010-2014


In [89]:
phrases[phrases['Phrase']=='as']

Unnamed: 0,Phrase Quality,Phrase,Year,Num Words
178384,0.818842,as,2010-2014,1


In [90]:
phrases

Unnamed: 0,Phrase Quality,Phrase,Year,Num Words
0,0.991500,operations research,1950-1959,2
1,0.650500,operations research society of america,1950-1959,5
2,0.573500,high speed,1950-1959,2
3,0.525500,operations research society,1950-1959,3
4,0.981000,tunnel diode,1960-1964,2
...,...,...,...,...
303923,0.500036,target sites,2015-2017,2
303924,0.500033,biological information,2015-2017,2
303925,0.500027,non cooperative game,2015-2017,3
303926,0.500012,coding technique,2015-2017,2


In [91]:
X_train, X_test, y_train, y_test = train_test_split(data[['Phrases']],
                                                    data['Year Range'],
                                                    random_state=1)

In [92]:
ohe = OneHotEncoder()

In [93]:
X_test['Phrases']

2223922    [host language, multiple target, ruby, compile...
1851609    [fading multiple access, full csi, eavesdroppe...
1311483    [adapting, imager, as, lung cancer, localizati...
1555840    [prediction, as, a case study, potential benef...
553527     [innovation, nyquist rate, digital, nyquist, p...
                                 ...                        
2484129    [018m cmos technology, capacitive, rms, ota, d...
817212     [access edca, ieee, ieee 80211e, quality of se...
2175372    [object appearance, tracker, inspired approach...
971005     [bridge, analyzing, learning, data gathering, ...
1696727    [video, perfectly matched, previously proposed...
Name: Phrases, Length: 635805, dtype: object

In [94]:
ohe.fit_transform(X_test['Phrases'])

ValueError: Expected 2D array, got 1D array instead:
array=[list(['host language', 'multiple target', 'ruby', 'compiler', 'execution environments', 'compilation', 'virtual machine', 'python', 'task migration', 'ideally', 'javascript', 'java', 'web applications', 'cultural', 'execution speed', 'php', 'orders of magnitude faster'])
 list(['fading multiple access', 'full csi', 'eavesdropper', 'onoff', 'wt', 'channel state information csi', 'wire', 'employing', 'intended receiver', 'csi', 'if', 'eve', 'snr', 'do', 'eavesdropper eve', 'easily computable', 'power control', 'optimal power control', 'sum rate', 'next', 'achievable secrecy', 'cooperative jamming', 'mac'])
 list(['adapting', 'imager', 'as', 'lung cancer', 'localization', 'lung', 'gpu', 'tracking', 'image reconstruction', 'tumor', 'real time', 'radiotherapy', 'principal component analysis', 'nvidia', 'reconstructed image', 'c1060', 'pca', 'first', 'clinical application', 'reference image', 'digital', 'projection based', 'vector fields', 'mm', 'phantom', 'reconstructing', 'accurate', 'patient data'])
 ...
 list(['object appearance', 'tracker', 'inspired approach', 'object tracking', 'ransac', 'cognitive psychology', 'extensively evaluated', 'keypoint', 'correlation filter', 'dual', 'art', 'partial occlusion', 'tracked object', 'additional information'])
 list(['bridge', 'analyzing', 'learning', 'data gathering', 'proposed approach', 'governor', 'situated', 'network interfaces', 'home automation', 'power line communication', 'home gateway', 'home', 'hierarchical', 'proposed framework', 'control protocol', 'top', 'upnp', 'global', 'multi'])
 list(['video', 'perfectly matched', 'previously proposed', 'as', 'visual', 'identification', 'i', 'cognizant', 'user study', 'video streaming', 'so', 'visually salient', 'forward error correction', 'loop', 'error concealment', 'lr', 'if', 'side information', 'roi', 'overall', 'concealed', 'streaming video', 'replacement', 'first', 'fec', 'increased dramatically', 'uep', 'regularization', 'low saliency', 'concealment', 'unequal error protection uep scheme', 'block', 'subjective quality', 'psnr', 'saliency', 'ii', 'hr'])].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
mlb = MultiLabelBinarizer()

In [None]:
mlb.fit(X_test['Phrases'])

MultiLabelBinarizer()

In [None]:
mlb.classes_

array(['0 1', '0 1n', '000 000', ..., 'zy', 'zynq', 'zynq soc'],
      dtype=object)

In [None]:
len(mlb.classes_)

95059

In [None]:
mlb.fit_transform(X_test['Phrases'])

MemoryError: Unable to allocate 225. GiB for an array with shape (635805, 95059) and data type int32

In [None]:
X_test['Phrases']

2223922    [host language, multiple target, ruby, compile...
1851609    [fading multiple access, full csi, eavesdroppe...
1311483    [adapting, imager, as, lung cancer, localizati...
1555840    [prediction, as, a case study, potential benef...
553527     [innovation, nyquist rate, digital, nyquist, p...
                                 ...                        
2484129    [018m cmos technology, capacitive, rms, ota, d...
817212     [access edca, ieee, ieee 80211e, quality of se...
2175372    [object appearance, tracker, inspired approach...
971005     [bridge, analyzing, learning, data gathering, ...
1696727    [video, perfectly matched, previously proposed...
Name: Phrases, Length: 635805, dtype: object

In [None]:
std_pipe = Pipeline([('scale', StandardScaler())])
ohe_pipe = Pipeline([('one-hot', OneHotEncoder(handle_unknown='ignore'))])
ct = ColumnTransformer(transformers=[('ohe', ohe_pipe, ['Phrases']),
                                     ('scale', std_pipe, ['Num Phrases'])
                                    ])
pl = Pipeline([('transform', ct), ('classifier', DecisionTreeClassifier())])

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelBinarizer

In [None]:
#_ = pl.fit(X_train, y_train)

In [None]:
#pl.score(X_test, y_test)