In [161]:
import glob, os, re, string, itertools

from nltk.stem.porter import *
from collections import defaultdict
from nltk import word_tokenize, pos_tag, pos_tag_sents
from nltk.corpus import wordnet, stopwords
from nltk.chunk.regexp import RegexpParser
from nltk.chunk import tree2conlltags
from pandas import DataFrame
from itertools import chain
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
import numpy as np
from scipy.sparse.csr import csr_matrix

In [162]:
def clean(input_list):
    result=[]
    trash=[]
    legit=[]
    #remove unwanted character per line
    for line in input_list:
        clean=re.sub("(\.)?\n",'', line) #remove \n
        #check if line only contains 2 words, is it exist on wordnet?
        if len(word_tokenize(clean))<=2 and not wordnet.synsets(clean):
            trash.append(clean)
        elif len(word_tokenize(clean))==1:
            trash.append(clean)
        else:
            clean=re.sub("\S([\=|\+|\:|\*|\/]\S)+",'', clean) #check this
            clean=re.sub("\[([0-9]{1,2}\,?\s?)+\]",'', clean) #remove [2]
            clean=re.sub("\(([0-9]{1,2}\,?\s?)+\)",'', clean) #remove (2)
            #remove fig. 2 etc, need improvement to catch the sentence after it
            clean=re.sub("[[Ff]ig.|[Ff]igure|[Tt]ab.|[Tt]able]\s?[0-9]{1,2}",'', clean) #remove fig. 2 etc
            legit.append(clean)
        result.append(clean)
    return result

In [163]:
def load_files(path):
    raw=[]
    for file in path:
        dict_doc={'doc_id': None, 'title': None, 'abstract': None, 'full-text': None, 'candidates': None}
        file_id=os.path.basename(file).rstrip('.txt.final') #catch only file name  
        dict_doc['doc_id']=file_id
        
        source=open(file,encoding='utf-8').readlines()
        source=clean(source)
        
        ##########detect title
        beginning=re.sub("\n", "", source[0]) #retrieve title
        candidate=re.sub("\n", "", source[1]) # retrieve title candidate
        h_candidate=word_tokenize(re.sub("-",' ',candidate)) #tokenize the candidate
        
        title=[]
        name=[]
        for word in h_candidate:
            if wordnet.synsets(word): #check if title candidate exist on wordnet
                title.append(word)
            else:
                name.append(word)
            #if title>
            if len(title)>len(name): 
                newtitle=beginning+' '+candidate
            elif len(title)==len(name):
                newtitle=beginning
            else:
                newtitle=beginning

        dict_doc['title']=newtitle
        
        content=source[2:]
        ######check header, inconsistency all file
        r_intro=re.compile("^1\.?\s[A-Z]+")
        r_ref=re.compile("[0-9]{1,2}?\.?\s?R[EFERENCES|eferences]") #detect reference
        #r_header=re.compile("[0-9]{1,2}?\.?\s?[A-Z]")
        
        in_abstract=content.index('ABSTRACT')
        in_authorkey=content.index('Categories and Subject Descriptors')
        
        list_intro=[i for i, item in enumerate(content) if re.search(r_intro, item)]
        in_intro=list_intro[0]
        list_ref=[i for i, item in enumerate(content) if re.search(r_ref, item)]
        in_ref=list_ref[0]
        
        abstract=content[in_abstract+1:in_authorkey] #eliminate keyword and category
        body=content[in_intro+1:in_ref] #remove reference       
        
        list_title=[]
        list_title.append(newtitle)
        
        full_text=list(chain(list_title,abstract, body))
        dict_doc['abstract']=abstract
        dict_doc['body']=body
        dict_doc['full_text']=full_text
        
        #per sentence in a document
        raw.append(dict_doc)
    return raw

In [164]:
def extract_abstract(raw_data):
    abstracts=[' '.join(doc['abstract']) for doc in raw_data]
    return abstracts


In [165]:
def tfidf_corpus(raw_data):
    train_data=[]
    for doc in raw_data:
        #add to list and join all element in full text into a text
        train_data.append(' '.join(doc['full_text']))
    return train_data

def calculate_ngram_tfidf(corpus):
    
    #porter stemmer
    stemmer=PorterStemmer()
    '''
    class StemmedTfidfVectorizer(TfidfVectorizer):
        def build_analyzer(self):
            analyzer=super(TfidfVectorizer, self).build_analyzer()
            return lambda doc: (stemmer.stem(word) for word in analyzer(doc))
    '''
    #eliminate ngram which starts or ends from stopwords
    class NewTfidfVectorizer(TfidfVectorizer):
        def _word_ngrams(self, tokens, stop_words=None):
            # First get tokens without stop words
            tokens = super(TfidfVectorizer, self)._word_ngrams(tokens, None)
            if stop_words is not None:
                new_tokens=[]
                for token in tokens:
                    split_words = token.split(' ')
                    # Only check the first and last word for stop words
                    if split_words[0] not in stop_words and split_words[-1] not in stop_words:
                        #stem every word in token
                        token1=' '.join(list(stemmer.stem(word) for word in word_tokenize(token)))
                        new_tokens.append(token1)
                return new_tokens
            return tokens
    
    additional_stopwords = ['lrb', 'rrb', 'lcb', 'rcb', 'lsb','rsb']
    stop_words=text.ENGLISH_STOP_WORDS.union(additional_stopwords)
    
    #pattern="(?u)\\b[\\w-]+\\b"
    tfidf=NewTfidfVectorizer(ngram_range=(1,5), stop_words=stop_words,
                                token_pattern=r"(?u)\b[A-z-]+\b|'")
    
    matrix=tfidf.fit_transform(corpus)
    feature_names=tfidf.get_feature_names()

    #how to print tf-idf from https://stackoverflow.com/questions/34449127/
    #sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen
    candidates=[]
    for doc in range(0,len(corpus)):
        feature_index=matrix[doc,:].nonzero()[1]
        tfidf_doc=zip(feature_index, [matrix[doc, x] for x in feature_index])
        names_tfidf=[(w, s) for w, s in [(feature_names[i], s) for (i, s) in tfidf_doc]]
        candidates.append(names_tfidf)
    
    #this is the candidates per document
    #vocab_perdoc=tfidf.inverse_transform(matrix)
    
    return candidates


In [211]:
def calculate_term_frequency(section):
     #porter stemmer
    stemmer=PorterStemmer()
    
    #eliminate ngram which starts or ends from stopwords
    class NewCountVectorizer(CountVectorizer):
        def _word_ngrams(self, tokens, stop_words=None):
            # First get tokens without stop words
            tokens = super(CountVectorizer, self)._word_ngrams(tokens, None)
            if stop_words is not None:
                new_tokens=[]
                for token in tokens:
                    split_words = token.split(' ')
                    # Only check the first and last word for stop words
                    if split_words[0] not in stop_words and split_words[-1] not in stop_words:
                        #stem every word in token
                        token1=' '.join(list(stemmer.stem(word) for word in word_tokenize(token)))
                        new_tokens.append(token1)
                return new_tokens
            return tokens
    
    count_vect=NewCountVectorizer(ngram_range=(1,5), stop_words='english',
                                token_pattern=r"(?u)\b[A-Za-z-]+\b|'")
    
    matrix=count_vect.fit_transform(section)
    feature_names=count_vect.get_feature_names()

    #how to print tf-idf from https://stackoverflow.com/questions/34449127/
    #sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-document
    ngrams=[]
    for doc in range(0,len(section)):
        feature_index=matrix[doc,:].nonzero()[1]
        count_vect_doc=zip(feature_index, [matrix[doc, x] for x in feature_index])
        names_count_vect=[(w, s) for w, s in [(feature_names[i], s) for (i, s) in count_vect_doc]]
        ngrams.append(names_count_vect)
    
    return ngrams

In [212]:
def extract_goldkeyphrases(gold_data):
    r_plus=re.compile("^.*\+.*$")
    r_slash=re.compile("^.*\s.*\/.*$")
    gold_standard=[]
    for line in gold_data.split('\n'):
        doc=[]
        for key in line[7:].split(','):
            if re.search(r_plus, key):
                split=[]
                for element in key.split('+'):
                    doc.append(element)
            elif re.search(r_slash, key):
                split=[]
                for element in key.split('/'):
                    doc.append(element)
            else:
                doc.append(key)
        gold_standard.append(doc)
    return gold_standard

In [1]:
##FEATURE for checking how many times each candidate is in abstract
#candidates of training or testing, corpus of training or testing
def feature_is_abstract(candidates, raw_data):
    #abstracts=extract_abstract(raw_data)
    abstracts=[' '.join(doc['abstract']) for doc in raw_data]
    abstract_ngram=calculate_term_frequency(abstracts)
    
    #compare tfidf_value with value in section
    feature3=[]
    for n_doc in range(len(candidates)):
        doc=[]
        for n_feature in range(len(candidates[n_doc])):
            features_perdoc=[feature for feature, value in abstract_ngram[n_doc]]
            if candidates[n_doc][n_feature][0] not in features_perdoc:
                doc.append(0)
            else:
                doc.append(1)
        feature3.append(doc)
    return feature3[0]

In [214]:
##how many times each candidate is in title
def feature_is_title(candidates, raw_data):
    titles=[doc['title'] for doc in raw_data]
    title_ngram=calculate_term_frequency(titles)
    
    feature2=[]
    for n_doc in range(len(candidates)):
        doc=[]
        for n_feature in range(len(candidates[n_doc])):
            features_perdoc=[feature for feature, value in title_ngram[n_doc]]
            if candidates[n_doc][n_feature][0] not in features_perdoc:
                doc.append(0)
            else:
                doc.append(1)
        feature2.append(doc)
    return feature2[0]

In [215]:
def feature_candidate_length(candidates):
    feature4=[]
    for n_doc in range(len(candidates)):
        doc=[]
        for n_feature in range(len(candidates[n_doc])):
            doc.append(len(candidates[n_doc][n_feature][0]))
        feature4.append(doc)
    return feature4[0]

In [216]:
def feature_term_frequency(corpus):
    corpus_ngram=calculate_term_frequency(corpus) #save as pickle for term frequency, it can be used for counting n title or n abstract
    
    feature5=[]
    for n_doc in range(len(corpus_ngram)):
        doc=[]
        for n_feature in range(len(corpus_ngram[n_doc])):
            doc.append(corpus_ngram[n_doc][n_feature][1])
        feature5.append(doc)
    return feature5[0]

In [217]:
def feature_supervised_keyphraseness(corpus, gold_data): #make sure this is only keyphrase per document or all keyphrase compare
    corpus_ngram=calculate_term_frequency(corpus)
    
    feature6=[]
    for n_doc in range(len(corpus_ngram)):
        doc=[]
        for n_feature in range(len(corpus_ngram[n_doc])):
            gold_keyphrase=list(gold_data[n_doc])
            if corpus_ngram[n_doc][n_feature][0] not in gold_keyphrase:
                doc.append(0)
            else:
                doc.append(corpus_ngram[n_doc][n_feature][1])
        feature6.append(doc)
    return feature6[0]

In [218]:
train_directory=glob.glob('./se_txt/train/dummy/*.txt.final')
gold_standard=open('./se_txt/train/train.combined.stem.final', encoding='utf-8').read()
source=load_files(train_directory)
gold_data=extract_goldkeyphrases(gold_standard)
train_data=tfidf_corpus(source)
tfidf_ngram=calculate_ngram_tfidf(train_data)
f=feature_is_abstract(tfidf_ngram, source)
t=feature_is_title(tfidf_ngram, source)
l=feature_candidate_length(tfidf_ngram)
tf=feature_term_frequency(train_data)
sk=feature_supervised_keyphraseness(train_data, gold_data)
print("\n asbtract:", f)
print("\n title:", t)
print("\n length:", l)
print("\n tf:", tf)
print("\n supervised:", sk)


 asbtract: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [20]:
##check legitimate candidates
cands=[['hyarm', 'video', 'resourc', 'dre', 'qo', 'resourc util', 'uav', 'multimedia', 'qo requir', 'base station', 'applic', 'util', 'station', 'end receiv', 'dre multimedia'], ['tigr', 'gridway', 'grid', 'enkf', 'licens', 'job', 'ensembl', 'grid-en', 'mpi', 'reservoir', 'geolog', 'geolog model', 'eclips', 'metaschedul', 'sub-job'], ['msp', 'node', 'scan', 'node sequenc', 'sequenc', 'anchor', 'target node', 'event', 'basic msp', 'flip', 'sequence-bas', 'sequence-bas msp', 'protect band', 'band', 'figur'], ['node', 'sensor node', 'color', 'sensor', 'local', 'label', 'imag', 'imag process', 'relax', 'light', 'stardust', 'radio', 'label relax', 'connect', 'node id'], ['sensor', 'skip graph', 'proxi', 'tsar', 'skip', 'data', 'index', 'storag', 'interv', 'queri', 'interv skip', 'interv skip graph', 'flash', 'lookup', 'summari'], ['zone', 'dim', 'queri', 'node', 'event', 'code', 'gpsr', 'zone code', 'rang queri', 'insert', 'owner', 'network', 'sensor', 'perimet', 'messag'], ['node', 'messag', 'contact', 'rout', 'protocol', 'rout protocol', 'mobil', 'deliveri', 'opportunist network', 'epidem', 'opportunist', 'destin', 'network', 'contact probabl', 'deliveri ratio'], ['hiker', 'wit', 'cenwit', 'node', 'sensor', 'record', 'wit inform', 'wit record', 'leader', 'partit', 'locat', 'memori', 'inform', 'partit leader', 'access point'], ['dr vector', 'dr', 'export', 'error', 'export error', 'receiv', 'vector', 'player', 'sender', 'schedul', 'delay', 'dri', 'game', 'sent', 'base case'], ['site', 'lag', 'gs-dr-ll', 'network transmiss', 'network transmiss delay', 'inconsist', 'transmiss delay', 'remot site', 'share entiti', 'transmiss', 'state updat', 'gs-dr', 'entiti', 'delay', 'ms'], ['client', 'sand', 'server', 'appoint', 'spatial', 'data', 'client-serv', 'browser', 'internet browser', 'spatial data', 'sand internet', 'sand internet browser', 'databas', 'spatial databas', 'activ client'], ['locat', 'sensor', 'symbol locat', 'user', 'group', 'context', 'group interact', 'messeng', "'", 'symbol', 'proxim', 'giss', 'fusion', 'chapter', "' attribut"], ['execut', 'process', 'process coordin', 'activ', 'grid', 'control', 'prime', 'process execut', 'group', 'process element', 'prime number', 'group manag', 'support infrastructur', 'infrastructur', 'activ manag'], ['cglf', 'congest', 'agent', 'strategi profil', 'congest game', 'resourc', 'game', 'profil', 'strategi', 'potenti function', 'pure strategi', 'nondecreas', 'ds-stabl', 'pure strategi nash', 'strategi nash'], ['aggreg', 'node', 'probe', 'attribut', 'sdim', 'dht', 'aggreg valu', 'aggreg function', 'tree', 'instal', 'virtual node', 'reconfigur', 'level-i', 'adht', 'mib'], ['game', 'client', 'server', 'player', 'proxi', 'commun proxi', 'region', 'author', 'architectur', 'movement', 'game space', 'mmog', 'state inform', 'authorit', 'rpgquest'], ['contract', 'monitor', 'path', 'node', 'isp', 'innov', 'qualiti', 'punish', 'cheat', 'data path', 'rout', 'rest of path', 'hop', 'cheater', 'path qualiti'], ['sensor', 'shockwav', 'muzzl', 'shot', 'muzzl blast', 'weapon', 'trajectori', 'blast', 'calib', 'detect', 'shooter', 'estim', 'microphon', 'mote', 'acoust'], ['workflow', 'servic', 'busi', 'busi process', 'busi valu', 'servic provid', 'web servic', 'schedul', 'servic type', 'ga', 'chromosom', 'provid', 'web', 'qo', 'assign'], ['xgrid', 'grid', 'maya', 'render', 'agent', 'job', 'plug-in', 'comput', 'command', 'grid comput', 'appl', 'directori', 'client', 'instal', 'mac os'], ['zebroid', 'data item', 'latenc', 'item', 'avail latenc', 'replica', 'car', 'client', 'one-instantan', 'data', 'one-instantan zebroid', 'storag', 'request', 'replac', 'z-relay'], ['pthinc', 'pda', 'web', 'display', 'video', 'client', 'screen', 'browser', 'resiz', 'thin-client', 'server', 'devic', 'ica', 'web browser', 'session'], ['sensor', 'sensor node', 'node', 'wsn', 'sens area', 'icqa', 'coverag qualiti', 'sens', 'coverag', 'group', 'mind', 'sensor-group', 'sens model', 'area', 'sens neighbor'], ['spectrum', 'gossip', 'spectrum sens', 'increment', 'radio', 'devic', 'walk', 'coordin spectrum', 'coordin spectrum sens', 'band', 'signal', 'protocol', 'cognit radio', 'signal level', 'random walk'], ['jm', 'messag', 'mobil', 'hoc', 'ad hoc', 'host', 'epidem', 'mobil ad', 'hoc network', 'mobil ad hoc', 'ad hoc network', 'network', 'buffer', 'notif', 'middlewar'], ['idss', 'didsog', 'resourc', 'intrus', 'grid', 'intrus detect', 'integr', 'nativ id', 'detect', 'servic', 'id', 'alert', 'scope', 'nativ', 'heterogen idss'], ['servic', 'event correl', 'event', 'correl', 'servic event', 'custom', 'subservic', 'resourc', 'manag', 'service-ori', 'fault', 'activ probe', 'service-ori event', 'service-ori event correl', 'probe'], ['relev event', 'event', 'k', 'ipi', 'm', 'ipt', 'send lrb m', 'immedi predecessor', 'v ci', 'pred', 'predecessor', 'e', 'pi', 'causal', 'ci'], ['event', 'sentient', 'sentient object', 'event channel', 'channel', 'actuat', 'dissemin', 'object', 'smart', 'sensor', 'network', 'cosmic', 'layer', 'event layer', 'tempor'], ['node', 'packet', 'resourc', 'replica', 'mesh', 'pheromon', 'manet', 'queri packet', 'valid mesh', 'rout', 'resourc discoveri', 'invalid', 'discoveri', 'updat', 'queri'], ['ganesh', 'client', 'test client', 'nativ', 'respons time', 'databas', 'throughput', 'mb s', 'rabin', 'cach', 'mb', 'driver', 'benchmark', 'averag respons time', "ganesh '"], ['energi', 'duti cycl', 'batteri', 'duti', 'slot', 'harvest', 'cycl', 'energi neutral', 'power', 'nw', 'power manag', 'neutral', 'use', 'energi avail', 'solar'], ['charact', 'document', 'editor', 'databas', 'server compon', 'edit', 'meta data', 'real-tim server', 'real-tim server compon', 'meta', 'real-tim', 'folder', 'edit and manag', 'server', 'document edit'], ['replica', 'server', 'poa', 'nash', 'anarchi', 'price of anarchi', 'game', 'cach', 'basic game', 'replic the object', 'social optimum', 'replic', 'payment', 'object', 'cost'], ['adarank', 'rank', 'ndcg', 'train', 'weak ranker', 'perform measur', 'document', 'ranker', 'queri', 'rankboost', 'rank svm', 'rank model', 'adarank map', 'loss function', 'measur'], ['svm', 'spam', 'onlin svm', 'blog', 'onlin', 'spam detect', 'rosvm', 'email spam', 'splog', 'email', 'data set', 'content-bas spam', 'detect', 'data', 'blog comment'], ['heat', 'diffusionrank', 'pagerank', 'graph', 'page', 'web', 'diffus', 'node', 'random graph', 'manipul', 'manifold', 'vi', 'heat diffus', 'vj', 'rank'], ['queri', 'clq', 'translat', 'monolingu', 'clir', 'cross-lingu', 'queri suggest', 'target languag', 'cross-lingu queri', 'queri similar', 'qe', 'queri translat', 'languag', 'monolingu queri', 'suggest'], ['pagerank', 'hit', 'queri', 'link-bas', 'featur', 'url', 'link-bas featur', 'score', 'page', 'link', 'in-degre', 'web graph', 'web', 'rank', 'graph'], ['topic', 'hub', 'aap', 'apa', 'apm', 'ap', 'topic eas', 'trec', 'system', 'sect', 'author', 'eas', 'tab', 'ap valu', 'valu'], ['link', 'web page', 'page', 'web', 'latent', 'matrix', 'factor', 'link structur', 'matrix factor', 'content', 'classif', 'z', 'svm', 'content inform', 'document'], ['index', 'coalesc', 'ti', 'sublist', 'time-travel', 'post', 'tempor coalesc', 'invert', 'queri', 'tempor', 'document', 'index list', 'index size', 'materi', 'invert file'], ['queri', 'wig', 'predict', 'np', 'np queri', 'rank', 'document', 'retriev', 'queri type', 'robust score', 'content-bas', 'content-bas queri', 'score', 'dt', 'qi'], ['expert', 'topic', 'expertis', 'expert find', 'candid', 'model', 'uvt', 'q', 'profil', 'document', 'expertis retriev', 'queri', 'ca', "q '", 'organiz'], ['page', 'ad', 'advertis', 'taxonomi', 'phrase', 'score', 'match', 'class', "'", 'classifi', 'keyword', 'contextu advertis', 'bid phrase', 'contextu', 'page and ad'], ['queri', 'term mismatch', 'document', 'qe', 'queri term', 'mismatch', 'term', 'ir', 'relev document', 'ir system', 'collect', 'remov', 'query-docu term', 'query-docu term mismatch', 'term remov'], ['retriev', 'predictor', 'autocorrel', 'document', 'score', 'spatial', 'correl', 'y', 'rank', 'predict', 'clariti', 'spatial autocorrel', 'perform', 'measur', 'y y'], ['rank', 'input rank', 'document', 'di', 'outrank', 'rank aggreg', 'consensu rank', 'input', 'method', 'concord', 'perform', 'consensu', 'rank aggreg method', 'aggreg', 'aggreg method'], ['transcript', 'phonet', 'queri', 'oov', 'speech', 'wcn', 'word', 'phone', 'retriev', 'spoken', 'index', 'occurr', 'lattic', 'term', 'phonet transcript'], ['queri', 'stem', 'word', "'", 'plural', 'dcg', 'document', "' '", 'long queri', 'context sensit', "comparison '", "comparison ' '", 'languag model', 'plural handl', 'hotel'], ['gene', "'", 'passag', 'concept', 'queri', 'biomed', 'domain-specif knowledg', 'lexic', 'lexic variant', 'gene symbol', 'document', 'retriev', 'abbrevi', 'relat concept', 'domain-specif'], ['document', 'nois', 'probabl', 'poisson', 'nois probabl', 'idf', 'frequency-bas', 'term', 'n lrb t', 'occurr', 't', 'noisi', 'n', 'inform', 'poisson-bas'], ['advertis', 'page', 'trigger page', 'ad', 'aak', 'keyword', 'imped', 'page p', 'trigger', 'strategi', 'match', 'web page', 'web', 'p', "'"], ['user model', 'user', 'ucair', 'queri', 'document', 'search', "'", 'implicit', 'implicit user model', 'retriev', 'implicit user', 'model', 'inform', 'result', "user ' s"], ['index', 'broadcast', 'scheme', 'data', 'tune time', 'ldd', 'bucket', 'minor index', 'ic', 'major index', 'locat', 'index scheme', 'tune', 'lbi', 'lh'], ['document', 'cluster', 'bl', "editor ' s", "editor '", 's desk', "' s desk", "editor ' s desk", 'desk', 'editor', 'topic', 'relat browser', 'subject head', 'classifi', 'browser'], ['photo', 'forum', 'score', 'web', 'web forum', 'duplic photo', 'fusion', 'rank', 'duplic', 'fusion method', 'photo forum', 'high-qual photo', 'web object', 'search', 'method'], ['databas', 'document', 'uum', 'document score', 'central document', 'central document score', 'retriev', 'resourc select', 'probabl of relev', 'score', 'testb', 'select', 'document retriev', 'central', 'select databas'], ['titl', 'document', 'extract', 'titl extract', 'font size', 'powerpoint', 'font', 'extract titl', 'gener document', 'perceptron', 'metadata', 'file properti', 'unit', 'model', 'featur'], ['page', 'pagerank', 'frank', 'featur', 'static rank', 'rank', 'web', 'static', 'featur set', 'popular', 'url', 'popular data', 'pairwis accuraci', 'web page', 'popular featur'], ['distanc measur', 'descriptor', 'distanc', 'measur', 'mpeg-', 'predicate-bas', 'quantis', 'quantis model', 'predicate-bas measur', 'media', 'quantit', 'textur', 'cluster', 'visual', 'pattern differ'], ['queri', 'qi', 'site', 'web', 'hidden web', 'page', 'crawler', 'download', 'web site', 'hidden-web', 'polici', 'hidden', 'qi-', 'p lrb qi', 'document'], ['pagerank', 'page', 'local domain', 'global pagerank', 'local', 'global', 'crawl', 'domain', 'page j', 'fj', 'local search engin', 'outlink', 'f', 'node', 'search engin'], ['event', 'stori', 'topic', 'cluster', 'model', 'depend', 'news', 'co', 'thread', 'event model', 'model m', 'tdt', 'event thread', 'news topic', 'similar'], ['click', 'queri', 'clickthrough', 'user', 'cd', 'user behavior', 'sa', 'search', 'result', 'web search', 'userbehavior', 'strategi', 'behavior', 'featur', 'predict'], ['rocchio', 'lr', 'ctrk', 'topic', 'af', 'tdt', 'adapt filter', 'document', 'relev feedback', 'cross-corpu', 'train', 'tune', 'feedback', 'paramet', 'evalu'], ['element', 'inex', 'xml', 'score', 'inex-', 're-rank', 're-rank algorithm', 'quantiz', 'quantiz function', 'rank', 'node', 'overlap', 'retriev', 'document', 'paragraph'], ['queri', 'clickthrough', 'queri histori', 'histori', 'document', 'current queri', 'retriev', 'click', 'clickthrough histori', 'click summari', 'batchup', 'queri model', 'implicit feedback', 'model', 'use'], ['implicit feedback', 'implicit', 'rank', 'feedback', 'web search', 'featur', 'queri', 'web', 'search', 'user', 'result', 'rn', 'feedback featur', 'implicit feedback featur', 'ndcg'], ['geograph', 'queri', 'ontolog', 'place refer', 'gir', 'string', 'disambigu', "'", 'refer', 'geograph refer', 'queri string', 'search engin', 'engin queri', 'search engin queri', 'ontolog concept'], ['irf', 'subject', 'searcher', 'rf', 'search', 'task', 'search task', 'erf', 'task complex', "'", 'term', 'use', 'document', 'complex', 'search experi'], ['action-item', 'sentenc', 'e-mail', 'sentence-level', 'document-level', 'n-gram', 'classifi', 'detect', 'document', "'", 'classif', 'segment', 'annot', 'sentenc detect', 'action-item detect'], ['asymmetr', 'laplac', 'estim', 'probabl estim', 'classifi', 'na bay', 'method', 'score', 'fit', 'class', 'bay', 'gaussian', 'log-odd', 'distribut', 'na'], ['learn', 'learn process', 'agent', 'learn step', 'learner', 'learn agent', 'h', 'cluster', "'", 'use', 'vessel', 'hj', 'ship', 'process', 'data'], ['bid', 'auction', 'bidset', 'item', 'agent', 'cachedpausebid', 'pausebid', 'paus', 'paus auction', 'revenu', 'algorithm', 's', 'bidder', 'winner determin', 'valuat'], ['pseudotre', 'node', 'dpop', 'cross-edg', 'dcpop', 'cross-edg pseudotre', 'hypercub', 'branch', 'dcop', 'messag', 'tradit pseudotre', 'x', 'dcop instanc', 'node x', 'util'], ['dbc', 'emt', 'quarri', 'pomdp', 'target dynam', 'emt-bas', 'emt-bas control', 'dynam', 'environ', 'control', 'target', 'hunter', 'tag', 'agent', 'action'], ['agent', 'commit', 'action', 'interact', 'commit machin', 'belief', 'monoton', 'agent interact', 'messag', 'plan', 'precondit', "'", 'saapl', 'context condit', 'netbil'], ['q', 'st', 'agent', 'sti', 'modular', 'state', 'model', 'interpret system', 'atl', 'qi', 'model check', 'epistem', "'", 'agt', 'modular interpret'], ['interact', 'agent', 'event', 'action', 'protocol', 'student', 'member', 'cours', 'social interact', 'field', 'sub-interact', 'oblig', 'rule', 'social', 'dispatch'], ['norm', 'agent', 'kripk', 'kripk structur', 'k', 'norm system', 's', 'ctl', 'goal', 'ui lrb k', 'compli', "'", 'nash implement', 'goal hierarchi', 'hierarchi'], ['negoti', 'agent', 'hospit', 'player', 'mediat', 'myriad', 'protocol', 'propos', 'criteria', 'crisi', 'decis criteria', 'burn', 'parti', 'negoti protocol', 'surgeri'], ['plausibl', 'q', 'ctlkp', 'ctlk', 'belief', 'path', 'm', 'formula', "'", 'plausibl path', 'm q', 'agent', 'logic', 'model', 'r'], ['counterargu', 'agent', 'argument', 'predict', 'learn', 'counterexampl', 'justifi predict', 'justif', 'amal', 'ai', 'cbr', 'argument process', 'solut class', 'case base', 'lid'], ['argument', 'dialogu', 'agent', 'negoti', 'offer', 'negoti dialogu', 'o', 'theori', 'argumentation-bas', 'accept', 'arg lrb l', 'argumentation-bas negoti', 'oi', 'reject', 'iff'], ['shapley valu', 'shapley', 'player', 'coalit', 'vote game', 'game', 'margin contribut', 'error', 'vote', 'valu', 'e lrb x', 'q', 'approxim shapley', 'approxim shapley valu', 'weight'], ['negoti', 'agent', 'knapsack', 'equilibrium', 'offer', 'issu', 'knapsack problem', 'time period', 't', 't n', 'approxim equilibrium', 'time', 'approxim', 'issu c', 'equilibrium offer'], ['negoti', 'agent', 'attribut', 'xh', 'mediat', 'simp', 'tent agreement', 'oc', 'outcom', 'simpl attribut', 'tent', 'x', 'parti', 'agreement', 'opposit'], ['agent', 'intent', 'negoti', 'dcsp', 'ds', 'variabl assign', 'advic', 'uma', 'abt', 'negoti messag', 'awc', 'mediat', 'neighbor', 'bdi', 'bi'], ['contract', 'contract outcom', 'agent', 'reput', 'direct experi', 'outcom', 'dirichlet', 'dirichlet distribut', 'estim', 'beta', 'success fulfil', 'dimens', 'beta distribut', 'cov', 'cov lrb p'], ['robber', 'secur agent', 'leader', 'secur', 'strategi', 'game', 'agent', 'pure strategi', 'reward', 'patrol', 'robber type', 'asap', 'hous', 'multiset', 'bayesian game'], ['recommend', 'agent', 'filter', 'profil', 'provid profil', 'entiti', 'relay', 'user profil', 'user', 'inform filter', 'privacy-preserv', 'filter techniqu', 'provid', 'inform', 'tfe'], ['agent', 'self-interest agent', 'self-interest', 'gossip', 'road', 'gossip agent', 'journey', 'journey length', 'tg', 'network', 'tavg', 'car', 'ts', 'regular gossip', 'spread'], ['agent', 'aircraft', 'reward', 'air traffic', 'congest', 'air', 'traffic', 'facet', 'z', 'f lrb z', 'flight', 'sector', 'differ reward', 'flow', 'e lrb f'], ['resourc', 'state', 'ta', 're', 's', 'bound', 'task', 'q-valu', 'agent', 'action', 'resourc type', 'q-decomposit', 'alloc', 'state s', 'sta'], ['mdp', 'agent', 'resourc', 'milp', 'time', 'problem', 'resourc assign', 'agent m', 'optim resourc', 'constraint', 'augment', "'", 'global', 'schedul', 'resource-schedul'], ['agent', 'organiz', 'osd', 'contract net', 'role', 'spawn', 'task', 'contract', 'net', 'organiz structur', 'organiz chang', 'net organ', 'contract net organ', 'local role', 'net agent'], ['agent', 'i-did', 'i-id', 'node', 'chanc node', 'j', 'model node', 'model', 'card', 'chanc', "j '", "j ' s", 'tiger', "'", 'action'], ['polici', 'spider', 'joint polici', "r '", 'agent', 'expect valu', 'solut qualiti', 'spider-ab', 'vax', 'pax', 'joint', 'prune', 'tree', 'horizon', 'heurist'], ['opportun cost', 'method', 'oc-dec-mdp', 'vfp', 'valu function', 'mj', 'function propag', 'opportun', 'function', 'propag', 't', 'probabl function', 'opportun cost function', 'time', 'propag phase'], ['ontolog', 'ontologist', "'", 'agent', 'dynamo', 'taxonomi', 'algorithm', 'cluster', 'head', 'parent', 'brother', 'multi-ag', 'concept agent', 'head coverag', "' '"], ['typ', 'tok', 'e', 'agent', 'channel', 'c', 'ontolog', 'th', 'th lrb log', 'log lrb e', 'token', 'align', 'log', 'ai', 'logic'], ['wine', 'servic', 'consum', 'negoti', 'red', 'learn', 'produc', 'ontolog', 'similar', 'request', 'offer', 'cea', "consum ' s", "consum '", 'similar metric'], ['reput', 'reput model', 'agent', 'fore', 'ontolog', 'art', 'model', 'agent reput', 'art testb', 'agent reput model', 'concept', 'secondaryreput', 'trust', 'testb', 'agent b'], ['dialogu', 'argument', 'argument graph', 'undercut', '-relev', 'graph', 'node', 'ag', "'", 'v', "v '", 'agent', 'protocol', 'root', 'move'], ['agent', "t '", 'protocol', 'mcon', 'commun', "'", 'aj', "t ' '", 'ai', 'request', 't', 'con', 'mcon lrb ai', 'exchang', 'ai aj'], ['abduct', "g '", 'propos', 'negoti', 'answer set', 'agent', 'neighborhood propos', 'g', 'neighborhood', 'critiqu', 'counter-propos', 'program', "'", 'answer', 'l'], ['intimaci', 'negoti', 'agent', 'utter', "'", 'dialogu', 'pt', 'equiti', 'ontolog', 'relationship', 'balanc', 'commit', 'logic', 'option', 'pt lrb acc'], ['trader', 'trade', 'quantiti', 'offer set', 'seller', 'mmp', 'aggreg', 'offer', 'configur', 'attribut', 'flow', 'buyer', 'divis', 'arc', 'network flow'], ['book', 'order', 'price', "e '", 'limit order', 'execut', 'e', 'buy', 'sequenc', 'buy order', 'order book', 'sell', 'rel model', 'absolut', 'volum'], ['money', 'agent', 'request', 'threshold strategi', 'strategi', 'sybil', 'equilibrium', 'sk', 'm', 'n', 'k', 'altruist', 'scrip', 'fulfil a request', 'volunt'], ['socrat', 'socrat game', 'game', 'player', 'qi', 'queri', 'world', 'nash', 'strateg', 'equilibria', 'ai', 'q', 'strateg zero', 'strateg zero sum', 'strateg zero-sum'], ['game', 'signal', 'game tree', 'order game', 'tree', 'gameshrink', 'isomorph', 'player', 'game isomorph', 'signal tree', 'order game isomorph', 'equilibrium', 'abstract', 'poker', 'node'], ['game', 'attribut', 'coalit', 'shapley valu', 'coalit game', 'shapley', 'core', 'aggreg', 'macg', 'aggreg valu function', 'agent', 'valu', 'aggreg valu', 'n v', 'valu function'], ['auction', 'proxi', 'bidder', 'option', 'bid', 'ebay', 'price', 'buyer', 's proxi', "' s proxi", 'exercis', 'item', 'price match', 'exercis price', 'win'], ['mutant', 'ess', 'incumb', 'fit', 'f', 'f lrb s', 's', 'n', 'f lrb s s', 't', 'classic ess', 'f lrb t', 'ef', 'e rrb f', 'gn'], ['slot', 'rbb', 'rbr', 'revenu', 'auction', 'bid', 'slot auction', 'bidder', 'equilibrium', 'alloc', 'agent', 'rbr auction', "'", 'rbb and rbr', 'second-pric'], ['recommend', 'purchas', 'product', 'book', 'dvd', 'recommend network', 'number', 'number of recommend', 'cascad', 'friend', 'viral market', 'viral', 'person', 'peopl', 'node'], ['scout', 'rate', 'recommend', 'user', 'movi', 'promot', 'role', 'connector', 'neighborhood', 'tom', 'predict', 'mask', 'bad', 'good scout', 'rv'], ['day-', 'day- procur', 'procur', 'game', 'storag cost', 'nash', 'tac', 'equilibria', 'set', 'scm', 'design', 'tac scm', 'ui', 'profil', 'nash equilibria'], ['auction', 'ascend auction', 'ascend', 'item-pric', 'bundl', 'queri', 'bidder', 'valuat', 'demand queri', 'elicit', 'price', 'demand', 'bundle-pric', 'polynomi', 'optim alloc'], ['opinion', 'inform market', 'opinion pool', 'pool', 'market', 'predict', 'expert', 'predict accuraci', 'game', 'logop', 'contest', 'accuraci', 'score', 'linop', 'inform'], ['vote', 'voter', 'vote vector', 'candid', 'fool set', 'fool', 'vector', 'm', 'n', "'", 'commun complex', 'lj', 'vote rule', "m '", 'commun'], ['strategi', 'domin', 'player', 'elimin', 'v', 'pure strategi', 'ur', 'weak domin', 'tc', 'row player', 'mix strategi', 'iter domin', 'pure', 'column player', 'game'], ['princip', 'node', 'payment', 'contract', 'packet', 'transit cost', 'forward', 'cost', 'lcp', 'transit', 'fpss', 'expect payment', 'expect', 'intermedi node', 'per-hop'], ['user', 'machin', 'parallel model', 'equilibrium', 'bid', 'alloc', 'parallel', 'util uniform', 'infinit parallel', 'util', 'game', 'best respons', 'social optimum', 'converg', 'finit parallel'], ['auction', 'bidder', 'competit ratio', 'competit', 'optim auction', 'bid', 'aggreg auction', 'ratio', 'f', 'lower bound', 'optim', 'profit', 'b', 'bound', 'k'], ['bid', 'repair', 'revenu', 'solut', 'repair solut', 'withdraw', 'robust solut', 'bid-tak', 'bidder', 'auction', 'robust', 'win', 'item', 'bid withdraw', 'win bid'], ['game', 'coalit', 'coalit game', 'agent', 'represent', 'mc-net', 's', 'rule', 'represent scheme', 'core', 'shapley valu', 'tree decomposit', 'shapley', 'v', 'liter'], ['cut valu', 'c', 'alloc rule', 'o', 'cut', 'alloc rule o', 'rule o', 'mp', 'demand game', 'satisfi mp', 'payment', 'ci', 'alloc', 'output', 'binari demand'], ['job', 'cost share', 'shapley valu', 'shapley', 'cost', 'axiom', 'q', 'effici order', 'valu rule', 'shapley valu rule', 'share', 'e n', 'e lrb q', 'e q', 'n'], ['bob', 'alic', 'agent', 'mechan', 'player', "'", 'domin strategi', 'stage', 'load', 'file', 'inform', 'domin', 'certif', 'strategi', 'payoff'], ['trade', 'bidder', 'price', 'node', 'rp', 'exchang', 'valuat', 'payment', 'activ rule', 'threshold payment', 'vi', 'bid', 'provision', 'round', 'valu'], ['v', 'v w', 'choic function', 'social choic function', 'social choic', 'u', 'w', 'b', 'b c', 'function', 'truth', 'path', 'dc', 'convex', 'c'], ['buyer', 'seller', 'alloc', 'j', 'gain from trade', 'match', 'valuat', 'color', 'trade', 'mechan', 'zopa', 'negoti', 'negotiation-rang', 'vj', 't'], ['privaci', 'protect', 'individu', 'risk', 'person inform', 'cost', 'discount', 'intrus', 'inform', 'privaci intrus', 'self-control', 'time-inconsist', 'person', 'transact', 'gratif'], ['chariti', 'donat', 'bidder', 'bid', 'payment', 'quasilinear', 'parti', 'uij', 'clear problem', 'concav', 'match offer', 'donation-clear', 'wj', 'bj', 'mechan'], ['job', 'declar', 'f', 'agent', 'competit ratio', 'tp', 'releas', 'time', 'li', 'mechan', 'deadlin', 'complet', 'activ job', 'competit', 'job f'], ['stranger', 'player', 'defect', 'cooper', 'share histori', 'histori', 'turnov', 'reciproc', 'maxflow', 'popul', 'reput', 'peer', 'privat histori', "'", 'traitor'], ['agent', 'payment', 'mechan', 'design', 'amd', 'outcom', 'v', 'autom mechan', 'autom mechan design', 'mechan design', 'type', 'auction', 'minsat', 'o', "agent '"], ['wager', 'trader', 'market', 'price', 'share', 'dpm', 'price function', 'pari-mutuel', 'market maker', 'aftermarket', 'cda', 'money', 'maker', 'purchas', 'payoff'], ['valuat', 'elicit', 'queri', 'equival queri', 'lindahl', 'demand queri', 'lindahl price', 'polynomi', 'bundl', 'learn', 'elicit algorithm', 'bid', 'prefer elicit', 'agent', 'learn algorithm'], ['vwap', 'limit order', 'volum', 'sell', 'price', 'market', 'share', 'order', 'competit ratio', 'order book', 'trade', 'algorithm', 'book', 'owt', 'n share'], ['auction', 'cheat', 'bid', 'agent', 'seller', 'second-pric auction', 'bidder', 'second-pric', 'bid strategi', 'equilibrium', 'first-pric auction', 'first-pric', 'honest', 'shave', 'strategi']]

new_cands=[]
for n_doc in range(len(cands)):
    doc=[]
    for n_key in range(len(cands[n_doc])):
        if len(cands[n_doc][n_key])<=2:
            del n_key
        else:
            doc.append(cands[n_doc][n_key])
    new_cands.append(doc)
print(new_cands)

[['hyarm', 'video', 'resourc', 'dre', 'resourc util', 'uav', 'multimedia', 'qo requir', 'base station', 'applic', 'util', 'station', 'end receiv', 'dre multimedia'], ['tigr', 'gridway', 'grid', 'enkf', 'licens', 'job', 'ensembl', 'grid-en', 'mpi', 'reservoir', 'geolog', 'geolog model', 'eclips', 'metaschedul', 'sub-job'], ['msp', 'node', 'scan', 'node sequenc', 'sequenc', 'anchor', 'target node', 'event', 'basic msp', 'flip', 'sequence-bas', 'sequence-bas msp', 'protect band', 'band', 'figur'], ['node', 'sensor node', 'color', 'sensor', 'local', 'label', 'imag', 'imag process', 'relax', 'light', 'stardust', 'radio', 'label relax', 'connect', 'node id'], ['sensor', 'skip graph', 'proxi', 'tsar', 'skip', 'data', 'index', 'storag', 'interv', 'queri', 'interv skip', 'interv skip graph', 'flash', 'lookup', 'summari'], ['zone', 'dim', 'queri', 'node', 'event', 'code', 'gpsr', 'zone code', 'rang queri', 'insert', 'owner', 'network', 'sensor', 'perimet', 'messag'], ['node', 'messag', 'contact'

In [19]:
##check legitimate candidates
cands=[['aa','a','ball','dorr'],
      ['bbbb','b','cccc','ddd']]
new_cands=[]
for n_doc in range(len(cands)):
    doc=[]
    for n_key in range(len(cands[n_doc])):
        if len(cands[n_doc][n_key])<=2:
            del n_key
        else:
            doc.append(cands[n_doc][n_key])
    new_cands.append(doc)
print(new_cands)

[['ball', 'dorr'], ['bbbb', 'cccc', 'ddd']]
