In [1]:
import spacy
# Shared spaCy pipeline: it extracts noun chunks from the text and scores phrase similarity in get_sets.
nlp = spacy.load("en_core_web_md")

In [5]:
import datetime
import difflib
import urllib
import urllib.parse
import urllib.request
from collections import defaultdict
from urllib.error import HTTPError

import wikipedia

In [4]:
# Sample text: two paper titles, each followed by its abstract, parsed into a single spaCy Doc.
# Presumably an excerpt of the arXiv corpus read further down -- TODO confirm.
doc = nlp(u'dual recurrent attention units for visual question answering\nwe propose an architecture for vqa which utilizes recurrent layers to generate visual and textual attention. the memory characteristic of the proposed recurrent attention units offers a rich joint embedding of visual and textual features and enables the model to reason relations between several parts of the image and question. our single model outperforms the first place winner on the vqa 1.0 dataset performs within margin to the current state of the art ensemble model. we also experiment with replacing attention mechanisms in other state of the art models with our implementation and show increased accuracy. in both cases our recurrent attention mechanism improves performance in tasks requiring sequential or relational reasoning on the vqa dataset. sequential short text classification with recurrent and convolutional neural networks\n recent approaches based on artificial neural networks anns have shown promising results for short text classification. however many short texts occur in sequences e.g. sentences in a document or utterances in a dialog and most existing ann based systems do not leverage the preceding short texts when classifying a subsequent one. in this work we present a model based on recurrent neural networks and convolutional neural networks that incorporates the preceding short texts. our model achieves state of the art results on three different datasets for dialog act prediction.')

In [19]:
def partition(file, size = 1000000):
    '''
    Lazily split an open file into consecutive blocks of at most `size` each.

    The 1000000 default exists because the SpaCy v2.x parser may have issues
    allocating memory for inputs larger than that.

    Blocks are produced by plain `file.read(size)` calls, so a boundary can
    fall in the middle of a word or sentence.

    :param file: an open file-like object exposing `read(size)`
    :param size: upper bound passed to every `read` call
    :return: generator yielding each non-empty block in file order
    '''
    block = file.read(size)
    while block:
        yield block
        block = file.read(size)

In [21]:
# Collect every noun chunk of the corpus together with how often it occurs.
# The corpus is parsed block by block (see `partition`) so each spaCy call stays bounded in size.
np_phrases = set()
np_phrase_frequency = defaultdict(int)
# Explicit encoding: without it the platform default is used, which is not UTF-8 everywhere.
with open('../final_stuff/data/arxiv_titles_and_abstracts.txt', 'r', encoding='utf-8') as corpus_file:
    for chunk in partition(corpus_file):
        # `doc` is rebound on purpose: a later cell reads `doc.noun_chunks`.
        doc = nlp(chunk)
        # Loop variable is not called `np`, to avoid shadowing the usual numpy alias.
        for noun_chunk in doc.noun_chunks:
            np_phrases.add(noun_chunk.text)
            np_phrase_frequency[noun_chunk.text] += 1

KeyboardInterrupt: 

In [8]:
# Rebuild the phrase set and counts from the noun chunks of the current `doc` only.
# NOTE(review): this resets np_phrases / np_phrase_frequency, so when the notebook is run
# top to bottom it discards the corpus-wide counts built by the cell above -- confirm intent.
np_phrases = set()
np_phrase_frequency = defaultdict(int)
# Loop variable is not called `np`, to avoid shadowing the usual numpy alias.
for noun_chunk in doc.noun_chunks:
    np_phrases.add(noun_chunk.text)
    np_phrase_frequency[noun_chunk.text] += 1

In [22]:
# Rank all phrases by descending frequency (ties keep their original relative order).
sorted_phrases = sorted(np_phrase_frequency, key=np_phrase_frequency.get, reverse=True)
# Keep only multi-word phrases, i.e. those containing at least one space, paired with their count.
sorted_phrases_w_freq = [
    (phrase, np_phrase_frequency[phrase])
    for phrase in sorted_phrases
    if " " in phrase
]

In [23]:
# Show only the head of the ranking instead of dumping the whole list into the output.
sorted_phrases_w_freq[:40]

[('this paper', 14866),
 ('the problem', 3598),
 ('this work', 3197),
 ('our method', 3013),
 ('our approach', 2959),
 ('the performance', 2531),
 ('the number', 2515),
 ('the state', 2307),
 ('a set', 1969),
 ('the model', 1940),
 ('the proposed method', 1785),
 ('the results', 1784),
 ('the art', 1742),
 ('the data', 1606),
 ('this problem', 1582),
 ('experimental results', 1565),
 ('our model', 1549),
 ('the effectiveness', 1519),
 ('the use', 1471),
 ('the algorithm', 1374),
 ('the task', 1284),
 ('the method', 1218),
 ('a method', 1203),
 ('the network', 1193),
 ('deep neural networks', 1164),
 ('deep learning', 1105),
 ('our algorithm', 1078),
 ('a number', 1074),
 ('our results', 1056),
 ('this approach', 1012),
 ('neural networks', 1011),
 ('a variety', 999),
 ('the system', 996),
 ('the art methods', 991),
 ('the proposed approach', 929),
 ('the context', 867),
 ('convolutional neural networks', 853),
 ('our experiments', 845),
 ('the case', 807),
 ('the accuracy', 748),
 ('an

In [22]:
# Keep only the phrases made of exactly two or three space-separated words.
good_np_phrases = [
    phrase
    for phrase in np_phrases
    if len(phrase.split(" ")) in [2, 3]
]
        

In [24]:
# Number of phrases with exactly two or three words.
len(good_np_phrases)

13806

In [6]:
def _wiki_page_exists(url):
    """Return True when fetching `url` answers HTTP 200, False when it raises HTTPError (e.g. 404).

    The response is closed before returning so connections are not leaked
    while many phrases are probed. Other failures (for example URLError when
    the network is unreachable) propagate to the caller.
    """
    try:
        with urllib.request.urlopen(url) as response:
            return response.getcode() == 200
    except HTTPError:
        return False


def get_sets(words_set, mode):
    """Classify phrases by whether English Wikipedia appears to know them.

    Pass 1: every phrase is turned into a Wikipedia article URL (spaces and
    hyphens become underscores, the result is capitalized and percent-encoded).
    A phrase is "good" when that URL answers HTTP 200. Multi-word phrases
    ending in "s" get one retry with the final character dropped.

    Pass 2: every phrase that is not "good" is sent to `wikipedia.search`; the
    phrase is "probably good" when the best spaCy similarity between the
    phrase and a search result exceeds 80 (similarity is scaled to 0-100).

    Progress is printed and mirrored to a timestamped log file in the current
    working directory. The file is named "<prefix>YYYY_mm_dd_HH_MM_SS.txt".

    :param words_set: set of phrase strings (must support `.difference`)
    :param mode: 0 selects the "prdr_" prefix with a "PR Dual Rank Logs:"
        header, 1 selects the "ap_" prefix with an "Auto Phrase Logs:" header;
        any other value keeps the "prdr_" prefix and writes no header
    :return: tuple `(good, probably_good)` of sets of phrases
    :raises: errors from `wikipedia.search` / `nlp` in pass 2 propagate; the
        log file is still closed in that case
    """
    if mode == 1:
        prefix, header = "ap_", "Auto Phrase Logs:\n"
    elif mode == 0:
        prefix, header = "prdr_", "PR Dual Rank Logs:\n"
    else:
        prefix, header = "prdr_", None

    log_name = prefix + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + ".txt"
    good = set()
    probably_good = set()

    # `with` guarantees the log is flushed and closed even if a later lookup raises.
    with open(log_name, 'w', encoding='utf-8') as f:
        if header is not None:
            f.write(header)

        for i, word in enumerate(words_set):
            try:
                url_suffix = (word.replace(" ", "_").replace("-", "_")).capitalize()
                # Percent-encode so non-ASCII or reserved characters yield a valid URL.
                url = "https://en.wikipedia.org/wiki/" + urllib.parse.quote(url_suffix)
                status = 0
                if _wiki_page_exists(url):
                    status = 1
                elif word.endswith('s') and len(word.split(" ")) > 1:
                    # Crude singularization: retry without the trailing "s".
                    url = url[:-1]
                    if _wiki_page_exists(url):
                        status = 1
                if status == 1:
                    good.add(word)
                line = "Ran " + str(i + 1) + "... Keyword: " + word + " URL: " + url + " Status: " + str(status)
                print(line)
                f.write(line + "\n")
            except Exception as exc:
                # Best effort per phrase: record the failure and move on.
                # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
                failure = "Failed " + str(i + 1) + "... Keyword: " + str(word) + " Error: " + repr(exc)
                print(failure)
                f.write(failure + "\n")

        for word in words_set.difference(good):
            query = word.replace("-", " ").lower()
            query_tok = nlp(query)
            values = []
            for result in wikipedia.search(query):
                result_tok = nlp(result.lower().replace("-", " "))
                values.append(result_tok.similarity(query_tok) * 100)
            if values:
                best = max(values)
                line = "Rechecking... " + "Keyword: " + word + " Max Similarity: " + str(best)
                print(line)
                f.write(line + "\n")
                if best > 80.0:
                    probably_good.add(word)

        f.write("Good Set:\n" + str(good) + "\n")
        f.write("Probably Good Set:\n" + str(probably_good) + "\n")

    return good, probably_good

In [7]:
# NOTE(review): get_sets only has dedicated branches for mode 0 and mode 1; mode=2 keeps the
# default "prdr_" log prefix and writes no header line -- confirm that is intended.
good, prob_good = get_sets(np_phrases, 2)

Ran 1... Keyword: short text classification URL: https://en.wikipedia.org/wiki/Short_text_classification Status: 0
Ran 2... Keyword: dialog act prediction URL: https://en.wikipedia.org/wiki/Dialog_act_prediction Status: 0
Ran 3... Keyword: utterances URL: https://en.wikipedia.org/wiki/Utterances Status: 1
Ran 4... Keyword: a subsequent one URL: https://en.wikipedia.org/wiki/A_subsequent_one Status: 0
Ran 5... Keyword: the memory characteristic URL: https://en.wikipedia.org/wiki/The_memory_characteristic Status: 0
Ran 6... Keyword: both cases URL: https://en.wikipedia.org/wiki/Both_case Status: 0
Ran 7... Keyword: sequential or relational reasoning URL: https://en.wikipedia.org/wiki/Sequential_or_relational_reasoning Status: 0
Ran 8... Keyword: recurrent layers URL: https://en.wikipedia.org/wiki/Recurrent_layer Status: 0
Ran 9... Keyword: the preceding short texts URL: https://en.wikipedia.org/wiki/The_preceding_short_text Status: 0
Ran 10... Keyword: several parts URL: https://en.wikip

  "__main__", mod_spec)


Rechecking... Keyword: both cases Max Similarity: 70.77190214261844
Rechecking... Keyword: sequential or relational reasoning Max Similarity: 66.99154796573562


  "__main__", mod_spec)


Rechecking... Keyword: recurrent layers Max Similarity: 75.25120583258973
Rechecking... Keyword: the vqa dataset Max Similarity: 58.17074930042823


  "__main__", mod_spec)


Rechecking... Keyword: the preceding short texts Max Similarity: 76.15267374284905
Rechecking... Keyword: several parts Max Similarity: 72.70842216901737
Rechecking... Keyword: visual question Max Similarity: 81.1343819583379


  "__main__", mod_spec)


Rechecking... Keyword: visual and textual features Max Similarity: 68.79217819744602
Rechecking... Keyword: the art ensemble model Max Similarity: 69.96634540922038
Rechecking... Keyword: recurrent and convolutional neural networks Max Similarity: 91.04069916342951
Rechecking... Keyword: three different datasets Max Similarity: 79.95617826005859


  "__main__", mod_spec)


Rechecking... Keyword: anns Max Similarity: 99.9999946345186
Rechecking... Keyword: this work Max Similarity: 92.05076584483292
Rechecking... Keyword: our model Max Similarity: 72.11339483724379


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


Rechecking... Keyword: vqa Max Similarity: 100.0
Rechecking... Keyword: attention mechanisms Max Similarity: 74.7096191847275
Rechecking... Keyword: other state Max Similarity: 76.90893358393089
Rechecking... Keyword: many short texts Max Similarity: 71.80237450895696
Rechecking... Keyword: the model Max Similarity: 88.8227505614238
Rechecking... Keyword: the vqa 1.0 dataset Max Similarity: 61.490323902651866
Rechecking... Keyword: a dialog Max Similarity: 88.04968929188814
Rechecking... Keyword: the current state Max Similarity: 89.0773541667505
Rechecking... Keyword: the image Max Similarity: 100.0
Rechecking... Keyword: our recurrent attention mechanism Max Similarity: 71.07254040864713
Rechecking... Keyword: the art models Max Similarity: 88.66569808894764
Rechecking... Keyword: a document Max Similarity: 85.6001907158494


  "__main__", mod_spec)
  "__main__", mod_spec)


Rechecking... Keyword: our single model outperforms Max Similarity: 65.6082051108381


  "__main__", mod_spec)


Rechecking... Keyword: increased accuracy Max Similarity: 80.71003428749522
Rechecking... Keyword: an architecture Max Similarity: 90.92635369238926
Rechecking... Keyword: recent approaches Max Similarity: 57.87216147240627
Rechecking... Keyword: the first place winner Max Similarity: 91.66368955390558
Rechecking... Keyword: our implementation Max Similarity: 79.88313192181164
Rechecking... Keyword: a rich joint embedding Max Similarity: 47.16808753618129
Rechecking... Keyword: visual and textual attention Max Similarity: 76.28319777189215
Rechecking... Keyword: most existing ann based systems Max Similarity: 76.84651770326725
Rechecking... Keyword: a model Max Similarity: 99.99999990995853
Rechecking... Keyword: the proposed recurrent attention units Max Similarity: 67.49328037110158
Rechecking... Keyword: dual recurrent attention units Max Similarity: 57.83173867614494
Rechecking... Keyword: e.g. sentences Max Similarity: 86.4238133020316
Rechecking... Keyword: sequential short text 

In [14]:
# Number of distinct candidate phrases in np_phrases (the set handed to get_sets).
len(np_phrases)

59

In [15]:
# Phrases whose Wikipedia URL (or its form without the trailing "s") answered HTTP 200.
len(good)

14

In [16]:
# Phrases accepted by the similarity recheck against Wikipedia search results.
len(prob_good)

19

In [9]:
# NOTE(review): the recorded output includes generic phrases such as 'this work' and
# 'the model', so the > 80 similarity cut-off in get_sets admits false positives.
prob_good

{'a dialog',
 'a document',
 'a model',
 'a subsequent one',
 'an architecture',
 'anns',
 'e.g. sentences',
 'increased accuracy',
 'recurrent and convolutional neural networks',
 'the art models',
 'the art results',
 'the current state',
 'the first place winner',
 'the image',
 'the model',
 'this work',
 'visual question',
 'vqa'}