In [3]:
import spacy
# Load the "en_core_web_md" spaCy pipeline once; later cells use it both to
# extract noun chunks and to compute `similarity` between short texts.
nlp = spacy.load("en_core_web_md")

In [11]:
import datetime
import difflib
import urllib
import urllib.parse
import urllib.request
from urllib.error import HTTPError

import wikipedia

In [6]:
# Parse the sample input into a spaCy Doc. The literal holds two lower-cased
# paper titles, each followed by its abstract, concatenated into one string;
# the noun chunks of this Doc are the candidate phrases checked further down.
doc = nlp(u'dual recurrent attention units for visual question answering\nwe propose an architecture for vqa which utilizes recurrent layers to generate visual and textual attention. the memory characteristic of the proposed recurrent attention units offers a rich joint embedding of visual and textual features and enables the model to reason relations between several parts of the image and question. our single model outperforms the first place winner on the vqa 1.0 dataset performs within margin to the current state of the art ensemble model. we also experiment with replacing attention mechanisms in other state of the art models with our implementation and show increased accuracy. in both cases our recurrent attention mechanism improves performance in tasks requiring sequential or relational reasoning on the vqa dataset. sequential short text classification with recurrent and convolutional neural networks\n recent approaches based on artificial neural networks anns have shown promising results for short text classification. however many short texts occur in sequences e.g. sentences in a document or utterances in a dialog and most existing ann based systems do not leverage the preceding short texts when classifying a subsequent one. in this work we present a model based on recurrent neural networks and convolutional neural networks that incorporates the preceding short texts. our model achieves state of the art results on three different datasets for dialog act prediction.')

In [8]:
# Collect the distinct surface strings of every noun chunk spaCy found in `doc`.
# A set comprehension is used so no loop variable leaks into the notebook
# namespace; in particular nothing is bound to `np`, which would shadow the
# conventional numpy alias if numpy is ever imported in this notebook.
np_phrases = {chunk.text for chunk in doc.noun_chunks}

In [12]:
def _wiki_page_exists(url):
    """Return True when ``url`` answers with HTTP status 200.

    ``HTTPError`` (e.g. a 404) is left to propagate so the caller can decide
    whether to retry with a different title.
    """
    # The context manager closes the response (and its socket) right away
    # instead of leaving it to the garbage collector.
    with urllib.request.urlopen(url) as response:
        return response.getcode() == 200


def _check_wikipedia_title(word):
    """Probe the English Wikipedia article URL derived from ``word``.

    Returns ``(url, found)`` where ``url`` is the last URL that was tried.
    If the first request fails with ``HTTPError`` and ``word`` is a
    multi-word phrase ending in 's', the URL is retried once with that
    trailing 's' removed (a crude singular form).
    """
    url_suffix = word.replace(" ", "_").replace("-", "_").capitalize()
    # Percent-encode the title so non-ASCII or reserved characters form a
    # valid request path instead of making the request itself fail.
    url = "https://en.wikipedia.org/wiki/" + urllib.parse.quote(url_suffix)
    try:
        return url, _wiki_page_exists(url)
    except HTTPError:
        # endswith() is safe on an empty string, unlike word[-1].
        if not (word.endswith('s') and len(word.split(" ")) > 1):
            return url, False

    singular_url = url[:-1]
    try:
        return singular_url, _wiki_page_exists(singular_url)
    except HTTPError:
        return singular_url, False


def get_sets(words_set, mode, similarity_threshold=80.0):
    """Classify phrases by whether Wikipedia appears to know them.

    Pass 1 requests ``https://en.wikipedia.org/wiki/<Phrase>`` for each
    phrase; a 200 response puts the phrase in the "good" set. Pass 2 runs
    ``wikipedia.search`` for every phrase not in "good" and compares each
    returned title with the phrase using the notebook-level spaCy pipeline
    ``nlp``; a phrase whose best similarity (scaled to 0-100) is strictly
    above ``similarity_threshold`` goes in the "probably good" set.

    Progress is printed and also written to a timestamped log file in the
    current working directory.

    Parameters
    ----------
    words_set : set of str
        Candidate phrases. Must be a ``set`` (``.difference`` is used).
    mode : int
        ``0`` writes a "PR Dual Rank Logs:" header to a ``prdr_*`` log file,
        ``1`` writes an "Auto Phrase Logs:" header to an ``ap_*`` log file.
        Any other value uses the ``prdr_`` prefix and writes no header.
    similarity_threshold : float, optional
        Cut-off on the 0-100 similarity scale for pass 2 (default 80.0).

    Returns
    -------
    (set, set)
        ``(good, probably_good)``.

    Notes
    -----
    A failure while probing a single phrase in pass 1 is logged and that
    phrase is skipped (it is still rechecked in pass 2). Errors raised by
    ``nlp`` or ``wikipedia.search`` in pass 2 propagate to the caller; the
    log file is closed in every case.
    """
    good = set()

    # Every mode other than 1 keeps the historical default prefix.
    prefix = "ap_" if mode == 1 else "prdr_"
    log_name = prefix + datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S") + ".txt"

    # `with` guarantees the log is flushed and closed even if a later network
    # or NLP call raises.
    with open(log_name, 'w+') as f:
        if mode == 0:
            f.write("PR Dual Rank Logs:\n")
        elif mode == 1:
            f.write("Auto Phrase Logs:\n")

        # Pass 1: direct article-URL lookup.
        for i, word in enumerate(words_set):
            try:
                url, found = _check_wikipedia_title(word)
            except Exception as exc:
                # Best effort: one unreachable or malformed title must not
                # abort the run. Catching Exception (not a bare except) keeps
                # Ctrl-C / kernel interrupt working, and the failure is
                # recorded instead of being dropped silently.
                line = ("Failed " + str(i + 1) + "... Keyword: " + str(word)
                        + " Error: " + repr(exc))
                print(line)
                f.write(line + "\n")
                continue

            if found:
                good.add(word)
            line = ("Ran " + str(i + 1) + "... Keyword: " + word + " URL: " + url
                    + " Status: " + str(int(found)))
            print(line)
            f.write(line + "\n")

        # Pass 2: fuzzy recheck of everything pass 1 did not confirm.
        probably_good = set()
        for word in words_set.difference(good):
            query = word.replace("-", " ").lower()
            query_tok = nlp(query)
            values = [
                nlp(result.lower().replace("-", " ")).similarity(query_tok) * 100
                for result in wikipedia.search(query)
            ]
            if values:
                best = max(values)
                line = "Rechecking... " + "Keyword: " + word + " Max Similarity: " + str(best)
                print(line)
                f.write(line + "\n")
                if best > similarity_threshold:
                    probably_good.add(word)

        f.write("Good Set:\n" + str(good) + "\n")
        f.write("Probably Good Set:\n" + str(probably_good) + "\n")

    return good, probably_good

In [13]:
# Run the direct-URL check and the search-similarity recheck over all noun chunks.
# NOTE(review): mode=2 matches neither the 0 nor the 1 branch inside get_sets, so
# the log file gets the default "prdr_" prefix and no header line — confirm that
# this is intended rather than a leftover value.
good, prob_good = get_sets(np_phrases, 2)

Ran 1... Keyword: attention mechanisms URL: https://en.wikipedia.org/wiki/Attention_mechanism Status: 0
Ran 2... Keyword: we URL: https://en.wikipedia.org/wiki/We Status: 1
Ran 3... Keyword: artificial neural networks URL: https://en.wikipedia.org/wiki/Artificial_neural_networks Status: 1
Ran 4... Keyword: the current state URL: https://en.wikipedia.org/wiki/The_current_state Status: 0
Ran 5... Keyword: sequences URL: https://en.wikipedia.org/wiki/Sequences Status: 1
Ran 6... Keyword: the preceding short texts URL: https://en.wikipedia.org/wiki/The_preceding_short_text Status: 0
Ran 7... Keyword: a subsequent one URL: https://en.wikipedia.org/wiki/A_subsequent_one Status: 0
Ran 8... Keyword: the proposed recurrent attention units URL: https://en.wikipedia.org/wiki/The_proposed_recurrent_attention_unit Status: 0
Ran 9... Keyword: the art results URL: https://en.wikipedia.org/wiki/The_art_result Status: 0
Ran 10... Keyword: show URL: https://en.wikipedia.org/wiki/Show Status: 1
Ran 11...

  "__main__", mod_spec)


Rechecking... Keyword: the preceding short texts Max Similarity: 76.15267374284905
Rechecking... Keyword: the proposed recurrent attention units Max Similarity: 67.49328037110158
Rechecking... Keyword: the art results Max Similarity: 84.7494198649971
Rechecking... Keyword: our recurrent attention mechanism Max Similarity: 71.07254040864713
Rechecking... Keyword: e.g. sentences Max Similarity: 86.4238133020316
Rechecking... Keyword: a rich joint embedding Max Similarity: 47.16808753618129


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


Rechecking... Keyword: vqa Max Similarity: 100.0
Rechecking... Keyword: sequential short text classification Max Similarity: 70.33879563420919


  "__main__", mod_spec)


Rechecking... Keyword: both cases Max Similarity: 70.77190214261844


  "__main__", mod_spec)
  "__main__", mod_spec)


Rechecking... Keyword: our single model outperforms Max Similarity: 65.6082051108381


  "__main__", mod_spec)


Rechecking... Keyword: increased accuracy Max Similarity: 80.71003428749522
Rechecking... Keyword: many short texts Max Similarity: 84.34424781565463
Rechecking... Keyword: dual recurrent attention units Max Similarity: 57.83173867614494
Rechecking... Keyword: visual and textual attention Max Similarity: 76.28319777189215
Rechecking... Keyword: the model Max Similarity: 88.8227505614238
Rechecking... Keyword: the art models Max Similarity: 88.66569808894764
Rechecking... Keyword: a document Max Similarity: 85.6001907158494
Rechecking... Keyword: a model Max Similarity: 99.99999990995853
Rechecking... Keyword: a dialog Max Similarity: 83.31067237602088


  "__main__", mod_spec)


Rechecking... Keyword: visual and textual features Max Similarity: 68.79217819744602
Rechecking... Keyword: other state Max Similarity: 76.90893358393089
Rechecking... Keyword: dialog act prediction Max Similarity: 53.649728504849826
Rechecking... Keyword: the first place winner Max Similarity: 91.65172066641108
Rechecking... Keyword: the art ensemble model Max Similarity: 69.96634540922038
Rechecking... Keyword: most existing ann based systems Max Similarity: 76.84651770326725
Rechecking... Keyword: short text classification Max Similarity: 67.63988329850508
Rechecking... Keyword: our model Max Similarity: 72.11339483724379
Rechecking... Keyword: several parts Max Similarity: 72.70842216901737
Rechecking... Keyword: the image Max Similarity: 100.0
Rechecking... Keyword: recent approaches Max Similarity: 57.87216147240627
Rechecking... Keyword: visual question Max Similarity: 81.1343819583379
Rechecking... Keyword: the memory characteristic Max Similarity: 78.14820155489384
Rechecking.

  "__main__", mod_spec)


Rechecking... Keyword: anns Max Similarity: 99.9999946345186
Rechecking... Keyword: our implementation Max Similarity: 79.88313192181164
Rechecking... Keyword: sequential or relational reasoning Max Similarity: 66.99154796573562
Rechecking... Keyword: three different datasets Max Similarity: 79.95617826005859
Rechecking... Keyword: this work Max Similarity: 92.05076584483292


  "__main__", mod_spec)


Rechecking... Keyword: recurrent layers Max Similarity: 75.25120583258973
Rechecking... Keyword: an architecture Max Similarity: 90.92635369238926


In [14]:
# Number of distinct noun-chunk strings that were checked.
len(np_phrases)

59

In [15]:
# Number of phrases whose derived Wikipedia URL returned HTTP 200.
len(good)

14

In [16]:
# Number of remaining phrases whose best search-title similarity exceeded the threshold.
len(prob_good)

19

In [23]:
# Show the "probably good" phrases for manual inspection.
prob_good

{'a dialog',
 'a document',
 'a model',
 'a subsequent one',
 'an architecture',
 'anns',
 'e.g. sentences',
 'increased accuracy',
 'many short texts',
 'recurrent and convolutional neural networks',
 'the art models',
 'the art results',
 'the current state',
 'the first place winner',
 'the image',
 'the model',
 'this work',
 'visual question',
 'vqa'}