In [1]:
# https://spacy.io/usage/vectors-similarity

# install spacy
# install torchtext

import spacy
import torchtext
import json

# The model packages have to be downloaded beforehand with
# `python -m spacy download <package name>`.
nlp = spacy.load('en_core_web_lg')
# Second load reuses nlp's vocab object (vocab=nlp.vocab). Presumably this attaches
# the en_vectors_web_lg vectors to that shared vocab, per the spaCy page linked at
# the top of this cell -- TODO confirm. Its return value is not kept; as the last
# expression of the cell it is only displayed.
spacy.load('en_vectors_web_lg', vocab=nlp.vocab)

# Design note: the project is meant to be a few separate scripts so that everything
# does not have to live in one environment.
# (Loading everything at once would also take a lot of memory,
# but the main reason is not wanting a single combined environment right now.)

  "__main__", mod_spec)


<spacy.lang.en.English at 0x1d8157476d8>

In [22]:
# Load the three JSON inputs, in order, and bind them to their notebook-level names.
payloads = []
for path in ('results.json', 'failed.json', 'everything.json'):
    with open(path, 'r') as handle:
        payloads.append(json.load(handle))

results, failed, everything = payloads
len(everything)

195

# Some Testing Stuff

In [3]:
tokens = nlp('bicycle human top on')

# One line per token: text, has_vector, vector_norm, is_oov.
for tok in tokens:
    fields = (tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)
    print(*fields)

bicycle True 6.655211 True
human True 6.869779 True
top True 5.5138364 True
on True 5.218119 True


In [4]:
# Print the similarity of every ordered pair of distinct tokens.
for left in tokens:
    for right in tokens:
        if left != right:
            print(left.text, right.text, left.similarity(right))

bicycle human 0.18300328
bicycle top 0.16640997
bicycle on 0.233741
human bicycle 0.18300328
human top 0.18022928
human on 0.25648427
top bicycle 0.16640997
top human 0.18022928
top on 0.385453
on bicycle 0.233741
on human 0.25648427
on top 0.385453


In [5]:
# Hand-picked subset of the pairwise similarities printed above.
l = [
    'bicycle top 0.16640997',
    'bicycle on 0.233741',
    'human top 0.18022928',
    'human on 0.25648427',
]

print("\n".join(l))

bicycle top 0.16640997
bicycle on 0.233741
human top 0.18022928
human on 0.25648427


In [6]:
text = "Human is on bike. Human is on top of bike."
hi = nlp(text)
sentences = [span for span in hi.sents]  # list of sentences

# Show each sentence on its own line, then the sentiment of the last one.
print(*sentences, sep="\n")
sentences[-1].sentiment

Human is on bike.
Human is on top of bike.


0.0

In [7]:
# Unique labels appearing in the first two columns of each results row.
words = {row[pos] for row in results for pos in (0, 1)}
words

{'bowl', 'orange'}

In [8]:
# Similarity between the two labels of the first results row.
first_label = results[0][0]
second_label = results[0][1]
nlp(first_label).similarity(nlp(second_label))

1.0

# Trying to find the 'best' sentence

In [9]:
# Construct the "best" sentence
for row in results:
    word1 = nlp(row[0])
    word2 = nlp(row[1])

    # Score every candidate phrase by its mean similarity to the two words.
    prep_similarity = {}
    for candidate in row[4]:
        candidate_doc = nlp(candidate)
        prep_similarity[candidate] = (
            candidate_doc.similarity(word1) + candidate_doc.similarity(word2)
        ) / 2

    prep = max(prep_similarity, key=prep_similarity.get)
    prep_doc = nlp(prep)

    # might just want the first one
    # The word that is more similar to the chosen phrase goes first.
    if word1.similarity(prep_doc) > word2.similarity(prep_doc):
        ordered = (word1, word2)
    else:
        ordered = (word2, word1)
    sentence = " ".join([ordered[0].text, prep, ordered[1].text])
    print(sentence)

orange is beside orange
orange is beside orange
orange is on top of orange
orange is on top of bowl
orange is beside orange
orange is beside orange
bowl is holding orange
orange is beside orange
orange is beside orange
bowl is holding orange
orange is beside orange
orange is beside orange


In [10]:
# Or pick the "best" sentence ... not sure how yet
# just getting all sentences right now
i = 0

# Build the two words from the same row the phrases are taken from, instead of
# relying on whatever word1/word2 an earlier cell happened to leave behind.
word1 = nlp(results[i][0])
word2 = nlp(results[i][1])

# Every candidate phrase of row i, in both word orders.
sentences = []
for prep in results[i][4]:
    sentences.append(word1.text + " " + prep + " " + word2.text)
    sentences.append(word2.text + " " + prep + " " + word1.text)

# Join into one text and let spaCy split it back into sentence spans.
nl = nlp(". ".join(sentences))
sents = list(nl.sents)
sents

[orange is beside orange.,
 orange is beside orange.,
 orange is to the left of orange.,
 orange is to the left of orange.,
 orange is adjacent to orange.,
 orange is adjacent to orange.,
 orange is holding orange.,
 orange is holding orange.,
 orange overlaps orange.,
 orange overlaps orange]

In [79]:
# Construct the "best" sentence
# Put color in sentences
def getBestSentences(results, colors, labels):
    """Build relation sentences, naming an object's color the first time it is seen.

    Args:
        results: sequence of rows, indexed here as
            row[0], row[1] -- the two object labels (strings),
            row[2], row[3] -- indices into ``colors`` for those two objects,
            row[4]         -- candidate relation phrases (strings).
        colors: color names, looked up with row[2] / row[3].
        labels: list of all labels; ``labels.count`` decides articles and
            whether a label is summarized as "multiple"/"many".

    Returns:
        A ``(short, full)`` pair of sentence lists. ``short`` is the first five
        sentences when more than five were built, otherwise the same list as
        ``full``. When no sentence was built at all, both are the result of
        ``getColorSentences(colors, labels)``.
    """
    sentences = []
    indices = set()  # object indices whose color has already been put in a sentence
    many = set()     # labels already covered by a "There are ..." summary

    for row in results:
        word1 = nlp(row[0])
        word2 = nlp(row[1])

        # Indefinite article when the label occurs more than once, definite otherwise.
        # The capitalized form starts the sentence.
        art1 = "A" if labels.count(word1.text) > 1 else "The"
        # Fixed: the second article depends on the second word's count
        # (it previously used word1's count).
        art2 = "a" if labels.count(word2.text) > 1 else "the"

        # First time this object index is seen: attach its color to the article.
        if row[2] not in indices:
            color1 = colors[row[2]]
            if word1.text != "person":
                art1 = art1 + " " + color1
            indices.add(row[2])

            # Frequent label not summarized yet: emit the summary instead of a relation.
            if labels.count(word1.text) > 3 and word1.text not in many:
                adj = "many" if labels.count(word1.text) > 5 else "multiple"
                if word1.text != "person":
                    sentences.append("There are " + adj + " " + color1 + " " + word1.text + ".")
                else:
                    sentences.append("There are " + adj + " " + word1.text + ".")
                many.add(word1.text)
                continue

        if row[3] not in indices:
            color2 = colors[row[3]]
            if word2.text != "person":
                art2 = art2 + " " + color2
            indices.add(row[3])

            if labels.count(word2.text) > 3 and word2.text not in many:
                adj = "many" if labels.count(word2.text) > 5 else "multiple"
                if word2.text != "person":
                    sentences.append("There are " + adj + " " + color2 + " " + word2.text + ".")
                else:
                    sentences.append("There are " + adj + " " + word2.text + ".")
                many.add(word2.text)
                continue

        # Both labels are already summarized: no individual relation sentence.
        if word1.text in many and word2.text in many:
            continue

        # Find best preposition: highest mean similarity to the two words.
        prep_similarity = {}
        for prep in row[4]:
            nlprep = nlp(prep)
            prep_similarity[prep] = (nlprep.similarity(word1) + nlprep.similarity(word2)) / 2

        prep = max(prep_similarity, key=prep_similarity.get)
        sentences.append(art1 + " " + word1.text + " " + prep + " " + art2 + " " + word2.text + ".")

    if not sentences:
        c = getColorSentences(colors, labels)
        return c, c
    if len(sentences) > 5:
        return sentences[:5], sentences

    return sentences, sentences

In [80]:
# Construct the "best" sentence
# Add color sentences after

def getColorSentences(colors, labels):
    """Build one color sentence per non-person object.

    Args:
        colors: color names; position ``i`` is the color of ``labels[i]``.
        labels: object labels, at least as long as ``colors``.

    Returns:
        A list of sentences. Labels equal to ``'person'`` are skipped. The first
        occurrence of a label that appears more than three times yields
        "There are multiple <color> <label>." ("many" when it appears more than
        five times); every other object yields "<A|The> <label> is <color>.".
    """
    sentences = []
    many = set()  # labels that already received a "There are ..." summary

    for i, color in enumerate(colors):
        label = labels[i]
        if label == 'person':
            continue

        occurrences = labels.count(label)

        if occurrences > 3 and label not in many:
            many.add(label)
            adj = "many" if occurrences > 5 else "multiple"
            # Fixed: use this object's own color. The previous lookup went through
            # a global `results` that is not a parameter of this function.
            sentences.append("There are " + adj + " " + color + " " + label + ".")
            continue

        art = "A" if occurrences > 1 else "The"
        sentences.append(art + " " + label + " is " + color + ".")

    return sentences

def getBestSentences2(results, colors, labels):
    """Build color-free relation sentences, then append the color sentences.

    Args:
        results: sequence of rows, indexed here as
            row[0], row[1] -- the two object labels (strings),
            row[4]         -- candidate relation phrases (strings).
        colors: color names, passed through to ``getColorSentences``.
        labels: list of all labels; ``labels.count`` decides articles and
            which labels are treated as frequent.

    Returns:
        A ``(short, full)`` pair of sentence lists. When more than five relation
        sentences were built, ``short`` holds the first five of them followed by
        the color sentences; otherwise ``short`` and ``full`` are the same list.
        ``full`` always ends with the color sentences.
    """
    sentences = []
    many = set()  # frequent labels that have already caused one row to be skipped

    for row in results:
        word1 = nlp(row[0])
        word2 = nlp(row[1])

        # The first row that mentions a frequent label (count > 3) is dropped;
        # getColorSentences takes care of adding the summary sentence.
        if labels.count(word1.text) > 3 and word1.text not in many:
            many.add(word1.text)
            continue

        if labels.count(word2.text) > 3 and word2.text not in many:
            many.add(word2.text)
            continue

        # Indefinite article when the label occurs more than once, definite otherwise.
        art1 = "A" if labels.count(word1.text) > 1 else "The"
        # Fixed: the second article depends on the second word's count
        # (it previously used word1's count).
        art2 = "a" if labels.count(word2.text) > 1 else "the"

        # Find best preposition: highest mean similarity to the two words.
        prep_similarity = {}
        for prep in row[4]:
            nlprep = nlp(prep)
            prep_similarity[prep] = (nlprep.similarity(word1) + nlprep.similarity(word2)) / 2

        prep = max(prep_similarity, key=prep_similarity.get)
        sentences.append(art1 + " " + word1.text + " " + prep + " " + art2 + " " + word2.text + ".")

    c = getColorSentences(colors, labels)
    if len(sentences) > 5:
        shorts = sentences[:5]
        shorts.extend(c)
        sentences.extend(c)
        return shorts, sentences

    sentences.extend(c)
    return sentences, sentences

# Make Sentences

In [81]:
# everything[i] = [row, results, failed, colorsForRetBoxes, retLabels, retScores]

In [82]:
final_short_results = []
final_results = []

# NOTE: the loop targets rebind the notebook-level names `results` and `failed`
# that were loaded from JSON earlier.
for row, results, failed, colors, labels, scores in everything:
    with_color = getBestSentences(results, colors, labels)
    color_after = getBestSentences2(results, colors, labels)

    # Regroup the two (short, full) pairs into (shorts, fulls).
    shorts, fulls = zip(with_color, color_after)

    final_short_results.append([row] + [" ".join(group) for group in shorts])
    final_results.append([row] + [" ".join(group) for group in fulls])
    

In [92]:
# Spot-check one entry. Each entry is
# [row, joined short sentences from getBestSentences, joined short sentences from getBestSentences2].
final_short_results[10]

[11,
 'There are many person. A black cell phone is to the right of a person. A cell phone is to the left of a person. A cell phone is to the right of a person. A cell phone is adjacent to a lightgrey cell phone.',
 'A person is below a person. A person is to the right of a person. A person is to the left of a person. A person is to the left of a person. A person is to the left of a person. A cell phone is black. A cell phone is lightgrey.']

In [93]:
# Write the short and the full sentence lists to their JSON files, in that order.
outputs = (
    ('final_short_results.json', final_short_results),
    ('final_long_results.json', final_results),
)
for path, payload in outputs:
    with open(path, 'w') as handle:
        json.dump(payload, handle)