# Take existing item and add new text, return suggestions for new items

**Requires:**
- 'data/handlabeled_vectors_1k.csv.csv' as generated by compile_features
- Pretrained word2vec from Google News

In [251]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
import gensim

## Load the data

In [2]:
# Pretrained Google News word2vec embeddings, stored in binary format
fn = '../../Academic_Work/PROJECTS/corpora/animal/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(fname=fn, binary=True)

# Hand-labelled items together with their text and image vectors
df = pd.read_csv(filepath_or_buffer='data/handlabeled_vectors_1k.csv.csv')

Column layout of `df`, by position (i.e. `df.columns[i]`):<br>
    0 and 1 = old index (two identical copies)<br>
    2 = item id<br>
    3 = macro category<br>
    4 = handlabeled category<br>
    5:305 = text vectors<br>
    305:605 = image vectors<br>

## Define class that will merge new input into the item vector, then find vectors near its new location in high-d space using cosine similarity

In [273]:
class RNNSearch:
    """Suggest items similar to an existing item after nudging it toward a word.

    The text part of the item's feature vector is averaged with a weighted
    word vector, the image part is kept as is, and every row of the feature
    table is then ranked by cosine similarity to the modified vector.

    Expected column layout of the feature table (by position): the first
    ``N_META_COLS`` columns are metadata (one of them is named ``id``), the
    next ``TEXT_DIM`` columns are the text vector and all remaining columns
    are the image vector.
    """

    N_META_COLS = 5   # leading non-feature columns of the table
    TEXT_DIM = 300    # length of the text part of the feature vector

    def __init__(self, data=None, vectors=None, weight=400):
        """Create a searcher.

        Parameters
        ----------
        data : pandas.DataFrame, optional
            Feature table. When omitted, the notebook-level ``df`` is used.
        vectors : mapping of word -> vector, optional
            Word embeddings. When omitted, the notebook-level ``model`` is used.
        weight : float, optional
            Default multiplier applied to the word vector before it is
            averaged with the item's text vector.
        """
        self._data = data
        self._vectors = vectors
        self.weight = weight

    def _table(self):
        """Return the feature table (falls back to the global ``df`` at call time)."""
        return df if self._data is None else self._data

    def _embeddings(self):
        """Return the word embeddings (falls back to the global ``model`` at call time)."""
        return model if self._vectors is None else self._vectors

    def update_vector(self, item, word, weight=None):
        """Build the modified feature vector for one item.

        Parameters
        ----------
        item : int
            Row position of the item in the feature table.
        word : str
            Word whose vector is merged into the item's text vector.
        weight : float, optional
            Multiplier for the word vector; defaults to ``self.weight``.

        Returns
        -------
        numpy.ndarray
            Float array of shape ``(1, n_features)``.

        Raises
        ------
        KeyError
            Propagated from the embedding lookup when ``word`` is unknown.
        """
        if weight is None:
            weight = self.weight
        # A row sliced out of the mixed-type table has object dtype; cast it
        # to float explicitly so the arithmetic and cdist see numeric data.
        features = self._table().iloc[item, self.N_META_COLS:].to_numpy(dtype=float)
        text_part = features[:self.TEXT_DIM]
        image_part = features[self.TEXT_DIM:]
        wordvec = np.asarray(self._embeddings()[word], dtype=float)
        # Mean of the old text vector and the weighted word vector.
        merged_text = (text_part + weight * wordvec) / 2.0
        return np.hstack((merged_text, image_part)).reshape(1, -1)

    def find_similarities(self, targetvec):
        """Return the cosine similarity of every table row to ``targetvec``.

        Parameters
        ----------
        targetvec : numpy.ndarray
            Array of shape ``(1, n_features)``.

        Returns
        -------
        pandas.Series
            One similarity per row, indexed by row position (0..n-1).
        """
        features = self._table().iloc[:, self.N_META_COLS:].to_numpy(dtype=float)
        distances = cdist(features, targetvec, metric='cosine').reshape(-1)
        return pd.Series(1 - distances)

    def search(self, item_id, modification, k, weight=None):
        """Return the ``id`` values of the ``k`` rows most similar to the modified item.

        Parameters
        ----------
        item_id : int
            Row position of the query item.
        modification : str
            Word used to modify the query item's text vector.
        k : int
            Number of results to return.
        weight : float, optional
            Multiplier for the word vector; defaults to ``self.weight``.

        Returns
        -------
        pandas.Series
            The ``id`` column restricted to the top ``k`` rows, best first.
            The query row itself is never included.
        """
        table = self._table()
        # Normalise to a non-negative row position; out-of-range raises IndexError.
        position = range(len(table))[item_id]
        newvec = self.update_vector(position, modification, weight)
        simvec = self.find_similarities(newvec)
        # Remove the query row explicitly instead of assuming it ranks first:
        # with a large weight the modified vector can be closer to other rows
        # than to the row it started from.
        ranked = simvec.drop(position).sort_values(ascending=False, kind='mergesort')
        top_positions = ranked.index[:k].to_numpy()
        return table['id'].iloc[top_positions]
    

In [276]:
# Query: the 10 nearest items to item 7063 after pushing it toward "casual"
word, item, k = 'casual', 7063, 10
search_model = RNNSearch()
search_model.search(item_id=item, modification=word, k=k)


7017     dMN4ha1IFd
7186     ZrYppAIj9g
7803     5BEhwBBAAD
10748    Jmv4LwOuMM
10537    1fCGJ7PzPO
10459    RZWCyaifil
10039    Hoo0xqCe2V
8323     HZMEmwCcF1
8656     HZMEmwCcF1
8997     fy6xCLt0FB
Name: id, dtype: object

### Check results... does this make sense?

#### For comparison, top results for dress 7063 with two different modifier words:

"dressy"

    10597    dvocmJ3IwL
    10748    Jmv4LwOuMM
    7186     ZrYppAIj9g
    7017     dMN4ha1IFd
    7625     VlthhXAzIT
    8323     HZMEmwCcF1
    8656     HZMEmwCcF1
    7536     OYoTZsGjNo
    8663     BXFWGyp8gY
    8997     fy6xCLt0FB


"casual"

    7017     dMN4ha1IFd
    7186     ZrYppAIj9g
    7803     5BEhwBBAAD
    10748    Jmv4LwOuMM
    10537    1fCGJ7PzPO
    10459    RZWCyaifil
    10039    Hoo0xqCe2V
    8323     HZMEmwCcF1
    8656     HZMEmwCcF1
    8997     fy6xCLt0FB

OK so algorithmically this works, but comparisons between different words suggest that we get only slight modifications with the input. For a more powerful app, we need to move further away from the start location!

**Idea:**
- Try decomposing into a lower-dimensional space. Then, compare Euclidean distance to cosine.
- Make web app with interactive I/O
    - Check that word is in vocabulary
    - Make sure we return unique results (and not the original item)
