# Exploration
Loading Data to Redis Cloud

In [1]:
#!/usr/bin/env python3
import typing as t
import asyncio
import numpy as np
import pickle
import redis

from redis.commands.search.field import TagField

In [2]:
import os
import sys
# Put the notebook's parent directory on the import path (only once) so that
# modules living one level up can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [3]:
from models import Paper
from search_index import SearchIndex
import config

In [6]:
def read_paper_df(filename: str = "arxiv_embeddings_without_cutoff.pkl") -> t.Any:
    """Load the pickled arXiv paper embeddings from the project data directory.

    The file is opened at ``'../' + config.DATA_LOCATION + '/' + filename``,
    i.e. relative to the notebook's working directory.

    Args:
        filename: Name of the pickle file inside ``config.DATA_LOCATION``.
            Defaults to the embeddings file this notebook was written against.

    Returns:
        Whatever object was pickled. The rest of this notebook uses it like a
        pandas DataFrame (``.head()``, ``.sort_values()``,
        ``.to_dict('records')``), so it is not annotated as a list.

    Raises:
        OSError: If the file cannot be opened.
    """
    path = '../' + config.DATA_LOCATION + "/" + filename
    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only point this at files that come from a trusted source.
    with open(path, "rb") as f:
        return pickle.load(f)


In [7]:
# Load the pickled paper data once; the cells below use it like a pandas
# DataFrame (.head(), .sort_values()).
df = read_paper_df()

In [8]:
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,id,title,year,authors,categories,abstract,vector
0,704.0304,The World as Evolving Information,2012,Carlos Gershenson,"cs.IT,cs.AI,math.IT,q-bio.PE",This paper discusses the benefits of describ...,"[-0.011167259886860847, -0.026415932923555374,..."
1,704.0865,An architecture-based dependability modeling f...,2006,"Ana-Elena Rugina (LAAS), Karama Kanoun (LAAS),...","cs.PF,cs.SE","For efficiency reasons, the software system ...","[0.02402251772582531, -0.003231793874874711, -..."
2,704.1267,Text Line Segmentation of Historical Documents...,2007,"Laurence Likforman-Sulem, Abderrazak Zahour, B...",cs.CV,There is a huge amount of historical documen...,"[-0.0011464570416137576, 0.04184567928314209, ..."
3,704.2092,A Note on the Inapproximability of Correlation...,2008,Jinsong Tan,"cs.LG,cs.DS",We consider inapproximability of the correla...,"[0.005469118244946003, -0.013095404952764511, ..."
4,704.3395,General-Purpose Computing on a Semantic Networ...,2010,Marko A. Rodriguez,"cs.AI,cs.PL",This article presents a model of general-pur...,"[0.037214089184999466, -0.029926029965281487, ..."


In [9]:
# Show the papers ordered by year, newest first. The sorted frame is only
# displayed; `df` is not reassigned here.
df.sort_values(by=["year"], ascending=False)

Unnamed: 0,id,title,year,authors,categories,abstract,vector
25441,2108.10876,Quantum adaptive agents with efficient long-te...,2022,"Thomas J. Elliott, Mile Gu, Andrew J. P. Garne...","quant-ph,cond-mat.stat-mech,cs.AI,cs.IT,math.IT",Central to the success of adaptive systems i...,"[-0.03755851835012436, 0.008588328026235104, -..."
23338,2103.10005,Neural Network Attribution Methods for Problem...,2022,"Antonios Mamalakis, Imme Ebert-Uphoff and Eliz...","physics.geo-ph,cs.LG",Despite the increasingly successful applicat...,"[-0.0183702539652586, 0.039819177240133286, -0..."
23333,2103.09728,Learning migration models for supporting incre...,2022,"Bruno G\'ois Mateus, Matias Martinez, Christop...",cs.SE,Context: A Legacy system can be defined as a...,"[0.04580807313323021, 0.052638158202171326, -0..."
27096,2201.05461,RecoMed: A Knowledge-Aware Recommender System ...,2022,"Maryam Sajde, Hamed Malek, Mehran Mohsenzadeh","cs.IR,cs.LG",Background and Objective High medicine diver...,"[0.012568038888275623, -0.07941142469644547, -..."
27095,2201.05460,Impact of Stop Sets on Stopping Active Learnin...,2022,Luke Kurlandski and Michael Bloodgood,"cs.IR,cs.CL,cs.LG",Active learning is an increasingly important...,"[0.005745161324739456, 0.021268900483846664, -..."
...,...,...,...,...,...,...,...
29612,cs/0105006,Reverse Engineering from Assembler to Formal S...,2000,M. P. Ward,"cs.SE,cs.PL","The FermaT transformation system, based on r...","[-0.01316212210804224, 0.03356737270951271, -0..."
29524,cs/0004005,Exact Phase Transitions in Random Constraint S...,2000,"Ke Xu, Wei Li","cs.AI,cs.CC,cs.DM",In this paper we propose a new type of rando...,"[0.007238228805363178, -0.00010195512004429474..."
29617,cs/0105032,Learning to Cooperate via Policy Search,2000,"Leonid Peshkin, Kee-Eung Kim, Nicolas Meuleau ...","cs.LG,cs.MA",Cooperative games are those in which both ag...,"[0.02313612774014473, 0.02206106297671795, 0.0..."
29521,cs/0003079,Differential Invariants under Gamma Correction,2000,Andreas Siebert,cs.CV,This paper presents invariants under gamma c...,"[-0.01893344707787037, 0.013828199356794357, 0..."


In [7]:
# Length of the embedding stored at index label 0, i.e. the vector
# dimensionality (768 in the recorded output).
len(df.vector[0])

768

In [8]:
from tqdm import tqdm

In [9]:
def gather_with_concurrency(n, redis_conn, *papers):
    """Store every paper record and its embedding vector in Redis.

    Despite the name, papers are processed one after another; ``n`` is accepted
    so existing callers keep working but is not used.

    Args:
        n: Unused.
        redis_conn: Redis client; ``hset`` is called on it once per paper.
        *papers: One dict per paper. Each must contain ``'vector'``, ``'id'``
            and ``'categories'`` (a comma-separated string); all remaining
            keys are passed to ``Paper(**...)`` unchanged.

    Raises:
        KeyError: If a paper dict lacks ``'vector'``, ``'id'`` or
            ``'categories'``.
    """
    def load_paper(paper):
        # Work on a shallow copy: popping keys from the caller's dict meant a
        # record could not be loaded twice (the second attempt failed with
        # KeyError: 'vector'), which breaks re-running the cell.
        fields = dict(paper)
        vector = fields.pop('vector')
        fields['paper_id'] = fields.pop('id')
        # TODO - we need to be able to use other separators
        fields['categories'] = fields['categories'].replace(",", "|")
        p = Paper(**fields)
        # save model TODO -- combine these two objects eventually
        p.save()
        # save vector data
        key = "paper_vector:" + str(p.paper_id)
        redis_conn.hset(
            key,
            mapping={
                "paper_pk": p.pk,
                "paper_id": p.paper_id,
                "categories": p.categories,
                "year": p.year,
                # Stored as raw float32 bytes.
                "vector": np.array(vector, dtype=np.float32).tobytes(),
            },
        )

    for p in tqdm(papers):
        load_paper(p)

def load_all_data():
    """Ingest all papers into Redis, then build the vector search index.

    Reads the paper data, writes each record through
    ``gather_with_concurrency``, and creates either an HNSW or a flat index
    (selected by ``config.INDEX_TYPE``) over the ``paper_vector:`` keys, with
    ``categories`` and ``year`` as tag fields.
    """
    # TODO use redis-om connection
    redis_conn = redis.from_url(config.REDIS_URL)
    search_index = SearchIndex()

    print("Loading papers into Simpa App")
    records = read_paper_df().to_dict('records')
    gather_with_concurrency(100, redis_conn, *records)

    print("Creating vector search index")
    tag_fields = (
        TagField("categories", separator="|"),
        TagField("year", separator="|"),
    )
    # Both builders are called with the same arguments; only the builder differs.
    if config.INDEX_TYPE == "HNSW":
        build_index = search_index.create_hnsw
    else:
        build_index = search_index.create_flat
    build_index(
        *tag_fields,
        redis_conn=redis_conn,
        number_of_vectors=len(records),
        prefix="paper_vector:",
        distance_metric="IP",
    )
    print("Search index created")

In [10]:
# Run the full ingestion: write the papers to Redis, then build the search index.
load_all_data()

Loading papers into Simpa App


  if __name__ == "__main__":
100%|███████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  3.29it/s]


Creating vector search index
GET INDEX_NAME:  papers
Search index created


In [11]:
# Open a Redis client for the interactive checks below (URL comes from config).
redis_conn = redis.from_url(config.REDIS_URL)

In [12]:
# Connectivity check against the configured server.
redis_conn.ping()

True

In [13]:
# Small working sample: the first 100 rows. Note that this re-reads the pickle
# from disk rather than reusing `df`.
papers = read_paper_df().head(100)

In [14]:
# Preview the sample.
papers.head()

Unnamed: 0,id,title,year,authors,categories,abstract,vector
0,704.0304,The World as Evolving Information,2012,Carlos Gershenson,"cs.IT,cs.AI,math.IT,q-bio.PE",This paper discusses the benefits of describ...,"[-0.011167259886860847, -0.026415932923555374,..."
1,704.0865,An architecture-based dependability modeling f...,2006,"Ana-Elena Rugina (LAAS), Karama Kanoun (LAAS),...","cs.PF,cs.SE","For efficiency reasons, the software system ...","[0.02402251772582531, -0.003231793874874711, -..."
2,704.1267,Text Line Segmentation of Historical Documents...,2007,"Laurence Likforman-Sulem, Abderrazak Zahour, B...",cs.CV,There is a huge amount of historical documen...,"[-0.0011464570416137576, 0.04184567928314209, ..."
3,704.2092,A Note on the Inapproximability of Correlation...,2008,Jinsong Tan,"cs.LG,cs.DS",We consider inapproximability of the correla...,"[0.005469118244946003, -0.013095404952764511, ..."
4,704.3395,General-Purpose Computing on a Semantic Networ...,2010,Marko A. Rodriguez,"cs.AI,cs.PL",This article presents a model of general-pur...,"[0.037214089184999466, -0.029926029965281487, ..."


In [15]:
# NOTE(review): this creates another client for the same config.REDIS_URL as
# `redis_conn` above; the query cells below use this one.
redis_client = redis.from_url(config.REDIS_URL)

In [16]:
def process_paper(p, i: int) -> t.Dict[str, t.Any]:
    """Turn one search hit into a paper dict annotated with a similarity score.

    Args:
        p: Search result document; ``paper_pk`` and ``vector_score`` are read.
        i: Position of the hit in the result list (not used here).

    Returns:
        ``Paper.get(p.paper_pk).dict()`` with an added ``'similarity_score'``
        key equal to ``1 - float(p.vector_score)``.
    """
    record = Paper.get(p.paper_pk).dict()
    record['similarity_score'] = 1 - float(p.vector_score)
    return record

def papers_from_results(total, results):
    """Package vector-search results as ``{'total': ..., 'papers': [...]}``.

    Args:
        total: Value stored under ``'total'`` as-is.
        results: Object whose ``docs`` attribute is iterated.

    Returns:
        Dict with ``'total'`` and ``'papers'``; the latter holds one
        ``process_paper`` result per document, in the original order.
    """
    # extract papers from VSS results
    papers = []
    for position, doc in enumerate(results.docs):
        papers.append(process_paper(doc, position))
    return {'total': total, 'papers': papers}

In [17]:
# Project helper used below to build the vector query.
search_index = SearchIndex()

In [18]:
# Fetch the stored embedding for one known paper; hget returns the raw bytes
# saved under the "vector" field.
paper_vector_key = "paper_vector:0704.0304"
vector = redis_client.hget(paper_vector_key, "vector")

In [19]:
# Inspect the raw value: a bytes object, not a decoded array.
vector

b'\xe2\xf66\xbc:f\xd8\xbc\xad\xb3[\xbd\xe3\xads\xbd\xce\xce\xe5\xbc\xd5r\x00<\x88-\xcd\xbc!\xe3F=\x86\xc4\xc7\xbc\xfe\xe4\x10\xbc7\xf9\x1c\xbc\xa6\x1c\xc89G!L<\x96\xda\xb3\xbb\x8a5\x9b=\x03\xe5\x85\xbd\xce\xfe\x91<h\xf5\x01\xbd\x8f\xec\xa5\xbc\xb9\xee\xdc:\xb9\x90\x85\xbd\xcf\x0f\xa7\xbd\xf9T\xff<1\x93P\xbdU\xff9=kN\xc7<%\xda\xe4\xbc/\x8a\x9a<4\x82\x0b=\x91w5\xbd\x85a\x90\xbd\x00\x9d\x08\xbdC\xb2>=k\xfeF="v\xd55\x8a\xff}\xbd]z\xca<\x92V\xea<\xd1\x00y\xbd\xa6\xd7B\xbd,\x8fE<%\x1b\xa2<W\x02o\xbcW2\x1a\xbc\x08\x187<\x90MS<4\xbf\xf2<\xb0\xa7;<v\xa3\x83\xbdj\x13\xcb<>?\xa9\xbb\t.\xd8\xbc\x1a#\xa9=0\xc7\xce\xbcs\xa4\x8b=B\x10\xc6=d\xeb\xc6<2\xec#=\x02\xd5B=\xfa\x93s<8j\xe6\xbc\xe1\x03\xbc<5\x04\xc1<\x9f\xb4r<+U\x05<u\x1d@<`S\xd4=\x8a\x9f\x97\xbb"O\xe1<\xe5\x9c+=\x1f\xbf\xf0;\x81\x9a\x07<\xec\xc3\xa3\xbcJM\xda<\x07\x95a<\xd88N\xbd\x0f\x8c4\xbc\xf0\xc2\xcd<+_\x05=\xad\xd1\x00<\x8d\x8c\xba;\xad\x8d\x83<dD\xc0<\xbd \x98;\\]\xf5\xbcJH\x92=0\x8f\'<t\xe6\xc5\xbc\x83\x01\xbf\xbc$YK\xbdS\xa2\x15=/\xa

In [20]:
# Categories are stored as a single "|"-separated string.
redis_client.hget(paper_vector_key, "categories")

b'cs.IT|cs.AI|math.IT|q-bio.PE'

In [27]:
def find_papers_by_text(paper_id: str = "0704.0304", number_of_results: int = 5):
    """Run a KNN vector search for papers similar to an already-stored paper.

    The query vector is the embedding stored in Redis under
    ``paper_vector:<paper_id>``; despite the name, no text is embedded here.

    Args:
        paper_id: Id of the stored paper whose vector is used as the query.
            Defaults to the paper this notebook originally hard-coded.
        number_of_results: Number of results requested from the index.

    Returns:
        The raw result of ``redis_client.ft(config.INDEX_NAME).search(...)``;
        the documents are not converted to ``Paper`` records here.

    Raises:
        LookupError: If no vector is stored for ``paper_id``.
    """
    # Create query. The two empty lists are the category and year arguments
    # (presumably meaning "no filtering" -- confirm against
    # SearchIndex.vector_query); a request-driven caller would pass
    # categories, years, search_type and number_of_results in that order.
    query = search_index.vector_query(
        [],
        [],
        "KNN",
        number_of_results,
    )

    # find the stored vector of the requested paper
    paper_vector_key = "paper_vector:" + paper_id
    vector = redis_client.hget(paper_vector_key, "vector")
    if vector is None:
        # hget yields None for a missing key/field; fail with a clear message
        # instead of handing an empty query parameter to the search call.
        raise LookupError("no vector stored under " + repr(paper_vector_key))

    # obtain results of the query
    return redis_client.ft(config.INDEX_NAME).search(
        query, query_params={"vec_param": vector}
    )

In [28]:
# Run the KNN search using the stored vector of paper 0704.0304 as the query.
find_papers_by_text()

Result{5 total, docs: [Document {'id': 'paper_vector:0704.0304', 'payload': None, 'vector_score': '0', 'paper_id': '0704.0304', 'paper_pk': '01GGWEBFNDBJTYPSDV77W2GRSC'}, Document {'id': 'paper_vector:1103.1791', 'payload': None, 'vector_score': '0.372496724129', 'paper_id': '1103.1791', 'paper_pk': '01GGWEBQ5VC52NWX6ATQBSHJK9'}, Document {'id': 'paper_vector:0912.4649', 'payload': None, 'vector_score': '0.41434186697', 'paper_id': '0912.4649', 'paper_pk': '01GGWEBM26BT418SW7C8T6S5DN'}, Document {'id': 'paper_vector:1008.3800', 'payload': None, 'vector_score': '0.416849195957', 'paper_id': '1008.3800', 'paper_pk': '01GGWEBNQ3H6HCD25K0H2VNHQY'}, Document {'id': 'paper_vector:1202.6153', 'payload': None, 'vector_score': '0.451893389225', 'paper_id': '1202.6153', 'paper_pk': '01GGWEBVHPFNEAA9DCW11M2R1H'}]}