# Similarity of documents

We'll store the documents locally and put the recommendations in the PostgreSQL server so that the app can access them easily.

## Prep the data for computation

In [12]:
import numpy as np
import json
import pickle
import csv

In [13]:
def normalize_rows(matr):
    """
    Normalize the rows of a 2-D array to unit
    Euclidean (L2) length, avoiding divisions by 0.

    Rows whose norm is not strictly positive (all-zero
    rows, or rows whose norm is NaN) are divided by 1
    instead of by their norm.

    Parameters
    ----------
    matr : 2-D array-like
        One vector per row.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape as `matr`;
        the input is not modified.
    """
    matr = np.asarray(matr)
    norms = np.linalg.norm(matr, ord=2, axis=1, keepdims=True)
    # np.where keeps the floating dtype of the norms. An np.vectorize
    # helper would infer its output dtype from the first element, so a
    # leading zero row would turn every later norm into a truncated int.
    safe_norms = np.where(norms > 0, norms, 1.0)
    return matr / safe_norms

In [14]:
def split_vectors_ids(vectors_file='../../vectors/arxiv_vectors.csv',
             id_json='../../vectors/id.json',
             vectors_pkl='../../vectors/vectors.pkl',
            ):
    """
    Split the combined csv of article vectors into
    a list of ids and a matrix of vectors.

    Every csv row is read as an arxiv id followed by
    the components of that article's vector. The ids
    are written to `id_json` as a json list. The
    vectors are stacked into a 2-D array (row i belongs
    to the i-th id), row-normalized with `normalize_rows`
    and pickled to `vectors_pkl`.
    """
    article_ids = []
    rows = []

    with open(vectors_file, 'r', newline='') as source:
        for record in csv.reader(source):
            # First field is the id, the rest are the vector components.
            article_id, *components = record
            article_ids.append(article_id)
            rows.append(np.array(list(map(float, components))))

    with open(id_json, 'w') as ids_out:
        json.dump(article_ids, ids_out)

    matrix = normalize_rows(np.array(rows))

    with open(vectors_pkl, 'bw') as matrix_out:
        pickle.dump(matrix, matrix_out)

In [15]:
# Build the id json and the pickled vectors from the csv, using the default paths.
split_vectors_ids()

In [16]:
# Load the vectors array from the pickle file (the default output path of
# split_vectors_ids). pickle.load can execute arbitrary code, so only load
# pickle files that were produced locally.
with open('../../vectors/vectors.pkl', 'rb') as vec_pkl:
    vectors = pickle.load(vec_pkl)
    
# Load the list of ids from the json file (the default id path of
# split_vectors_ids).
with open('../../vectors/id.json', 'r') as ids_json:
    ids = json.load(ids_json)

## Computing the Scores

In [17]:
def score(vectors, all_ids, id_low, id_high):
    """
    Outputs an array, scores. The columns of scores
    correspond to the slice of article ids

    all_ids[id_low:id_high]

    The rows are indexed by all_ids.

    The value in column C and row R is the dot product
    of the vectors of the articles all_ids[R] and
    all_ids[id_low+C].

    Parameters
    ----------
    vectors : numpy.ndarray
        One vector per row; row i belongs to all_ids[i].
    all_ids : sequence
        Only its length is used here.
    id_low, id_high : int
        Half-open range of positions to score against.
        Positions outside [0, len(all_ids)] are clamped,
        so an empty range yields an array with 0 columns.

    Raises
    ------
    IndexError
        If `vectors` does not have one row per id.
    """
    n_articles = len(all_ids)
    if vectors.shape[0] != n_articles:
        raise IndexError(
            'vectors has {} rows but all_ids has {} entries'.format(
                vectors.shape[0], n_articles
            )
        )
    # A clamped slice picks the same rows as testing
    # id_low <= index < id_high for every index, without a
    # Python-level pass over all the ids.
    low = min(max(id_low, 0), n_articles)
    high = min(max(id_high, 0), n_articles)
    rows = vectors[low:high]
    return vectors @ rows.T

In [18]:
def sort_scores(scores, all_ids, cur_ids, n_recs=51):
    """
    Returns a dictionary of lists. The keys are the ids
    of articles we're currently evaluating.

    The lists contain tuples of scores and ids. The
    score is the similarity score between the current article
    and the other component of the tuple. Each list is sorted
    by decreasing score and holds at most `n_recs` tuples.

    Parameters
    ----------
    scores : numpy.ndarray
        2-D array; column C holds the scores of every article
        in `all_ids` against `cur_ids[C]`.
    all_ids : sequence
        Ids labelling the rows of `scores`.
    cur_ids : sequence
        Ids labelling the columns of `scores`.
    n_recs : int, optional
        Maximum number of (score, id) tuples kept per article.
    """
    recs = {}
    for col_num, col in enumerate(scores.T):
        article_id = cur_ids[col_num]
        # Pair each score with the id of its row; the ids come from the
        # all_ids argument, not from a notebook-level global.
        ranked = sorted(zip(col, all_ids), key=lambda pair: pair[0], reverse=True)
        recs[article_id] = ranked[:n_recs]
    return recs

In [23]:
def get_recs_paper_id(arxiv_id, ids, vectors):
    """
    Compute the recommendations for a single article.

    Finds the position of `arxiv_id` in the list `ids`,
    scores every article against that one with `score`
    and ranks the result with `sort_scores`.

    Returns the dictionary produced by `sort_scores`,
    whose only key is the id found at that position.

    Raises whatever `ids.index` raises when `arxiv_id`
    is not present (ValueError for a list).
    """
    position = ids.index(arxiv_id)
    next_position = position + 1
    return sort_scores(
        score(vectors, ids, position, next_position),
        ids,
        ids[position:next_position],
    )

In [20]:
# Time one full lookup (scoring plus sorting) for a single article.
%time recs = get_recs_paper_id('1801.08262', ids, vectors)

CPU times: user 2.81 s, sys: 692 ms, total: 3.5 s
Wall time: 2.26 s


In [21]:
# sort_scores keeps at most 51 (score, id) tuples per article.
len(recs['1801.08262'])

51

In [22]:
# Same timing for a second article; note that this rebinds `recs`.
%time recs = get_recs_paper_id('1805.06077', ids, vectors)

CPU times: user 3.01 s, sys: 568 ms, total: 3.58 s
Wall time: 2.34 s


### Interacting with SQL

We'll define some functions that allow us to store the recommendations in a database so we don't have to compute them again when they're requested by the flask app. In particular the flask app should be able to perform two specific operations with the database.

1. We should be able to check whether a record exists in the table and, if it does, retrieve it and render something for the user.
2. If there is no matching record in the table, we can update the table and then service the user.

In [42]:
from sqlalchemy_arxiv import Session, articles_similar, Base

In [21]:
# Row to insert: the article id and the recommendations computed for it.
args = {
    'id':'1801.08262',
    'recs':recs['1801.08262'],
}

session = Session()
try:
    session.add(articles_similar(**args))
    session.commit()
finally:
    # Always release the session, even if the insert or commit fails.
    session.close()


In [15]:
# Fetch every stored row of the articles_similar table.
session = Session()
try:
    r = session.query(articles_similar).all()
finally:
    # Always release the session, even if the query fails.
    session.close()

In [9]:
def send_to_server(arxiv_id, recs, table_class, session):
    """
    Store the recommendations of one article in the database.

    Parameters
    ----------
    arxiv_id : str
        Id of the article; used as the key into `recs`
        and as the `id` of the new row.
    recs : dict
        Maps article ids to a list of (score, id) pairs.
    table_class : callable
        Called as table_class(id=..., recs=...) to build
        the row that is added to the session.
    session : object
        Must provide add(), commit() and rollback().

    The pairs are flattened into a single whitespace separated
    string "score id score id ...", which is the layout that
    `request_recs` splits back into pairs.

    NOTE(review): this assumes the `recs` column holds text and
    that ids contain no whitespace -- confirm against the table
    definition.
    """
    pairs = recs[arxiv_id]
    recs_text = ' '.join(str(part) for pair in pairs for part in pair)
    new_recs = table_class(id=arxiv_id, recs=recs_text)
    session.add(new_recs)
    try:
        session.commit()
    except Exception:
        # Leave the session usable for the caller after a failed commit.
        session.rollback()
        raise

In [12]:
# NOTE(review): all_ids_list and create_sims_from_id are not defined in the
# visible part of this notebook -- confirm they still exist before running.
id_record = all_ids_list[5]

session = Session()
try:
    recs = create_sims_from_id(id_record, all_ids_list=all_ids_list)
    send_to_server(id_record, recs, articles_similar, session)
finally:
    # Always release the session, even if computing or storing fails.
    session.close()

In [13]:
def request_recs(id_request, table_class, session):
    """
    Fetch the stored recommendations for one article.

    Parameters
    ----------
    id_request : str
        Id of the article to look up.
    table_class : class
        Table class with an `id` attribute to filter on and
        a `recs` attribute holding a whitespace separated string.
    session : object
        Session used to run the query.

    Returns
    -------
    list of [str, str]
        Consecutive tokens of the stored string, grouped in pairs.
        A trailing unpaired token is dropped.

    Raises
    ------
    IndexError
        If no row matches `id_request`.
    """
    query = session.query(table_class).filter(table_class.id == id_request)
    # Only the first match is needed, so do not load every matching row.
    record = query.first()
    if record is None:
        raise IndexError(
            'no recommendations stored for id {!r}'.format(id_request)
        )
    tokens = record.recs.split()
    return [[first, second] for first, second in zip(tokens[0::2], tokens[1::2])]
    