In [64]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np

DATA_PATH = '/Users/johnpentakalos/Development/open_empi/data/'
TEST_RUN_SAMPLES = 500

In [19]:
import embed_record as er
import ray
ray.init()

### Read in File and Model

In [49]:
test_data = pd.read_csv(DATA_PATH + 'test-data-6k.csv')
hf_model = SentenceTransformer('all-mpnet-base-v2')
hf_model_ref = ray.put(hf_model)
test_data

Unnamed: 0,rec_id,given_name,surname,address_1,address_2,city,postalcode,state,date_of_birth,country,phone_number,soc_sec_id,sex,age
0,rec-1662-org,Jake,Blackwell,79 Fitzroy Street,Killarney,Qtas De Villa Blanca,86042,GA,19381016,USA,2656203600,301592586,F,30
1,rec-819-org,Pino,Berry,4 Earle Place,Showgrounds,Milwaukie,45419,OH,19710121,USA,5148899674,114656944,F,30
2,rec-3524-org,Brianna,Paragalli,8 Montgomery Place,Noosa Outlook,Villas De Almeria,55109,CT,19501223,USA,8278600714,325822569,M,22
3,rec-4908-dup-0,Rubi,Bishop,15 Dorrit Black Crescent,,Lenexa,50588,TX,19820818,USA,9058954781,561093848,M,31
4,rec-3803-org,Alexandra,Bishop,213 Phillip Avenue,Bernleigh,Northgate,99349,OK,19231116,USA,,826519778,F,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6002,mrn-1236,Other,Ninja,123 Secondary Street,Somecity,Reston,20191,VA,19770806,USA,6348192381,121212121,M,26
6003,mrn-1237,Other,Ninja,123 Secondary Street,Somecity,Reston,20191,VA,19770806,USA,6348192381,121212121,M,26
6004,mrn-1240,Test,Patient,101 Main Street,,Reston,20191,VA,19861005,USA,6348293482,122312121,F,26
6005,mrn-1241,Test,Patient,101 Main Street,,Reston,20191,VA,19861005,USA,6348293482,122312121,F,26


### Encode Strings

In [55]:
# Get concatenated string versions of records
result = test_data.drop('rec_id', axis=1).apply(lambda x: ', '.join(map(str, x)), axis=1)
train_data = pd.DataFrame(result)
train_data.columns = ['concat_user_data']
train_data = train_data.iloc[:TEST_RUN_SAMPLES]


  test_val = train_data.iloc[0][0]


array([[ 0.02433742,  0.02945035, -0.00298819, ...,  0.00631233,
        -0.08409794, -0.01429206],
       [ 0.00716496, -0.02096505,  0.01981294, ..., -0.01427543,
        -0.04235324,  0.01116752],
       [-0.00162916,  0.02848027, -0.03422251, ...,  0.00411329,
        -0.04509211, -0.01832399],
       ...,
       [ 0.01169032,  0.04937701,  0.00966986, ..., -0.0127758 ,
        -0.07125518,  0.00153072],
       [ 0.01856472,  0.01036626, -0.01559971, ..., -0.01597995,
        -0.04757719, -0.03932989],
       [ 0.02928054,  0.03612335,  0.0179114 , ...,  0.02488747,
        -0.02840073, -0.01702896]], dtype=float32)

In [None]:
# Generate embeddings for the shortened set of records
embeddings = hf_model.encode(list(train_data.concat_user_data)) # Each record takes about 1s to encode.
test_val = train_data.iloc[0][0]
embeddings

### Using Ray to Encode Strings

In [56]:

@ray.remote
def encode_string(text):
    model = ray.get(hf_model_ref)
    embeddings = model.encode(text)
    return embeddings
strings_to_encode = list(train_data.concat_user_data)

In [57]:
encoding_tasks = [encode_string.remote(text) for text in strings_to_encode]
ray_embeddings = ray.get(encoding_tasks)
ray_embeddings

[array([ 2.43374258e-02,  2.94503476e-02, -2.98817828e-03,  1.65601447e-02,
        -1.94300879e-02, -1.59314070e-02,  3.89865302e-02,  2.31973398e-02,
         1.25555983e-02,  4.09810543e-02, -1.24407038e-02,  1.09929002e-04,
         4.17448469e-02,  4.11224216e-02,  1.26138497e-02, -4.07936797e-02,
         1.63352229e-02,  4.38878825e-03,  1.34052243e-02, -2.85639446e-02,
        -8.32333192e-02,  3.42372619e-02, -3.49442363e-02,  1.24041717e-02,
         6.97302166e-04,  4.72916029e-02, -6.15261830e-02,  3.10587045e-02,
         2.36213729e-02,  2.19269339e-02, -1.44185945e-02,  2.42243651e-02,
        -4.65464890e-02,  4.21802700e-02,  1.92139169e-06, -1.21483365e-02,
        -2.27041040e-02, -1.39366966e-02, -6.50694370e-02,  8.93467048e-04,
         6.76652929e-03,  4.44600359e-02, -4.06170972e-02,  6.16118591e-03,
        -4.97610755e-02, -5.79143614e-02,  3.38055715e-02, -2.03230456e-02,
        -3.39391604e-02, -6.75413711e-03,  5.76948165e-04, -4.56562702e-04,
         2.5

### Generate Cosine Similarities

In [61]:
cosine_similarities = cosine_similarity(embeddings, embeddings)
cosine_similarities

array([[1.        , 0.46202236, 0.6409489 , ..., 0.6381742 , 0.51206034,
        0.6113167 ],
       [0.46202236, 1.        , 0.3370966 , ..., 0.3409667 , 0.32326257,
        0.30249688],
       [0.6409489 , 0.3370966 , 1.0000001 , ..., 0.5035881 , 0.59506387,
        0.63241434],
       ...,
       [0.6381742 , 0.3409667 , 0.5035881 , ..., 1.        , 0.45212513,
        0.60183764],
       [0.51206034, 0.32326257, 0.59506387, ..., 0.45212513, 1.0000001 ,
        0.67684156],
       [0.6113167 , 0.30249688, 0.63241434, ..., 0.60183764, 0.67684156,
        0.99999976]], dtype=float32)

In [75]:
test_data['id'] = test_data.rec_id.str.split('-').str[1]
tmp_matrix = (test_data['id'].values[:, np.newaxis] == test_data['id'].values).astype(int)
actuals = pd.DataFrame(tmp_matrix)
actuals.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5997,5998,5999,6000,6001,6002,6003,6004,6005,6006
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
ray.shutdown()