In [53]:
import os

import numpy as np
import pandas as pd
import torch
from scipy.spatial.distance import cosine
from transformers import AutoTokenizer, AutoModel

In [25]:
# SciBERT checkpoint shared by the tokenizer and the encoder. Upstream list of
# recommended models: https://github.com/allenai/scibert#tensorflow-models
SCIBERT_CHECKPOINT = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(SCIBERT_CHECKPOINT)
model = AutoModel.from_pretrained(SCIBERT_CHECKPOINT)

In [26]:
# Two example titles to compare.
# An alternative second title to try: "US Stocks from 1970 to Now".
title1, title2 = (
    "NBA Player Stats for the 2017-18 Season",
    "Pikles and Bananas",
)

In [41]:
# Encode each title separately as PyTorch tensors ("pt"); every call receives
# a single string, so each encoding holds exactly one sequence.
inputs1, inputs2 = (
    tokenizer(title, return_tensors="pt", padding=True)
    for title in (title1, title2)
)

In [42]:
# Inspect how the first title was split: take the ids of the first sequence
# in the encoding and map them back to their vocabulary tokens.
token_ids = inputs1["input_ids"][0]
decoded_tokens = tokenizer.convert_ids_to_tokens(token_ids)

# One line per token, pairing each piece with its id.
for piece, piece_id in zip(decoded_tokens, token_ids):
    line = f"Token: {piece}, Token ID: {piece_id}"
    print(line)

Token: [CLS], Token ID: 102
Token: nb, Token ID: 10516
Token: ##a, Token ID: 30110
Token: player, Token ID: 8774
Token: stat, Token ID: 1731
Token: ##s, Token ID: 30113
Token: for, Token ID: 168
Token: the, Token ID: 111
Token: 2017, Token ID: 4585
Token: -, Token ID: 579
Token: 18, Token ID: 1178
Token: season, Token ID: 7843
Token: [SEP], Token ID: 103


In [37]:
# Same inspection for the second title: ids of the first sequence in the
# encoding, mapped back to their vocabulary tokens.
token_ids = inputs2["input_ids"][0]
decoded_tokens = tokenizer.convert_ids_to_tokens(token_ids)

# One line per token, pairing each piece with its id.
for piece, piece_id in zip(decoded_tokens, token_ids):
    line = f"Token: {piece}, Token ID: {piece_id}"
    print(line)

Token: [CLS], Token ID: 102
Token: pik, Token ID: 27130
Token: ##les, Token ID: 671
Token: and, Token ID: 137
Token: ban, Token ID: 6550
Token: ##ana, Token ID: 4846
Token: ##s, Token ID: 30113
Token: [SEP], Token ID: 103


In [28]:
# Run both encodings through the model. Gradient tracking is switched off
# because these are inference-only forward passes.
with torch.no_grad():
    output1, output2 = model(**inputs1), model(**inputs2)

In [32]:
# Dimensions of the final hidden states for the first title.
output1.last_hidden_state.shape

torch.Size([1, 13, 768])

In [38]:
# Dimensions of the final hidden states for the second title.
output2.last_hidden_state.shape

torch.Size([1, 8, 768])

In [22]:
# Collapse each title to a single vector: take the first sequence of the
# batch, average over its token axis (dim 0), and convert to a NumPy array.
embeddings1, embeddings2 = (
    out['last_hidden_state'][0].mean(dim=0).detach().numpy()
    for out in (output1, output2)
)

In [23]:
# scipy's `cosine` is a distance, so subtract it from 1 to get a similarity.
cosine_distance = cosine(embeddings1, embeddings2)
similarity = 1 - cosine_distance
print("Similarity:", similarity)

Similarity: 0.6187241077423096


### Test on Real Data

In [46]:
from pandas import DataFrame
import psycopg2
from collections import defaultdict

In [44]:
# Connect to the database with metadata.
# Settings are read from the standard libpq environment variables when set;
# the fallbacks reproduce the previous local-development values.
# NOTE(review): export PGPASSWORD instead of relying on the fallback so that
# real credentials never live in the notebook.
DB_SETTINGS = {
    "dbname": os.environ.get("PGDATABASE", "training_database"),
    "user": os.environ.get("PGUSER", "postgres"),
    "password": os.environ.get("PGPASSWORD", "default"),
    "host": os.environ.get("PGHOST", "localhost"),
    "port": os.environ.get("PGPORT", "5432"),
}

# Labels applied positionally to whatever columns `SELECT *` returns.
# NOTE(review): assumes this order matches the table definition -- confirm.
METADATA_COLUMNS = ['UID', 'Topic', 'Title', 'Description', 'Source',
                    'Tags', 'Licenses', 'Col_names', 'Row_count', 'Col_count',
                    'Entry_count', 'Null_count', 'Usability']

# Everything is pulled into pandas in one go, so the cursor and connection
# are released as soon as the rows are fetched (even if the query fails).
conn = psycopg2.connect(**DB_SETTINGS)
try:
    with conn.cursor() as cursor:
        cursor.execute('SELECT * from dataset')
        # Create pandas data frame with metadata
        metadata_raw = DataFrame(cursor.fetchall(), columns=METADATA_COLUMNS)
finally:
    conn.close()

# Keep only the first row per title; the raw frame stays available as
# `metadata_raw` so this step can be re-run or inspected.
# TODO: the original author observed each sports dataset duplicated three
# times in the table -- find out why.
metadata = metadata_raw.drop_duplicates(subset="Title", keep="first").reset_index(drop=True)
metadata.head()

Unnamed: 0,UID,Topic,Title,Description,Source,Tags,Licenses,Col_names,Row_count,Col_count,Entry_count,Null_count,Usability
0,245,sports,2022 Game Winner Sports Betting Data,### Context\nA dear friend of mine has a sport...,kaggle,"{games,brazil,sports,gambling,python}","{""Attribution 4.0 International (CC BY 4.0)""}","{Data,Entrada,FunÃ§Ã£o,Fundamento,Investimento...",3691,45,27454,91,0.941176
1,246,sports,Find the Sports (Object Detection),I have used alot of Kaggle Datasets and I want...,kaggle,"{sports,""computer science""}",{CC-BY-SA-4.0},"{ImageID,XMin,YMin,XMax,YMax,Labels,OriginalUR...",6538,21,47887,0,0.882353
2,247,sports,Forbes Highest Paid Athletes 1990-2020,### Context\n\nHere is a completel list of the...,kaggle,{sports},{CC0-1.0},"{S.NO,Name,Nationality,""Current Rank"",""Previou...",301,8,2408,24,0.823529
3,248,sports,H&M Sports Apparel Data Set(9k+),The H&M Sports Apparel Data Set is a comprehen...,kaggle,"{global,sports,""clothing and accessories"",begi...",{CC0-1.0},"{Name_of_product,category,""price_of_product(in...",9146,8,73168,0,1.0
4,249,sports,Men's Professional Basketball,"This dataset contains stats on players, coache...",kaggle,{basketball},{other},"{abbrev_type,code,full_name,year,coachID,award...",21821,156,394013,59750,0.852941


In [48]:
# Accumulator for pairwise title similarities, filled in as
# naive_full_rec_list[uid_a][uid_b] = similarity. Being a defaultdict(dict),
# the inner dict for a UID is created automatically on first access.
naive_full_rec_list = defaultdict(dict)

In [49]:
def embed_title(title):
    """Return one NumPy vector for `title`.

    The title is tokenized, run through the model without gradient tracking,
    and the final hidden states of the first sequence are averaged over the
    token axis.
    """
    encoded = tokenizer(title, return_tensors="pt", padding=True)
    with torch.no_grad():
        hidden_states = model(**encoded)['last_hidden_state']
    return hidden_states[0].mean(dim=0).detach().numpy()


# Embed every title exactly once. The model forward pass is the expensive
# step, and running it inside the pairwise loop costs 2 * n^2 passes instead
# of n. Keyed by UID, in row order.
title_embeddings = {
    row['UID']: embed_title(row['Title'])
    for _, row in metadata.iterrows()
}

# Fill the full (symmetric) similarity table from the cached vectors.
# scipy's `cosine` is a distance, hence the `1 -`.
for uid_a, vector_a in title_embeddings.items():
    for uid_b, vector_b in title_embeddings.items():
        naive_full_rec_list[uid_a][uid_b] = 1 - cosine(vector_a, vector_b)

In [50]:
# Display the whole nested mapping (one inner dict of similarities per UID).
naive_full_rec_list

defaultdict(dict,
            {245: {245: 1,
              246: 0.7098619341850281,
              247: 0.7771100997924805,
              248: 0.8072810173034668,
              249: 0.7943781614303589,
              250: 0.7771918177604675,
              276: 0.7380040287971497,
              251: 0.8483273983001709,
              252: 0.793070912361145,
              253: 0.876603364944458,
              254: 0.8014940023422241,
              255: 0.7057815790176392,
              256: 0.772721529006958,
              257: 0.7108669877052307,
              258: 0.7329216599464417,
              259: 0.7712492942810059,
              260: 0.7613843679428101,
              261: 0.6596981883049011,
              262: 0.6513696312904358,
              263: 0.7050653100013733,
              264: 0.7859569787979126,
              265: 0.7526029348373413,
              266: 0.7085951566696167,
              267: 0.6042238473892212,
              268: 0.8099727034568787,
              269: 0.8

In [52]:
# Metadata row(s) whose UID is 245.
metadata.loc[metadata['UID'].eq(245)]

Unnamed: 0,UID,Topic,Title,Description,Source,Tags,Licenses,Col_names,Row_count,Col_count,Entry_count,Null_count,Usability
0,245,sports,2022 Game Winner Sports Betting Data,### Context\nA dear friend of mine has a sport...,kaggle,"{games,brazil,sports,gambling,python}","{""Attribution 4.0 International (CC BY 4.0)""}","{Data,Entrada,FunÃ§Ã£o,Fundamento,Investimento...",3691,45,27454,91,0.941176


In [62]:
# Similarities of every dataset to UID 245, as {uid: similarity}.
sports = naive_full_rec_list[245]

# Two-column frame so the scores can be joined onto the metadata by UID.
similarity_df = pd.DataFrame(list(sports.items()), columns=['UID', 'Similarity'])
merged_df = metadata.merge(similarity_df, on='UID')

# Most similar first.
sorted_df = merged_df.sort_values('Similarity', ascending=False)

In [63]:
# Ten highest-similarity rows; the query dataset itself comes first with 1.0.
# NOTE(review): the recorded output mixes finance, housing and health titles
# in with sports, so title-only similarity looks like a weak topic signal.
sorted_df.head(10)

Unnamed: 0,UID,Topic,Title,Description,Source,Tags,Licenses,Col_names,Row_count,Col_count,Entry_count,Null_count,Usability,Similarity
0,245,sports,2022 Game Winner Sports Betting Data,### Context\nA dear friend of mine has a sport...,kaggle,"{games,brazil,sports,gambling,python}","{""Attribution 4.0 International (CC BY 4.0)""}","{Data,Entrada,FunÃ§Ã£o,Fundamento,Investimento...",3691,45,27454,91,0.941176,1.0
9,253,sports,Sports Car Prices dataset,This dataset contains information about the pr...,kaggle,"{beginner,intermediate,tabular,regression,""ret...","{""Attribution 4.0 International (CC BY 4.0)""}","{""ï»¿Car Make"",""Car Model"",Year,""Engine Size (...",1007,8,8056,13,1.0,0.876603
42,287,finance,Finance Loan approval Prediction Data,Finance company deals in all loans. The custom...,kaggle,"{""exploratory data analysis"",""data visualizati...",{CC0-1.0},"{Loan_ID,Gender,Married,Dependents,Education,S...",981,25,12386,233,1.0,0.860746
7,251,sports,Online Sports Betting,# Online Sports Betting\n### A State-by-State ...,kaggle,"{sports,gambling}",{other},"{year,unsheltered,sheltered,""Unnamed: 0"",""new ...",306,65,3195,944,0.941176,0.848327
27,271,housing,Paris Housing Price Prediction,# Context\n\nThis is a set of data created fro...,kaggle,"{""cities and urban areas"",housing,""real estate...",{copyright-authors},"{squareMeters,numberOfRooms,hasYard,hasPool,fl...",10000,17,170000,0,0.911765,0.827573
25,269,housing,Housing Prices Dataset,![](https://raw.githubusercontent.com/Masterx-...,kaggle,"{""real estate""}",{CC0-1.0},"{price,area,bedrooms,bathrooms,stories,mainroa...",545,13,7085,0,1.0,0.826265
24,268,housing,Housing Price Prediction,This dataset provides comprehensive informatio...,kaggle,"{categorical,""real estate"",""data visualization...",{CC0-1.0},"{price,area,bedrooms,bathrooms,stories,mainroa...",545,13,7085,0,1.0,0.809973
3,248,sports,H&M Sports Apparel Data Set(9k+),The H&M Sports Apparel Data Set is a comprehen...,kaggle,"{global,sports,""clothing and accessories"",begi...",{CC0-1.0},"{Name_of_product,category,""price_of_product(in...",9146,8,73168,0,1.0,0.807281
10,254,sports,Sports Stadium Locations,### Content\n\nContains the latitude and longi...,kaggle,"{football,baseball,basketball,sports,tabular}",{CC0-1.0},"{ï»¿Team,League,Division,Lat,Long}",151,5,755,0,0.941176,0.801494
28,272,health,COVID-19 Healthy Diet Dataset,"&gt; ### “Health requires healthy food.""###\nR...",kaggle,"{nutrition,""public health"",health,food,""health...",{CC-BY-SA-3.0},"{Country,""Alcoholic Beverages"",""Animal Product...",703,130,21806,144,1.0,0.801141
