In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from torch import nn

# Session table; the 'Description' column is the text that gets embedded below.
df = pd.read_csv('pycon_2024_sessions.csv')

# Pre-trained sentence-transformer used to turn each description into a vector.
embedding_model = SentenceTransformer('BAAI/bge-small-en-v1.5')

# Pre-trained cross-encoder that scores a pair of texts together; the sigmoid
# activation maps its raw output into the (0, 1) range.
cross_encoder = CrossEncoder(
    'cross-encoder/ms-marco-MiniLM-L-12-v2',
    default_activation_function=nn.Sigmoid(),
)

# Embed every description in a single call and keep the result as a tensor.
# Row i of `embeddings` corresponds to the i-th description in list order.
session_descriptions = df['Description'].tolist()
embeddings = embedding_model.encode(session_descriptions, convert_to_tensor=True)

In [4]:
# Remove the '—PyCon AU 2024' text from every title so only the talk name remains.
title_suffix = '—PyCon AU 2024'
df['Title'] = df['Title'].str.replace(title_suffix, '')

In [5]:
# Search the description embeddings against themselves: one result list per
# session, each holding up to 11 hits as {'corpus_id', 'score'} dicts.
# Because queries and corpus are the same set, a session also shows up in its
# own result list.
related_articles = util.semantic_search(
    query_embeddings=embeddings,
    corpus_embeddings=embeddings,
    top_k=11,
)

In [6]:
# Spot-check the hits returned for session 3 (a list of {'corpus_id', 'score'} dicts).
related_articles[3]

[{'corpus_id': 3, 'score': 1.0},
 {'corpus_id': 50, 'score': 0.755312442779541},
 {'corpus_id': 37, 'score': 0.7330219745635986},
 {'corpus_id': 45, 'score': 0.7285691499710083},
 {'corpus_id': 10, 'score': 0.7154734134674072},
 {'corpus_id': 17, 'score': 0.7050767540931702},
 {'corpus_id': 31, 'score': 0.7000476717948914},
 {'corpus_id': 0, 'score': 0.6971954107284546},
 {'corpus_id': 22, 'score': 0.6895818710327148},
 {'corpus_id': 53, 'score': 0.6803258657455444},
 {'corpus_id': 47, 'score': 0.6755890846252441}]

In [7]:
# Re-score every retrieved neighbour with the cross-encoder and store the
# result on the neighbour's dict under 'cross_score'.
#
# Descriptions are looked up by *position* in the same list order that was used
# to build the embeddings: both the enumerate index and 'corpus_id' are row
# positions, so a label-based lookup (df['Description'][n]) would only be
# correct while df keeps its default 0..N-1 index.
descriptions = df['Description'].tolist()

for query_idx, neighbours in enumerate(related_articles):
    query_description = descriptions[query_idx]

    # One [query, candidate] text pair per neighbour, in the neighbours' order.
    cross_encoder_pairs = [
        [query_description, descriptions[doc['corpus_id']]]
        for doc in neighbours
    ]
    cross_scores = cross_encoder.predict(cross_encoder_pairs)

    # predict() returns one score per input pair, so zip keeps them aligned.
    for doc, cross_score in zip(neighbours, cross_scores):
        doc['cross_score'] = cross_score

In [8]:
# Record each neighbour's place in its result list as 'position' (0 = first hit
# returned by the embedding search), so the original ordering is still visible
# after the list is re-sorted by a different score.
for neighbours in related_articles:
    for rank, doc in enumerate(neighbours):
        doc['position'] = rank

In [9]:
# List the hits for session 15 in their stored order, printing the title next to
# the embedding-similarity score and the cross-encoder score.
session_hits = related_articles[15]
for item in session_hits:
    print(f"{item['position']}: {df.iloc[item['corpus_id']]['Title']} Score: {item['score']} Cross Score: {item['cross_score']}")

0: Django, REST and OpenAPI - a gentle introduction Score: 1.0000001192092896 Cross Score: 0.9997504353523254
1: Using Multiple Databases in Django Applications Score: 0.7314367294311523 Cross Score: 0.8777825832366943
2: What Python Can Learn From Other Languages Score: 0.7268749475479126 Cross Score: 0.01298246718943119
3: Notes on Over-Engineering: A Project Post-Mortem Score: 0.7174100875854492 Cross Score: 0.7139442563056946
4: Django on AWS for chump change Score: 0.7087013125419617 Cross Score: 0.490553617477417
5: Simplifying Python Web App Operations: Automating K8s Ops with Open Source Score: 0.6891282796859741 Cross Score: 0.011989663355052471
6: The perfect setup? Not setup.py! Building packages the right way Score: 0.6867312788963318 Cross Score: 0.009206107817590237
7: Better dataframes Score: 0.6783459186553955 Cross Score: 0.007230295799672604
8: A Lazy Person's Guide to Building REST Clients or: How I Learned to Stop Worrying and Love Dunder Overrides Score: 0.66765058

In [12]:
# List the hits for session 14 ordered by cross-encoder score, highest first.
# The leading number is the hit's original 'position' in the embedding search,
# which makes it easy to see how far the re-ranking moved each entry.
reranked_hits = sorted(
    related_articles[14],
    key=lambda doc: doc['cross_score'],
    reverse=True,
)
for item in reranked_hits:
    print(f"{item['position']}: {df.iloc[item['corpus_id']]['Title']} | {item['score']} | {item['cross_score']}")

0: Django on AWS for chump change | 1.0000001192092896 | 0.9998844861984253
1: Django, REST and OpenAPI - a gentle introduction | 0.7087013125419617 | 0.01146089006215334
3: Using Multiple Databases in Django Applications | 0.6755949854850769 | 0.003622404532507062
8: Three Django Apps in a Trenchcoat: Writing an event management website with not a lot of time | 0.6145040988922119 | 0.0021197160240262747
2: Notes on Over-Engineering: A Project Post-Mortem | 0.6810979843139648 | 0.001782374456524849
4: Space Django: Migrating and Redesigning a Database while Hunting for Satellites. | 0.6471710205078125 | 0.0006661649676971138
6: A Lazy Person's Guide to Building REST Clients or: How I Learned to Stop Worrying and Love Dunder Overrides | 0.6447445154190063 | 1.342431551165646e-05
10: Who tests the testers? Making and testing pipelines | 0.6053941249847412 | 1.3191965081205126e-05
7: The perfect setup? Not setup.py! Building packages the right way | 0.6233097314834595 | 1.316238285653526e

In [11]:
import pickle

# Write the related_articles structure to disk so it can be reloaded later
# without recomputing the scores.
# NOTE(review): this cell's execution count (11) precedes the cell above it (12);
# confirm the notebook still produces the same file on Restart & Run All.
with open('related_articles_scores.pkl', 'wb') as scores_file:
    pickle.dump(related_articles, scores_file)