In [9]:
from pathlib import Path
from datetime import datetime
import json

import polars as pl
import numpy as np
from tqdm import tqdm

In [10]:
events_by_id = {}


# just fill events_by_id from parsed data
events_dir = Path('meetup_parser/simple/meetup_parsing/nl--Amsterdam/')
for event_dir in events_dir.iterdir():
    if not event_dir.is_dir():
        continue

    with open(event_dir / 'event') as event_fd:
        event = json.loads(event_fd.read())
        event_start_dt = datetime.fromisoformat(event['event_start_dt'])
        event['event_start_ts'] = event_start_dt.timestamp()
        
        attendees_data = (json.loads(attendee_file.open().read()) for attendee_file in (event_dir / 'attendees').iterdir())
        
        event_attendees = {
            attendee_data['member_id']: attendee_data
            for attendee_data in attendees_data
        }
        
        events_by_id[event['id']] = {
            'event': event,
            'attendees': event_attendees,
        }

In [11]:
from fastembed import TextEmbedding

multilingual_large_model = TextEmbedding("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
# events_descr_embeddings = np.load('minilm_384d_embeddings_amsterdam.npy')

Fetching 5 files: 100%|██████████████████████████████████████| 5/5 [00:00<00:00, 31394.49it/s]


In [12]:
def get_attendee_descrition(topics):
    if len(topics) == 0:
        return ''
        
    attendee_descrition = 'Hello! My interests are: '
    for topic in topics:
        attendee_descrition += topic['name'] + ', '
    return attendee_descrition

In [13]:
attendees = [(attendee_data['member_id'], get_attendee_descrition(attendee_data['topics'])) for event in events_by_id.values() for attendee_id,attendee_data  in event['attendees'].items()]
print('total attendees:', len(attendees))
attendees[0]
# attendees_by_id = 

total attendees: 21090


('171711162',
 'Hello! My interests are: Духовный поиск, Цельная растительная диета, Развитие детей, Биология, Естественное обучение, Личностный рост, Молодые специалисты, Фитнес, Предпринимательство, Обучение, Цигун, Исцеление, Еда и питье, Самопомощь и самосовершенствование, Здоровый образ жизни, Похудение, Трансформация жизни, Питание, Жизненный коучинг, Духовный рост, ')

In [14]:
attendees_by_id = {attendee_id: description for attendee_id, description in attendees}
print('attendees_by_id len:', len(attendees_by_id))
attendees_by_id['171711162']

attendees_by_id len: 17434


'Hello! My interests are: Духовный поиск, Цельная растительная диета, Развитие детей, Биология, Естественное обучение, Личностный рост, Молодые специалисты, Фитнес, Предпринимательство, Обучение, Цигун, Исцеление, Еда и питье, Самопомощь и самосовершенствование, Здоровый образ жизни, Похудение, Трансформация жизни, Питание, Жизненный коучинг, Духовный рост, '

In [15]:
attendees_data_by_id = {attendee_data['member_id']: attendee_data for event in events_by_id.values() for attendee_id,attendee_data  in event['attendees'].items()}
attendees_data_by_id['171711162']

{'id': '1982703310',
 'member_id': '171711162',
 'name': 'Bas Snippert',
 'topics': [{'id': '1527', 'name': 'Духовный поиск'},
  {'id': '5951', 'name': 'Цельная растительная диета'},
  {'id': '8830', 'name': 'Развитие детей'},
  {'id': '10107', 'name': 'Биология'},
  {'id': '10421', 'name': 'Естественное обучение'},
  {'id': '15133', 'name': 'Личностный рост'},
  {'id': '15231', 'name': 'Молодые специалисты'},
  {'id': '17293', 'name': 'Фитнес'},
  {'id': '17390', 'name': 'Предпринимательство'},
  {'id': '18522', 'name': 'Обучение'},
  {'id': '18796', 'name': 'Цигун'},
  {'id': '20511', 'name': 'Исцеление'},
  {'id': '20882', 'name': 'Еда и питье'},
  {'id': '21517', 'name': 'Самопомощь и самосовершенствование'},
  {'id': '21774', 'name': 'Здоровый образ жизни'},
  {'id': '22936', 'name': 'Похудение'},
  {'id': '23013', 'name': 'Трансформация жизни'},
  {'id': '23034', 'name': 'Питание'},
  {'id': '23063', 'name': 'Жизненный коучинг'},
  {'id': '23765', 'name': 'Духовный рост'}],
 'mem

In [None]:
# uncomment to gen embeddings
# embeddings_generator = multilingual_large_model.embed(attendees_by_id.values())  # reminder this is a generator
# print('start embedding generation')
# attendees_embeddings_by_id = {attendee_id: embedding for attendee_id, embedding in zip(attendees_by_id.keys(), tqdm(embeddings_generator))}
# attendees_embeddings = np.array(list(attendees_embeddings_by_id.values()))
# print('embedding generation done!')
# np.save('minilm_384d_amsterdam_attendees_embeddings_v2', attendees_embeddings)
# print('total attendees embeddings:', len(attendees_embeddings))

In [16]:
events_descr_embeddings = np.load('minilm_384d_embeddings_amsterdam.npy')
attendees_embeddings = np.load('minilm_384d_amsterdam_attendees_embeddings_v2.npy')

print('events_descr_embeddings len', len(events_descr_embeddings))
print('attendees_embeddings len', len(attendees_embeddings))

events_descr_embeddings len 2025
attendees_embeddings len 17434


In [17]:
events_embeddings_by_id = {event_id: event_embedding for event_id, event_embedding in zip(events_by_id.keys(), events_descr_embeddings)}
print('events_embeddings_by_id len', len(events_embeddings_by_id))

attendees_embeddings_by_id = {attendee_id: embedding for attendee_id, embedding in zip(attendees_by_id.keys(), attendees_embeddings)}
print('attendees_embeddings_by_id len', len(attendees_embeddings_by_id))

events_embeddings_by_id len 2025
attendees_embeddings_by_id len 17434


In [18]:
# SetUp qdrant client
from qdrant_client import QdrantClient, models

qdrant_client = QdrantClient(url="http://localhost:6333")

EVENTS_COLLECTION = 'events'
ATTENDEES_COLLECTION = 'attendees'

if qdrant_client.collection_exists(EVENTS_COLLECTION):
    qdrant_client.delete_collection(EVENTS_COLLECTION)
qdrant_client.create_collection(
    collection_name=EVENTS_COLLECTION,
    vectors_config=models.VectorParams(
        size=384, distance=models.Distance.COSINE, on_disk=True
    ),
)

if qdrant_client.collection_exists(ATTENDEES_COLLECTION):
    qdrant_client.delete_collection(ATTENDEES_COLLECTION)
qdrant_client.create_collection(
    collection_name=ATTENDEES_COLLECTION,
    vectors_config=models.VectorParams(
        size=384, distance=models.Distance.COSINE, on_disk=True
    ),
)

# Batch Upload events
# qdrant_client.upsert(
#     collection_name="events",
#     points=models.Batch(
#         payloads=events_by_id.values(),
#         vectors=events_descr_embeddings,
#     ),
# )

qdrant_client.upload_collection(
    collection_name=EVENTS_COLLECTION,
    payload=events_by_id.values(),
    vectors=events_descr_embeddings,
    parallel=4,
    max_retries=3,
)

qdrant_client.upload_collection(
    collection_name=ATTENDEES_COLLECTION,
    payload=attendees_data_by_id.values(),
    vectors=attendees_embeddings,
    parallel=4,
    max_retries=3,
)

In [11]:
from scipy.spatial import distance

similarities = []
similarities_amount = 0

for event_id, event_embedding in zip(events_by_id.keys(), events_descr_embeddings):
    event = events_by_id[event_id]
    event_attendies_similarities = []
    amount = 0

    if (event['event']['description']) == 0:
        continue
    
    for attendee in event['attendees'].values():
        attendee_id = attendee['member_id']
        if len(event['attendees'][attendee_id]['topics']) == 0:
            # attendees without descr is not representative
            continue
        attendee_embedding = attendees_embeddings_by_id[attendee_id]
        event_attendies_similarities.append(distance.cosine(event_embedding, attendee_embedding))
        amount += 1

    if amount == 0:
        # no attendees ???, skip
        continue
        
    sim = sum(event_attendies_similarities) / amount
    print(f'event {event_id} sims: {event_attendies_similarities}')
    similarities.append(sim)
    similarities_amount += 1

total_sim = sum(similarities) / similarities_amount
print('total sim', total_sim)

event 298415155 sims: [0.47643113136291504, 0.4657807946205139, 0.4533889130767853, 0.4946373701095581, 0.6015803457719381]
event 300475806 sims: [0.4889652729034424, 0.43718916177749634, 0.3922033672211537, 0.39823208446002234, 0.39342039823532104, 0.43481632678605975, 0.42381417751312256, 0.5645906627178192, 0.4512898325920105]
event 297437934 sims: [0.5092213305598892, 0.5398383897494787, 0.571690559387207, 0.5379524528980255, 0.5766396774864382, 0.5677107572555542, 0.5215625166893005, 0.5513594612884358, 0.5506224036216736, 0.5955215451432188, 0.6401866757980981, 0.5079082548618317, 0.5344427824020386]
event 300286368 sims: [0.415306031703949, 0.49688923358917236, 0.5191877174861591, 0.41162919998168945, 0.4700820165823679, 0.4379058741274987, 0.5140651021619169, 0.5158830881118774]
event 300698921 sims: [0.6701695322990417, 0.5973589420318604]
event 300426624 sims: [0.48722955508408183, 0.6200501024723053]
event 300135873 sims: [0.6021159589290619, 0.578511761978619, 0.61396738886

In [12]:
# just check event and one of attendees
test_attendee_1 = attendees_by_id['417403295']
print('test_attendee_1', test_attendee_1)

test_attendee_1_emb = attendees_embeddings_by_id['417403295']
event_1 = events_by_id['300417142']
print('\n')
print('event_1 descr', event_1['event']['description'])

event_1_emb = events_embeddings_by_id['300417142']
sim = distance.cosine(event_1_emb, test_attendee_1_emb)
print('sim', sim)

test_attendee_1 Hello! My interests are: Актерское мастерство, Недавно в городе, Расширение прав и возможностей женщин, Маркетинговая стратегия для малого бизнеса, Киноиндустрия, Женщины-предприниматели, Маркетинг, 


event_1 descr Have you ever dreamed of stepping onto the sets of your favorite TV shows and films? Are you curious about the world behind the scenes of the entertainment industry while working next to your favourite Actors

In this course, you'll learn everything you need to know to kickstart your career as a Flim & TV extra, from understanding the role and responsibilities to navigating getting Job’s and set etiquette. Whether you're an aspiring actor looking to gain experience or simply seeking a fun way to make some extra cash, this is for you.

Led by industry professionals with years of experience in acting and production, you'll gain insider knowledge and practical tips that will set you apart in the world of TV extras. Through interactive lessons, real-life example

In [13]:
# top 3 similar event descriptions for this attendee
for i, result in enumerate(qdrant_client.search('events', test_attendee_1_emb, limit=3)):
    print('result', i, result.payload['event']['description'], end='\n------\n\n')
    print('score', result.score)

result 0 Have you ever dreamed of stepping onto the sets of your favorite TV shows and films? Are you curious about the world behind the scenes of the entertainment industry while working next to your favourite Actors

In this course, you'll learn everything you need to know to kickstart your career as a Flim & TV extra, from understanding the role and responsibilities to navigating getting Job’s and set etiquette. Whether you're an aspiring actor looking to gain experience or simply seeking a fun way to make some extra cash, this is for you.

Led by industry professionals with years of experience in acting and production, you'll gain insider knowledge and practical tips that will set you apart in the world of TV extras. Through interactive lessons, real-life examples, and practical exercises, you'll develop the skills and confidence needed to succeed in this dynamic industry.

By the end of the course, you'll be ready to hit the ground running, armed with the tools and insights to pur

In [14]:
events_by_attendees = {}
for event_id, event in events_by_id.items():
    for attendee in event['attendees'].values():
        attendee_id = attendee['member_id']
        if not attendee_id in events_by_attendees:
            events_by_attendees[attendee_id] = []
        events_by_attendees[attendee_id].append(event_id)

In [15]:
K = 5

# Basic DSSM approach for top 5 events
recomendations_by_attendees_top_5 = {}


for attendee_id, events_ids in events_by_attendees.items():
    req_attendee_emb = attendees_embeddings_by_id[attendee_id]
    top_k_recomendatons = qdrant_client.search('events', req_attendee_emb, limit=K, with_vectors=True, with_payload=True)
    recomendations_by_attendees_top_5[attendee_id] = []
    
    for rec in top_k_recomendatons:
        recomendations_by_attendees_top_5[attendee_id].append(rec)

recomendations_by_attendees_top_5['171711162']

[ScoredPoint(id='39339dcd-66a3-4cf2-87b3-cff40fcaa20e', version=3, score=0.6852265, payload={'attendees': {'199164293': {'id': '1993954421', 'member_id': '199164293', 'membership_role': 'ORGANIZER', 'membership_status': 'LEADER', 'name': 'Anne', 'topics': [{'id': '1527', 'name': 'Духовный поиск'}, {'id': '10438', 'name': 'DIY (сделай сам)'}, {'id': '14657', 'name': 'Социальные вопросы'}, {'id': '16285', 'name': 'Путешествия'}, {'id': '16346', 'name': 'Ремёсла'}, {'id': '16881', 'name': 'Женская социальная сеть'}, {'id': '16944', 'name': 'Расширение прав и возможностей женщин'}, {'id': '17012', 'name': 'Мир'}, {'id': '17704', 'name': 'Танец и движение'}, {'id': '17865', 'name': 'Время развлечений'}, {'id': '18280', 'name': 'Сотрудничество творческих умов'}, {'id': '18489', 'name': 'Ходьба'}, {'id': '18522', 'name': 'Обучение'}, {'id': '19991', 'name': 'Японская культура'}, {'id': '20453', 'name': 'Сторителлинг'}, {'id': '20509', 'name': 'Уроки танцев'}, {'id': '20970', 'name': 'Беседа'}

In [16]:
# take top 5 first
recomendations_by_attendees = {
    attendee_id: [recomendation.payload['event']['id'] for recomendation in recomendations]
    for attendee_id, recomendations
    in recomendations_by_attendees_top_5.items()
}
print('recomendations_by_attendees len', len(recomendations_by_attendees))

recomendations_by_attendees len 17434


In [17]:
# Precision@K
total_recommended_relevant = 0
total_recommended = 0

for actual_events, recommended_events in zip(events_by_attendees.values(), recomendations_by_attendees.values()):
    relevant_recomendations = set(actual_events) & set(recommended_events)
    total_recommended_relevant += len(relevant_recomendations)
    total_recommended += len(recommended_events)

precision = total_recommended_relevant / total_recommended
print('Precision@K', precision)

Precision@K 0.0013307330503613628


In [18]:
# Recall@K
total_recommended_relevant = 0
total_relevant = 0

for actual_events, recommended_events in zip(events_by_attendees.values(), recomendations_by_attendees.values()):
    relevant_recomendations = set(actual_events) & set(recommended_events)
    total_recommended_relevant += len(relevant_recomendations)
    total_relevant += len(actual_events)

recall = total_recommended_relevant / total_relevant
print('Recall@K', recall)

Recall@K 0.005500237079184448


In [19]:
#F1Score
f1score = 2 * (precision * recall) / (precision + recall)
print('F1@k', f1score)

F1@k 0.002142989100314059


In [20]:
# Coverage
# Coverage is the percent of items that the recommender is able to recommend
import itertools

unique_recomendations = set(itertools.chain.from_iterable(recomendations_by_attendees.values()))
total_recommended = set(events_by_id.keys())
coverage = len(unique_recomendations) / len(total_recommended)
print('Coverage', coverage)

Coverage 0.3037037037037037


In [21]:
# Diversity is inverse proportional to recommandation intra-list similarity
# Similarity is calculated as average cosine similarity for each-to-each elements
# recomendations_by_attendees_top_5['417403295'][0].vector

def compute_intra_list_similarity(vectors: list[np.array]) -> float:
    acc = 0
    amount = 0
    
    cur_elem_index = 0
    for i in range(cur_elem_index, len(vectors)):
        cur_elem = vectors[cur_elem_index]
        # print('cur_elem', cur_elem)
        # print('vectors[i]', vectors[i])
        acc += distance.cosine(cur_elem, vectors[i])
        amount += 1

    return acc / amount

# def compute_diversity(vectors: list[np.array]) -> float:
total_acc = 0
total_amount = 0

for recommended_events in recomendations_by_attendees_top_5.values():
    recommended_events_vectors = [rec.vector for rec in recommended_events]
    intra_list_sim = compute_intra_list_similarity(recommended_events_vectors)
    total_acc += intra_list_sim
    total_amount += 1

diversity = 1 / (total_acc / total_amount)
print('Diversity', diversity)

Diversity 6.261786211931749


In [42]:
# Personalisation
# calcuate as avg frequency of user recommended objects in total recommended objects

from collections import Counter


# count of each of recommended object
recomendations_by_events_counter = Counter(itertools.chain.from_iterable(recomendations_by_attendees.values()))
total_recomendations = sum(recomendations_by_events_counter.values())
total_recomendations_len = len(recomendations_by_attendees.values())

print('total_recomendations', total_recomendations)
print('total_recomendations_len', total_recomendations_len)

total_acc = 0

for recommended_events in recomendations_by_attendees.values():
    acc = 0
    
    for rec_event in recommended_events:
        freq = recomendations_by_events_counter[rec_event] / total_recomendations
        acc += freq

    total_acc += acc

print('total_acc', total_acc)
print('Personalisation', 1 - (total_acc /  total_recomendations_len))



total_recomendations 87170
total_recomendations_len 17434
total_acc 4444.324721807975
Personalisation 0.7450771640582783


In [45]:
# Novelty / Unexpectedness

# average similarity between two lists items
def lists_similarity(left_list: list[np.array], right_list: list[np.array]) -> float:
    acc = 0
    amount = 0
    
    for l_vec in left_list:
        for  r_vec in right_list:
            sim = distance.cosine(l_vec, r_vec)
            acc += sim
            amount += 1
    return acc / amount

acc = 0
amount = 0

for actual_events, recommended_events in zip(events_by_attendees.values(), recomendations_by_attendees.values()):
    actual_events_embs = [events_embeddings_by_id[event_id] for event_id in actual_events]
    recommended_events_embs = [events_embeddings_by_id[event_id] for event_id in recommended_events]
    
    unexpectedness = lists_similarity(actual_events_embs, recommended_events_embs)
    acc += unexpectedness
    amount += 1

print('Novelty', acc / amount)

Novelty 0.5695223906605548


In [47]:
# Serendipity
# is Unexpectedness * Relevance

def calc_relevance(actual_events: list, recommended_events: list) -> int:
    relevant_recomendations = set(actual_events) & set(recommended_events)
    return len(relevant_recomendations)

acc = 0
amount = 0

for actual_events, recommended_events in zip(events_by_attendees.values(), recomendations_by_attendees.values()):
    # calc novelty
    actual_events_embs = [events_embeddings_by_id[event_id] for event_id in actual_events]
    recommended_events_embs = [events_embeddings_by_id[event_id] for event_id in recommended_events]
    
    unexpectedness = lists_similarity(actual_events_embs, recommended_events_embs)

    # calc relevance
    relevance = calc_relevance(actual_events, recommended_events)
    serendipity = unexpectedness * relevance
    
    acc += serendipity
    amount += 1

print('Serendipity', acc / amount)



Serendipity 0.001765089763932042
