In [3]:
from pathlib import Path
from datetime import datetime
import json

import polars as pl
import numpy as np
from tqdm import tqdm

In [4]:
events_by_id = {}


# just fill events_by_id from parsed data
events_dir = Path('meetup_parser/simple/meetup_parsing/nl--Amsterdam/')
for event_dir in events_dir.iterdir():
    if not event_dir.is_dir():
        continue

    with open(event_dir / 'event') as event_fd:
        event = json.loads(event_fd.read())
        event_start_dt = datetime.fromisoformat(event['event_start_dt'])
        event['event_start_ts'] = event_start_dt.timestamp()
        attendees_data = (json.loads(attendee_file.open().read()) for attendee_file in (event_dir / 'attendees').iterdir())
        
        event_attendees = {
            attendee_data['member_id']: attendee_data
            for attendee_data in attendees_data
        }
        
        events_by_id[event['id']] = {
            'event': event,
            'attendees': event_attendees,
        }

In [5]:
def get_attendee_descrition(topics):
    if len(topics) == 0:
        return ''
        
    attendee_descrition = 'Hello! My interests are: '
    for topic in topics:
        attendee_descrition += topic['name'] + ', '
    return attendee_descrition

In [6]:
events_descr_embeddings = np.load('minilm_384d_embeddings_amsterdam.npy')
attendees_embeddings = np.load('minilm_384d_amsterdam_attendees_embeddings_v2.npy')

print('events_descr_embeddings len', len(events_descr_embeddings))
print('attendees_embeddings len', len(attendees_embeddings))

events_descr_embeddings len 2025
attendees_embeddings len 17434


In [8]:
attendees = [(attendee_data['member_id'], get_attendee_descrition(attendee_data['topics'])) for event in events_by_id.values() for attendee_id,attendee_data  in event['attendees'].items()]
print('total attendees:', len(attendees))
attendees_by_id = {attendee_id: description for attendee_id, description in attendees}
print('attendees_by_id len:', len(attendees_by_id))

events_embeddings_by_id = {event_id: event_embedding for event_id, event_embedding in zip(events_by_id.keys(), events_descr_embeddings)}
print('events_embeddings_by_id len', len(events_embeddings_by_id))

attendees_embeddings_by_id = {attendee_id: embedding for attendee_id, embedding in zip(attendees_by_id.keys(), attendees_embeddings)}
print('attendees_embeddings_by_id len', len(attendees_embeddings_by_id))

total attendees: 21090
attendees_by_id len: 17434
events_embeddings_by_id len 2025
attendees_embeddings_by_id len 17434


In [9]:
# SetUp qdrant client
from qdrant_client import QdrantClient, models

qdrant_client = QdrantClient(url="http://localhost:6333")

EVENTS_COLLECTION = 'events'
ATTENDEES_COLLECTION = 'attendees'

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
events_by_attendees = {}
for event_id, event in events_by_id.items():
    for attendee in event['attendees'].values():
        attendee_id = attendee['member_id']
        if not attendee_id in events_by_attendees:
            events_by_attendees[attendee_id] = []
        events_by_attendees[attendee_id].append(event_id)

In [11]:
# recommenders

import random

def get_top_k_dssm_recommendations(K: int) -> dict:
    # Basic DSSM approach for top K events
    recomendations_by_attendees_top_k = {}
    
    
    for attendee_id, events_ids in events_by_attendees.items():
        req_attendee_emb = attendees_embeddings_by_id[attendee_id]
        top_k_recommendations = qdrant_client.search('events', req_attendee_emb, limit=K, with_vectors=True, with_payload=True)
        recomendations_by_attendees_top_k[attendee_id] = []
        
        for rec in top_k_recommendations:
            recomendations_by_attendees_top_k[attendee_id].append(rec)

    return recomendations_by_attendees_top_k

# returns top_k_recommendations_events_ids, top_k_recommendations_events_vecs
def get_top_k_random_recommendations(K: int) -> dict[str, (list, list)]:
    # Random recommendation approach for top K events
    recomendations_by_attendees_top_k = {}

    all_events_ids = list(events_by_id.keys())
    
    for attendee_id, events_ids in events_by_attendees.items():
        req_attendee_emb = attendees_embeddings_by_id[attendee_id]
        top_k_recommendations_events_ids = random.choices(all_events_ids, k=K)
        top_k_recommendations_events_vecs = [events_embeddings_by_id[rec_event_id] for rec_event_id in top_k_recommendations_events_ids]
        
        
        recomendations_by_attendees_top_k[attendee_id] = (top_k_recommendations_events_ids, top_k_recommendations_events_vecs)
    
    return recomendations_by_attendees_top_k


def get_top_k_randomized_dssm_recommendations(K: int) -> dict:
    # Multi-armed bandit DSSM approach for top K events
    RAND_AMPLITUDE_COEF = 0.07  # coefficient to regulate randomiastion impact
    recomendations_by_attendees_top_k = {}    
    
    for attendee_id, events_ids in events_by_attendees.items():
        req_attendee_emb = attendees_embeddings_by_id[attendee_id]
        top_k_recommendations = qdrant_client.search('events', req_attendee_emb, limit=K*2, with_vectors=True, with_payload=True)
        
        # re-weight recommendations
        for rec in top_k_recommendations:
            rec.score += random.uniform(-1, 1) * RAND_AMPLITUDE_COEF
        top_k_recommendations = sorted(top_k_recommendations, key=lambda rec: rec.score, reverse=True)[:K]                              
        recomendations_by_attendees_top_k[attendee_id] = top_k_recommendations

    return recomendations_by_attendees_top_k

In [12]:
from scipy.spatial import distance
import recsys_vec_metrics


def calculate_metrics_for_qdrant_recommendations(recomendations_by_attendees_top_k) -> dict:
    recomendations_items_by_attendees = {
        attendee_id: [recomendation.payload['event']['id'] for recomendation in recomendations]
        for attendee_id, recomendations
        in recomendations_by_attendees_top_k.items()
    }
    print('recomendations_by_attendees len', len(recomendations_items_by_attendees))
    
    recomendations_vecs_by_attendees = {
        attendee_id: [recomendation.vector for recomendation in recomendations]
        for attendee_id, recomendations
        in recomendations_by_attendees_top_k.items()
    }
    
    metrics_calculator = recsys_vec_metrics.MetricsCalculator(similarity_func=distance.cosine)
    return metrics_calculator.get_all_metrics(
        events_by_attendees.values(),
        recomendations_items_by_attendees.values(),
        recomendations_vecs_by_attendees.values(),
        events_embeddings_by_id,
    )

def calculate_metrics_for_recommendations(recomendations_by_attendees_top_k) -> dict:
    recomendations_items_by_attendees = {
        attendee_id: recomendations[0]
        for attendee_id, recomendations
        in recomendations_by_attendees_top_k.items()
    }
    print('recomendations_by_attendees len', len(recomendations_items_by_attendees))
    
    recomendations_vecs_by_attendees = {
        attendee_id: recomendations[1]
        for attendee_id, recomendations
        in recomendations_by_attendees_top_k.items()
    }    
    
    metrics_calculator = recsys_vec_metrics.MetricsCalculator(similarity_func=distance.cosine)
    return metrics_calculator.get_all_metrics(
        events_by_attendees.values(),
        recomendations_items_by_attendees.values(),
        recomendations_vecs_by_attendees.values(),
        events_embeddings_by_id,
    )

In [8]:
# Basic DSSM approach for top 5 events
recomendations_by_attendees_top_5 = get_top_k_dssm_recommendations(5)
print('recomendations_by_attendees_top_5 len', len(recomendations_by_attendees_top_5))
calculated_metrics_basic_dssm_top_5 = calculate_metrics_for_qdrant_recommendations(recomendations_by_attendees_top_5)
calculated_metrics_basic_dssm_top_5

In [23]:
# Basic DSSM approach for top 10 events
recomendations_by_attendees_top_10 = get_top_k_dssm_recommendations(10)
print('recomendations_by_attendees_top_10 len', len(recomendations_by_attendees_top_10))
calculated_metrics_basic_dssm_top_10 = calculate_metrics_for_qdrant_recommendations(recomendations_by_attendees_top_10)
calculated_metrics_basic_dssm_top_10

In [24]:
# calculate metrics for this recommendations top 10

recomendations_by_attendees_top_10 len 17434
recomendations_by_attendees len 17434
recomendations_by_attendees len 17434


{'Precision@K': 0.001296317540438224,
 'Recall@K': 0.010715979137031768,
 'F1@k': 0.0023128485902880825,
 'Coverage': 0.4059259259259259,
 'Diversity': 4.144171380071656,
 'Personalisation': 0.7248802382762798,
 'Novelty': 0.5412270319039505,
 'Serendipity': 0.004285025776915403}

In [9]:
# calculate metrics for this recommendations top 5

recomendations_by_attendees len 17434
recomendations_by_attendees len 17434


{'Precision@K': 0.0013307330503613628,
 'Recall@K': 0.005500237079184448,
 'F1@k': 0.002142989100314059,
 'Coverage': 0.3037037037037037,
 'Diversity': 6.261786211931749,
 'Personalisation': 0.7450771640582783,
 'Novelty': 0.5695223906605548,
 'Serendipity': 0.001765089763932042}

In [13]:
# Random approach for top 5 events
rand_recomendations_by_attendees_top_5 = get_top_k_random_recommendations(5)
print('rand_recomendations_by_attendees_top_5 len', len(rand_recomendations_by_attendees_top_10))
calculated_metrics_random_top_5 = calculate_metrics_for_recommendations(rand_recomendations_by_attendees_top_5)
calculated_metrics_random_top_5

rand_recomendations_by_attendees_top_5 len 17434
recomendations_by_attendees len 17434


{'Precision@K': 0.0005735918320523115,
 'Recall@K': 0.002370791844476055,
 'F1@k': 0.0009237021984112321,
 'Coverage': 1.0,
 'Diversity': 2.684107079745868,
 'Personalisation': 0.9974745311605654,
 'Novelty': 0.4475721972024242,
 'Serendipity': 0.0009959553160730028}

In [12]:
# Random approach for top 10 events
rand_recomendations_by_attendees_top_10 = get_top_k_random_recommendations(10)
print('rand_recomendations_by_attendees_top_10 len', len(rand_recomendations_by_attendees_top_10))
calculated_metrics_random_top_10 = calculate_metrics_for_recommendations(rand_recomendations_by_attendees_top_10)
calculated_metrics_random_top_10

rand_recomendations_by_attendees_top_10 len 17434
recomendations_by_attendees len 17434


{'Precision@K': 0.0005907995870138809,
 'Recall@K': 0.004883831199620673,
 'F1@k': 0.0010540858619454536,
 'Coverage': 1.0,
 'Diversity': 2.3829064391990387,
 'Personalisation': 0.9950061984371903,
 'Novelty': 0.44758696549273436,
 'Serendipity': 0.0024841501366166486}

In [16]:
qdrant_client.search('events', attendees_embeddings[0], limit=1, with_vectors=True, with_payload=True)

[ScoredPoint(id='39339dcd-66a3-4cf2-87b3-cff40fcaa20e', version=3, score=0.6852265, payload={'attendees': {'199164293': {'id': '1993954421', 'member_id': '199164293', 'membership_role': 'ORGANIZER', 'membership_status': 'LEADER', 'name': 'Anne', 'topics': [{'id': '1527', 'name': 'Духовный поиск'}, {'id': '10438', 'name': 'DIY (сделай сам)'}, {'id': '14657', 'name': 'Социальные вопросы'}, {'id': '16285', 'name': 'Путешествия'}, {'id': '16346', 'name': 'Ремёсла'}, {'id': '16881', 'name': 'Женская социальная сеть'}, {'id': '16944', 'name': 'Расширение прав и возможностей женщин'}, {'id': '17012', 'name': 'Мир'}, {'id': '17704', 'name': 'Танец и движение'}, {'id': '17865', 'name': 'Время развлечений'}, {'id': '18280', 'name': 'Сотрудничество творческих умов'}, {'id': '18489', 'name': 'Ходьба'}, {'id': '18522', 'name': 'Обучение'}, {'id': '19991', 'name': 'Японская культура'}, {'id': '20453', 'name': 'Сторителлинг'}, {'id': '20509', 'name': 'Уроки танцев'}, {'id': '20970', 'name': 'Беседа'}

In [30]:
# Randomized DSSM for top 5 events
dssm_randomized_recomendations_by_attendees_top_5 = get_top_k_randomized_dssm_recommendations(5)
print('dssm_randomized_recomendations_by_attendees_top_5 len', len(dssm_randomized_recomendations_by_attendees_top_5))
calculated_metrics_dssm_randomized_top_5 = calculate_metrics_for_qdrant_recommendations(dssm_randomized_recomendations_by_attendees_top_5)
calculated_metrics_dssm_randomized_top_5

dssm_randomized_recomendations_by_attendees_top_5 len 17434
recomendations_by_attendees len 17434


TypeError: unhashable type: 'dict'

In [31]:
# Randomized DSSM for top 5 events

dssm_randomized_recomendations_by_attendees_top_5 len 17434
recomendations_by_attendees len 17434


{'Precision@K': 0.0012274865205919468,
 'Recall@K': 0.005073494547178758,
 'F1@k': 0.001976722704600037,
 'Coverage': 0.3377777777777778,
 'Diversity': 5.990598878183575,
 'Personalisation': 0.7836356257261659,
 'Novelty': 0.5693685246951432,
 'Serendipity': 0.00165705184619095}

In [32]:
# Randomized DSSM for top 10 events
dssm_randomized_recomendations_by_attendees_top_10 = get_top_k_randomized_dssm_recommendations(10)
print('dssm_randomized_recomendations_by_attendees_top_10 len', len(dssm_randomized_recomendations_by_attendees_top_10))
calculated_metrics_dssm_randomized_top_10 = calculate_metrics_for_qdrant_recommendations(dssm_randomized_recomendations_by_attendees_top_10)
calculated_metrics_dssm_randomized_top_10

dssm_randomized_recomendations_by_attendees_top_10 len 17434
recomendations_by_attendees len 17434


{'Precision@K': 0.0011529195824251462,
 'Recall@K': 0.00953058321479374,
 'F1@k': 0.0020570025072916133,
 'Coverage': 0.44296296296296295,
 'Diversity': 3.98625803322094,
 'Personalisation': 0.7834117558017727,
 'Novelty': 0.5455582511606488,
 'Serendipity': 0.0038589934214119975}

# Time parametrization for relevance()

In [14]:
from qdrant_client import models
from datetime import datetime, timedelta

import math
import random


def adjust_recommendation(recommendation: models.ScoredPoint, time_delta: int) -> models.ScoredPoint:
    # time_delta = days(|event dt - current dt|)
    DECAY = 0.002
    
    original_score = recommendation.score
    adjusted_score = original_score * math.exp(-DECAY * time_delta)
    recommendation.score = adjusted_score
    
    return recommendation


def get_top_k_recommendations(req_attendee_emb, K: int, request_dt: datetime) -> list[models.ScoredPoint]:
    RECOMMENDATION_PERIOD = timedelta(days=180)
    
    top_k_recommendations = qdrant_client.search(
        'events',
        req_attendee_emb,
        limit=K,
        with_vectors=True,
        with_payload=True,
        query_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="event.event_start_ts",
                    range=models.Range(
                        gte=request_dt.timestamp(),
                        lte=(request_dt + RECOMMENDATION_PERIOD).timestamp(),
                    ),
                ),
            ]
        ),
    )


    for recommendation in top_k_recommendations:
        event_dt = datetime.fromtimestamp(recommendation.payload['event']['event_start_ts'])
        time_delta = abs((event_dt - request_dt).days)
        adjust_recommendation(recommendation, time_delta)
    top_k_recommendations = sorted(top_k_recommendations, key=lambda rec: rec.score, reverse=True)[:K]

    return top_k_recommendations
    

def get_top_k_timedec_recommendations_by_attendees(K: int) -> dict:
    recomendations_by_attendees_top_k = {}

    c= 10
    
    
    for attendee_id, events_ids in events_by_attendees.items():
        req_attendee_emb = attendees_embeddings_by_id[attendee_id]
        recomendations_by_attendees_top_k[attendee_id] = []
        
        for event_id in events_ids:
            event = events_by_id[event_id]
            request_dt = datetime.fromtimestamp(event['event']['event_start_ts']) - timedelta(days=random.randint(0, 7))

            
            top_k_recommendations = get_top_k_recommendations(req_attendee_emb, K, request_dt)
            
            recomendations_by_attendees_top_k[attendee_id].extend(top_k_recommendations)

    return recomendations_by_attendees_top_k



# Collaborative graph-based DSSM with time decay
def get_top_k_timedec_collaborative_dssm_recommendations_by_attendees(K: int) -> dict:
    recomendations_by_attendees_top_k = {}

    c= 10
    
    
    for attendee_id, events_ids in events_by_attendees.items():
        recomendations_by_attendees_top_k[attendee_id] = []
        collaborative_embedding_components = []
        earliest_date = datetime.now()
        
        for event_id in events_ids:
            event = events_by_id[event_id]
            event_dt = datetime.fromtimestamp(event['event']['event_start_ts']) - timedelta(days=random.randint(0, 14))
            if earliest_date > event_dt:
                earliest_date = event_dt
                
            for other_attendee_id in event['attendees'].keys():
                if other_attendee_id == attendee_id:
                    continue
                other_attendee_embedding = attendees_embeddings_by_id[other_attendee_id]
                collaborative_embedding_components.append(other_attendee_embedding)
            
        if len(collaborative_embedding_components) == 0:
            continue
        
        collaborative_embedding = sum(collaborative_embedding_components) / len(collaborative_embedding_components)
        
        top_k_recommendations = get_top_k_recommendations(collaborative_embedding, K, earliest_date)
        recomendations_by_attendees_top_k[attendee_id] = top_k_recommendations

    return recomendations_by_attendees_top_k

In [90]:
# DSSM with time decay with K*2
dssm_dt_decayed_recomendations_by_attendees_top_5 = get_top_k_timedec_recommendations_by_attendees(5)


In [91]:
len(dssm_dt_decayed_recomendations_by_attendees_top_5['374814414'])

5

In [92]:
print('dssm_dt_decayed_recomendations_by_attendees_top_5 len', len(dssm_dt_decayed_recomendations_by_attendees_top_5))
calculated_metrics_dssm_timedec_top_5 = calculate_metrics_for_qdrant_recommendations(dssm_dt_decayed_recomendations_by_attendees_top_5)
calculated_metrics_dssm_timedec_top_5

dssm_dt_decayed_recomendations_by_attendees_top_5 len 17434
recomendations_by_attendees len 17434


{'Precision@K': 0.0015837039706398353,
 'Recall@K': 0.007918444760550024,
 'F1@k': 0.0026395024458862487,
 'Coverage': 0.018271604938271607,
 'Diversity': 3.0818931544883053,
 'Personalisation': 0.4150524035692702,
 'Novelty': 0.5650871938215764,
 'Serendipity': 0.003673888816779593}

In [94]:
# DSSM with time decay withot double K
dssm_dt_decayed_recomendations_by_attendees_top_5_s = get_top_k_timedec_recommendations_by_attendees(5)
print('dssm_dt_decayed_recomendations_by_attendees_top_5_s len', len(dssm_dt_decayed_recomendations_by_attendees_top_5_s))
calculated_metrics_dssm_timedec_top_5 = calculate_metrics_for_qdrant_recommendations(dssm_dt_decayed_recomendations_by_attendees_top_5_s)
calculated_metrics_dssm_timedec_top_5

dssm_dt_decayed_recomendations_by_attendees_top_5_s len 17434
recomendations_by_attendees len 17434


{'Precision@K': 0.002162244181856116,
 'Recall@K': 0.010810810810810811,
 'F1@k': 0.0036037175191249923,
 'Coverage': 0.016296296296296295,
 'Diversity': 3.2189160394820515,
 'Personalisation': 0.3963807633720221,
 'Novelty': 0.5586208181810789,
 'Serendipity': 0.004995074342483842}

In [None]:
# DSSM with time decay K=10
dssm_dt_decayed_recomendations_by_attendees_top_10_s = get_top_k_timedec_recommendations_by_attendees(10)
print('dssm_dt_decayed_recomendations_by_attendees_top_10_s len', len(dssm_dt_decayed_recomendations_by_attendees_top_10_s))
calculated_metrics_dssm_timedec_top_10 = calculate_metrics_for_qdrant_recommendations(dssm_dt_decayed_recomendations_by_attendees_top_10_s)
calculated_metrics_dssm_timedec_top_10

In [110]:
# Collaborative graph-based DSSM with events time decay K=5
collaborative_dssm_dt_decayed_recomendations_by_attendees_top_5_s = get_top_k_timedec_collaborative_dssm_recommendations_by_attendees(5)
print('collaborative_dssm_dt_decayed_recomendations_by_attendees_top_5_s len', len(collaborative_dssm_dt_decayed_recomendations_by_attendees_top_5_s))
collaborative_calculated_metrics_dssm_timedec_top_5 = calculate_metrics_for_qdrant_recommendations(collaborative_dssm_dt_decayed_recomendations_by_attendees_top_5_s)
collaborative_calculated_metrics_dssm_timedec_top_5

collaborative_dssm_dt_decayed_recomendations_by_attendees_top_5_s len 17434
recomendations_by_attendees len 17434


ZeroDivisionError: division by zero

In [15]:
# Collaborative graph-based DSSM with events time decay K=10
collaborative_dssm_dt_decayed_recomendations_by_attendees_top_10_s = get_top_k_timedec_collaborative_dssm_recommendations_by_attendees(5)
print('collaborative_dssm_dt_decayed_recomendations_by_attendees_top_10_s len', len(collaborative_dssm_dt_decayed_recomendations_by_attendees_top_10_s))
collaborative_calculated_metrics_dssm_timedec_top_10 = calculate_metrics_for_qdrant_recommendations(collaborative_dssm_dt_decayed_recomendations_by_attendees_top_10_s)
collaborative_calculated_metrics_dssm_timedec_top_10

collaborative_dssm_dt_decayed_recomendations_by_attendees_top_10_s len 17434
recomendations_by_attendees len 17434


{'Precision@K': 0.0006459948320413437,
 'Recall@K': 0.0026078710289236607,
 'F1@k': 0.0010354890332297845,
 'Coverage': 0.12,
 'Diversity': 5.492886700286568,
 'Personalisation': 0.5068265067821913,
 'Novelty': 0.6200403730666585,
 'Serendipity': 0.0008549917876937069}