In [1]:
from systemtools.basics import *
from systemtools.file import *
from systemtools.printer import *
from databasetools.mongo import *
from datetime import datetime as pdtime

In [2]:
from renewalsimulator.utils import *

In [3]:
# Time window of the competition under analysis. Both bounds are normalised
# with convertDate in the following cell and then used to filter MongoDB queries.
config = {
    # Window start, given as a string.
    "start_time": "2021-03-03 11:00", # 2021-03-03, 2021-05-03
    # Window end: the current instant. It is made timezone-aware on purpose:
    # a naive local-time datetime is interpreted as UTC by PyMongo, which would
    # shift the upper bound of the queries by the local UTC offset and make it
    # inconsistent with the timezone-aware start bound.
    "end_time": pdtime.now().astimezone(), # pdtime.now()
}

In [4]:
# Normalise both window bounds through convertDate with DATE_FORMAT.datetime,
# overwriting the raw values in place (the start bound is a string until here).
# NOTE(review): this rewrites config, so the cell's input differs on a second run —
# presumably convertDate accepts an already-converted value; confirm.
config["start_time"] = convertDate(config["start_time"], DATE_FORMAT.datetime)
config["end_time"] = convertDate(config["end_time"], DATE_FORMAT.datetime)

In [5]:
# Report how long the analysed window is, then show the normalised config.
window_end_ts = convertDate(config["end_time"], DATE_FORMAT.timestamp)
window_start_ts = convertDate(config["start_time"], DATE_FORMAT.timestamp)
window_length = secondsToHumanReadableDuration(window_end_ts - window_start_ts)
print("Competition duration: " + window_length)
bp(config)

Competition duration: 2207h 9m 0.499s
{ end_time: 2021-06-03 12:09:00.499233, start_time: 2021-03-03 11:00:00+00:00 }


In [6]:
# Handle used below to query the recommendation buckets
# (presumably database "renewal", collection "recommendation_buckets" — confirm argument order).
bucketsCol = MongoCollection("renewal", "recommendation_buckets")

renewal recommendation_buckets initialised.


In [7]:
# Load every bucket of the competition window that carries an "interleaving" field.
# - The "interleaving" filter is done by MongoDB ($exists) instead of fetching all
#   buckets and discarding the others in Python.
# - Results are sorted by ascending timestamp: the ranking computed later is
#   updated match by match in iteration order, so the order must be chronological
#   and must not depend on the collection's storage order.
buckets = list(bucketsCol.find(
    {
        "timestamp": {"$gt": config["start_time"], "$lt": config["end_time"]},
        "interleaving": {"$exists": True},
    },
    sort=[("timestamp", pymongo.ASCENDING)],
))
bp(buckets)
print("Found " + str(len(buckets)) + " buckets")

[
  {
    _id: 6080289ac90e1004a68ea993,
    bucket_id: 83,
    interleaving: 
    {
      interleaved_recommendations: [ 202979, 202999, ..., 202219, 202646 ],
      interleaving_method: team-draft,
      recsystems: 
      [
        {
          recommendations: [ 202690, 202755, ..., 202937, 202779 ],
          recsystem_id: 60756dcad89bd74b3af11cb5
        },
        {
          recommendations: [ 202979, 202535, ..., 202331, 202495 ],
          recsystem_id: 5fae9ebf2573bc71bec43bca
        }
      ]
    },
    recommendations: 
    [
      {
        recommendations: [ 202979, 202936, ..., 193028, 193026 ],
        recsystem_id: 5fae9ebf2573bc71bec43bca,
        response_time: 14.71
      },
      {
        recommendations: [ 202999, 202961, ..., 194387, 194348 ],
        recsystem_id: 60756dcad89bd74b3af11cb5,
        response_time: 14.94
      },
      {
        error: exception,
        exception: recsystem 60757476d89bd74b3af11cb6 is not connected,
        recommendations: [ ],

In [8]:
def getClicksCursor(test=False):
    """
    Query the "articles.events" collection for the events of the configured window.

    Only events whose "timestamp" lies strictly between config["start_time"] and
    config["end_time"] are selected, and they are sorted by ascending timestamp.

    :param test: when falsy (default) the filter requires 'clicked' to be True;
        when truthy the filter value for 'clicked' is None instead
        (presumably matching events with a null/missing flag — confirm).
    :return: the result of MongoCollection.find (iterated like a cursor by callers).
    """
    events_col = MongoCollection("renewal", "articles.events", verbose=False)
    clicked_value = True
    if test:
        clicked_value = None
    time_window = {"$gt": config["start_time"], "$lt": config["end_time"]}
    query = {'clicked': clicked_value, "timestamp": time_window}
    chronological = [('timestamp', pymongo.ASCENDING)]
    return events_col.find(query, sort=chronological)

In [9]:
# Maps the string form of a recsystem id to its name, filled lazily by getRecsysName.
recsysNameCache = dict()
def getRecsysName(objid):
    """
    Return the "name" of the recsystem identified by `objid`, with caching.

    The cache is keyed by str(objid) so that the same system is served from the
    cache whether it is requested by its raw id (as stored in the buckets) or by
    the string form of that id (as used for the plot columns).

    :param objid: recsystem id; anything accepted by ObjectId(...).
    :return: the value of the "name" field of the matching "recsystems" document.
    :raises KeyError: if no document matches `objid` (instead of failing with an
        opaque TypeError when subscripting None).
    """
    cache_key = str(objid)
    if cache_key in recsysNameCache:
        return recsysNameCache[cache_key]
    col = MongoCollection("renewal", "recsystems", verbose=False)
    document = col.findOne(ObjectId(objid))
    if document is None:
        raise KeyError("No recsystem found for id " + cache_key)
    result = document["name"]
    recsysNameCache[cache_key] = result
    return result

In [10]:
# Per user, gather the set of distinct article ids found in the click events of
# the window, then keep only the users having at least 3 distinct articles.
clickedNews = dict()
for event in getClicksCursor(test=False):
    clickedNews.setdefault(event['user_id'], set()).add(event['article_id'])
clickedNews = {
    user: articles
    for user, articles in clickedNews.items()
    if len(articles) >= 3
}
bp(clickedNews, 4)

{
  '8uEHBsOaOcQhow8z9qIrI78n9Fk1': { 276465, 276757, 276780, 276792, 276891, 276963, 277371, 277664, 277871, 277877, 277924, 278054, 278272, 278372, 278435, 278515, 278580, 278607, 278649 },
  'Mhkc4xuaFPWnmbFomIv8drAtsn13': { 276804, 278144, 279075, 281330 },
  'Smjg3R7WxFXxbC8hN9k7xkm59OB3': { 178132, 178188, 178196 },
  'UT0gJVyQBCYDfpD0UFcDPTFvzav1': { 188277, 188287, 250975, 251470, 251589, 251617, 251669, 251684, 253583, 253618, ..., 284681, 284749, 284764, 284766, 284772, 285681, 287371, 288679, 289484, 292997 },
  'UvmzxD3o9QWe0myWkTKNFYUXOGj2': { 251077, 251214, 251377, 251403, 251906, 255554, 255904 },
  'VoR7Zj8zPvgFFrYvuhWbiePQXh13': { 177671, 177685, 202906, 229955, 230309 },
  'WlN8rEQ73yZt1B5kR6rfhHMNaMp1': { 244077, 246326, 258963 },
  'XE3WtVujcnQK19PwF3J1Ljk7BCo2': { 188052, 188153, 188206, 188232 },
  'bzQ4o2uudoYcs5Zs6cX1KSHoOkD2': { 276819, 276829, 277205, 277620, 277995, 278151, 278212, 279575, 279915, 279970, 285093, 295332, 301337, 308893, 310102, 310388 },
  '

In [11]:
# Turn every interleaved bucket into a pairwise match between its two recsystems.
#
# For each bucket, every article of the interleaved list that belongs to the
# user's clicked set gives one point to each recsystem whose own recommendation
# list contains it. The system with more points wins; buckets where neither
# system scores are counted in no_match_count and produce no outcome.
#
# outcomes entries are (recsys_a, recsys_b, result, timestamp) with
# result == -1 when recsys_a won, 1 when recsys_b won and 0 for a draw.
#
# NOTE(review): clickedNews aggregates clicks over the whole window, so a click is
# credited to a bucket regardless of when it happened relative to that bucket,
# and an article present in both lists credits both systems — confirm intended.
outcomes = []
competitors = set()
no_match_count = 0
match_count = 0
for bucket in buckets:
    # Drop the 'articles' field when present; it is not used in this cell.
    bucket.pop('articles', None)
    user_id = bucket["user_id"]
    # Users without an entry simply have no counted clicks. Look them up without
    # inserting, so that this cell leaves clickedNews untouched.
    user_clicks = clickedNews.get(user_id, frozenset())
    timestamp = bucket["timestamp"]
    interleaved_list = bucket["interleaving"]["interleaved_recommendations"]
    recommendations = dict()
    for current in bucket['interleaving']['recsystems']:
        recommendations[current['recsystem_id']] = current['recommendations']
    # Two distinct keys also guarantee recsys_a != recsys_b.
    assert len(recommendations) == 2
    recsys_a, recsys_b = tuple(recommendations.keys())
    competitors.add(recsys_a)
    competitors.add(recsys_b)
    # Sets give constant-time membership tests in the loop below.
    recs_a, recs_b = set(recommendations[recsys_a]), set(recommendations[recsys_b])
    a_clicks = 0
    b_clicks = 0
    for article_id in interleaved_list:
        if article_id in user_clicks:
            if article_id in recs_a:
                a_clicks += 1
            if article_id in recs_b:
                b_clicks += 1
    if a_clicks == 0 and b_clicks == 0:
        no_match_count += 1
        continue
    match_count += 1
    if a_clicks > b_clicks:
        print(getRecsysName(recsys_a) + " won against " + getRecsysName(recsys_b) + " (" + str(timestamp) + ")")
        outcomes.append((recsys_a, recsys_b, -1, timestamp))
    elif a_clicks < b_clicks:
        print(getRecsysName(recsys_b) + " won against " + getRecsysName(recsys_a) + " (" + str(timestamp) + ")")
        outcomes.append((recsys_a, recsys_b, 1, timestamp))
    else:
        print("It's a draw for " + getRecsysName(recsys_a) + " and " + getRecsysName(recsys_b) + " (" + str(timestamp) + ")")
        outcomes.append((recsys_a, recsys_b, 0, timestamp))
competitors = list(competitors)
bp(competitors, 4)
bp(outcomes, 4)
print("no_match_count: " + str(no_match_count))
print("match_count: " + str(match_count))

It's a draw for baseline-random-2 and baseline-popular-1 (2021-04-30 08:00:55.527000)
It's a draw for baseline-random-2 and baseline-random-1 (2021-04-30 08:24:13.767000)
hjrecsys3 won against hjrecsys2 (2021-05-04 13:55:50.137000)
It's a draw for hjrecsys3 and hjrecsys2 (2021-05-07 12:37:10.403000)
test-baseline-1 won against hjrecsys3 (2021-05-07 13:03:52.370000)
hjrecsys2 won against baseline-popular-1 (2021-05-07 13:22:33.316000)
hjrecsys3 won against hjrecsys2 (2021-05-07 13:59:12.365000)
It's a draw for hjrecsys3 and hjrecsys2 (2021-05-07 15:59:03.549000)
hayj won against hjrecsys3 (2021-05-07 16:20:56.245000)
It's a draw for hayj and hjrecsys3 (2021-05-07 16:22:28.453000)
It's a draw for hayj and hjrecsys3 (2021-05-07 16:23:19.880000)
hayj won against hjrecsys3 (2021-05-07 16:29:02.614000)
hayj won against hjrecsys3 (2021-05-07 16:29:58.402000)
test-baseline-1 won against hjrecsys2 (2021-05-07 16:32:12.261000)
test-baseline-1 won against hjrecsys2 (2021-05-07 16:33:39.026000)
te

In [12]:
# Replay the matches, in the order of `outcomes`, through a RenewalRanking and
# keep a snapshot of get_ranking_stats() (paired with the match timestamp) after
# each match. A result of -1 means the first system won, 1 the second one, and
# anything else is passed as winner=None.
rr = RenewalRanking(competitors, float_precision=1)
rankings = []
for first, second, result, played_at in outcomes:
    winner = {1: second, -1: first}.get(result)
    rr.match(first, second, winner=winner)
    snapshot = rr.get_ranking_stats()
    rankings.append((snapshot, played_at))
bp(rankings)

[
  (
    [ ( 5fae9ec42573bc71bec43bcd, 25.0, 8.33 ), ( 60910f84690fad01ccacf303, 25.0, 8.33 ), ..., ( 5fae9ec02573bc71bec43bcb, 25.0, 6.45 ), ( 60757476d89bd74b3af11cb6, 25.0, 8.33 ) ],
    2021-04-30 08:00:55.527000
  ),
  (
    [ ( 5fae9ec42573bc71bec43bcd, 25.0, 8.33 ), ( 60910f84690fad01ccacf303, 25.0, 8.33 ), ..., ( 5fae9ec02573bc71bec43bcb, 25.0, 5.45 ), ( 60757476d89bd74b3af11cb6, 25.0, 8.33 ) ],
    2021-04-30 08:24:13.767000
  ),
  ...,
  (
    [ ( 607989ab7a2f6f120b2e9a2b, 27.8, 1.14 ), ( 60910f84690fad01ccacf303, 26.7, 0.88 ), ..., ( 5fae9ec12573bc71bec43bcc, 24.0, 1.72 ), ( 5fae9ec42573bc71bec43bcd, 20.9, 2.79 ) ],
    2021-05-27 09:00:37.246000
  ),
  (
    [ ( 607989ab7a2f6f120b2e9a2b, 27.8, 1.14 ), ( 60910f84690fad01ccacf303, 26.7, 0.88 ), ..., ( 5fae9ec12573bc71bec43bcc, 24.0, 1.72 ), ( 5fae9ec42573bc71bec43bcd, 20.9, 2.79 ) ],
    2021-05-28 06:00:21.958000
  )
]


In [13]:
# One record per match: the two systems, the result, the timestamp and the
# ranking snapshot stored at the same index in `rankings`.
rk = []
for index, (recsys_a, recsys_b, result, played_at) in enumerate(outcomes):
    rk.append({
        "recsys_a": recsys_a,
        "recsys_b": recsys_b,
        "result": result,
        "timestamp": played_at,
        "ranking": rankings[index][0],
    })

In [14]:
# Inspect the per-match records.
bp(rk)

[
  {
    ranking: [ ( 5fae9ec42573bc71bec43bcd, 25.0, 8.33 ), ( 60910f84690fad01ccacf303, 25.0, 8.33 ), ..., ( 5fae9ec02573bc71bec43bcb, 25.0, 6.45 ), ( 60757476d89bd74b3af11cb6, 25.0, 8.33 ) ],
    recsys_a: 5fae9ec02573bc71bec43bcb,
    recsys_b: 5fae9ec12573bc71bec43bcc,
    result: 0,
    timestamp: 2021-04-30 08:00:55.527000
  },
  {
    ranking: [ ( 5fae9ec42573bc71bec43bcd, 25.0, 8.33 ), ( 60910f84690fad01ccacf303, 25.0, 8.33 ), ..., ( 5fae9ec02573bc71bec43bcb, 25.0, 5.45 ), ( 60757476d89bd74b3af11cb6, 25.0, 8.33 ) ],
    recsys_a: 5fae9ec02573bc71bec43bcb,
    recsys_b: 5fae9ebf2573bc71bec43bca,
    result: 0,
    timestamp: 2021-04-30 08:24:13.767000
  },
  ...,
  {
    ranking: [ ( 607989ab7a2f6f120b2e9a2b, 27.8, 1.14 ), ( 60910f84690fad01ccacf303, 26.7, 0.88 ), ..., ( 5fae9ec12573bc71bec43bcc, 24.0, 1.72 ), ( 5fae9ec42573bc71bec43bcd, 20.9, 2.79 ) ],
    recsys_a: 60757476d89bd74b3af11cb6,
    recsys_b: 60910f84690fad01ccacf303,
    result: -1,
    timestamp: 2021-05-27 09:

In [15]:
# Rebuild `rk` (replacing its previous content) as one dict per ranking snapshot,
# mapping each competitor id (first tuple element) to its "score" (second
# element) and "confidence" (third element).
rk = [
    {
        entry[0]: {"score": entry[1], "confidence": entry[2]}
        for entry in snapshot
    }
    for snapshot, _played_at in rankings
]
bp(rk)

[
  {
    5fae9ebf2573bc71bec43bca: { confidence: 8.33, score: 25.0 },
    5fae9ec02573bc71bec43bcb: { confidence: 6.45, score: 25.0 },
    5fae9ec12573bc71bec43bcc: { confidence: 6.45, score: 25.0 },
    5fae9ec42573bc71bec43bcd: { confidence: 8.33, score: 25.0 },
    60756dcad89bd74b3af11cb5: { confidence: 8.33, score: 25.0 },
    60757476d89bd74b3af11cb6: { confidence: 8.33, score: 25.0 },
    607989ab7a2f6f120b2e9a2b: { confidence: 8.33, score: 25.0 },
    60910f84690fad01ccacf303: { confidence: 8.33, score: 25.0 }
  },
  {
    5fae9ebf2573bc71bec43bca: { confidence: 6.03, score: 25.0 },
    5fae9ec02573bc71bec43bcb: { confidence: 5.45, score: 25.0 },
    5fae9ec12573bc71bec43bcc: { confidence: 6.45, score: 25.0 },
    5fae9ec42573bc71bec43bcd: { confidence: 8.33, score: 25.0 },
    60756dcad89bd74b3af11cb5: { confidence: 8.33, score: 25.0 },
    60757476d89bd74b3af11cb6: { confidence: 8.33, score: 25.0 },
    607989ab7a2f6f120b2e9a2b: { confidence: 8.33, score: 25.0 },
    60910f8

In [16]:
# aggregated_rk = []
# while target_index < len(rk):
    

In [17]:
# x: the datetime attached to each ranking snapshot, in order.
# x_translation: numeric timestamp of each of those datetimes -> its string form.
x = [played_at for _stats, played_at in rankings]
x_translation = {
    convertDate(played_at, DATE_FORMAT.timestamp): str(played_at)
    for played_at in x
}
bp(x)
bp(x_translation)

[ 2021-04-30 08:00:55.527000, 2021-04-30 08:24:13.767000, ..., 2021-05-27 09:00:37.246000, 2021-05-28 06:00:21.958000 ]
{ 1619762455.527: 2021-04-30 08:00:55.527000, 1619763853.767: 2021-04-30 08:24:13.767000, 1620129350.137: 2021-05-04 13:55:50.137000, 1620383830.403: 2021-05-07 12:37:10.403000, 1620385432.37: 2021-05-07 13:03:52.370000, ..., 1621617866.588: 2021-05-21 19:24:26.588000, 1621707929.795: 2021-05-22 20:25:29.795000, 1621919345.913: 2021-05-25 07:09:05.913000, 1622098837.246: 2021-05-27 09:00:37.246000, 1622174421.958: 2021-05-28 06:00:21.958000 }


In [18]:
# For every competitor, the series of its score (second tuple element) across
# the successive ranking snapshots, keyed by the string form of its id.
# A snapshot that does not list the competitor contributes no value.
scores = dict()
for competitor in competitors:
    history = []
    for snapshot, _played_at in rankings:
        found = next((entry for entry in snapshot if entry[0] == competitor), None)
        if found is not None:
            history.append(found[1])
    scores[str(competitor)] = history
bp(scores)

{
  5fae9ebf2573bc71bec43bca: [ 25.0, 25.0, ..., 24.7, 24.3 ],
  5fae9ec02573bc71bec43bcb: [ 25.0, 25.0, ..., 26.6, 26.6 ],
  5fae9ec12573bc71bec43bcc: [ 25.0, 25.0, ..., 24.0, 24.0 ],
  5fae9ec42573bc71bec43bcd: [ 25.0, 25.0, ..., 20.9, 20.9 ],
  60756dcad89bd74b3af11cb5: [ 25.0, 25.0, ..., 24.3, 24.3 ],
  60757476d89bd74b3af11cb6: [ 25.0, 25.0, ..., 25.8, 25.9 ],
  607989ab7a2f6f120b2e9a2b: [ 25.0, 25.0, ..., 27.8, 27.8 ],
  60910f84690fad01ccacf303: [ 25.0, 25.0, ..., 26.7, 26.7 ]
}


In [19]:
# Combine the per-competitor score series with the shared "x" series (the
# snapshot datetimes) into the single mapping given to ColumnDataSource below.
data = mergeDicts(scores, dict(x=x))
bp(data)

{
  5fae9ebf2573bc71bec43bca: [ 25.0, 25.0, ..., 24.7, 24.3 ],
  5fae9ec02573bc71bec43bcb: [ 25.0, 25.0, ..., 26.6, 26.6 ],
  5fae9ec12573bc71bec43bcc: [ 25.0, 25.0, ..., 24.0, 24.0 ],
  5fae9ec42573bc71bec43bcd: [ 25.0, 25.0, ..., 20.9, 20.9 ],
  60756dcad89bd74b3af11cb5: [ 25.0, 25.0, ..., 24.3, 24.3 ],
  60757476d89bd74b3af11cb6: [ 25.0, 25.0, ..., 25.8, 25.9 ],
  607989ab7a2f6f120b2e9a2b: [ 25.0, 25.0, ..., 27.8, 27.8 ],
  60910f84690fad01ccacf303: [ 25.0, 25.0, ..., 26.7, 26.7 ],
  x: [ 2021-04-30 08:00:55.527000, 2021-04-30 08:24:13.767000, ..., 2021-05-27 09:00:37.246000, 2021-05-28 06:00:21.958000 ]
}


In [20]:
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, output_notebook, show

In [21]:
# Have Bokeh render its figures inline in the notebook.
output_notebook()

In [22]:
# Column store shared by all the lines of the chart: one column per competitor plus "x".
source = ColumnDataSource(data=data)

In [23]:
# Figure for the score-over-time chart, with a datetime x axis.
# NOTE(review): plot_width / plot_height were deprecated in Bokeh 2.4 and removed
# in Bokeh 3.0 in favour of width / height — adjust when upgrading Bokeh.
p = figure(plot_width=1000, plot_height=600, x_axis_type='datetime')

In [24]:
# p.xaxis.ticker = x
# p.xaxis.major_label_overrides = x_translation

In [25]:
from bokeh.palettes import Category10
# The 10-colour variant of the Category10 palette; the line colours are picked from it.
colors = Category10[10]

In [26]:
# Draw one line per competitor (column name = string id) and collect the
# (display name, renderers) pairs for the legend. Colours are taken from
# `colors` in order and start over from the first one when exhausted.
legend_it = []
for position, column in enumerate(scores):
    print(column)
    line_color = colors[position % len(colors)]
    renderer = p.line('x', column, source=source, line_width=2, color=line_color)
    legend_it.append((getRecsysName(column), [renderer]))

5fae9ec42573bc71bec43bcd
60910f84690fad01ccacf303
5fae9ebf2573bc71bec43bca
60756dcad89bd74b3af11cb5
607989ab7a2f6f120b2e9a2b
5fae9ec12573bc71bec43bcc
5fae9ec02573bc71bec43bcb
60757476d89bd74b3af11cb6


In [27]:
# from bokeh.models import SingleIntervalTicker, LinearAxis

In [28]:
# ticker = SingleIntervalTicker(interval=200, num_minor_ticks=10)
# xaxis = LinearAxis(ticker=ticker)
# xaxis.major_label_overrides = x_translation
# p.add_layout(xaxis, 'below')
# p.xaxis.major_label_orientation = math.pi/2

In [29]:
# x_translation

In [30]:
from bokeh.models import DatetimeTickFormatter
from bokeh.models import Legend

In [31]:
# p.xaxis.formatter=DatetimeTickFormatter(
#         hours=["%d %B %Y"],
#         days=["%d %B %Y"],
#         months=["%d %B %Y"],
#         years=["%d %B %Y"],
#     )

In [32]:
# Tilt the x-axis tick labels by pi/4 radians (45 degrees).
p.xaxis.major_label_orientation = math.pi/4

In [33]:
# Build the legend from the collected (name, renderers) pairs and place it
# below the plot; with the "mute" click policy, clicking an entry mutes its line.
legend = Legend(items=legend_it)
legend.click_policy = "mute"
p.add_layout(legend, 'below')

In [34]:
# Render the score-over-time chart.
show(p)

In [35]:
# TODO https://stackoverflow.com/questions/46730609/position-the-legend-outside-the-plot-area-with-bokeh

In [36]:
# Second chart: same score series, but plotted against the snapshot index
# (0, 1, 2, ...) instead of the datetime, so matches are evenly spaced.
# Note that `p`, `source`, `legend_it` and `legend` are rebound here.
new_data = copy.deepcopy(data)
new_data['x'] = list(range(len(data['x'])))
p = figure(plot_width=1000, plot_height=600)
source = ColumnDataSource(data=new_data)
legend_it = []
for position, column in enumerate(scores):
    print(column)
    line_color = colors[position % len(colors)]
    renderer = p.line('x', column, source=source, line_width=2, color=line_color)
    legend_it.append((getRecsysName(column), [renderer]))
legend = Legend(items=legend_it)
legend.click_policy = "mute"
p.add_layout(legend, 'below')
show(p)

5fae9ec42573bc71bec43bcd
60910f84690fad01ccacf303
5fae9ebf2573bc71bec43bca
60756dcad89bd74b3af11cb5
607989ab7a2f6f120b2e9a2b
5fae9ec12573bc71bec43bcc
5fae9ec02573bc71bec43bcb
60757476d89bd74b3af11cb6


In [37]:
# TODO draw markers showing each 1-vs-1 match as an overlay
# TODO add the date at regular intervals, or draw a vertical line with the date on the x-axis
# In the overlay, show the Elo score and the confidence score
# Also print a table of the ranking

In [38]:
# hjrecsys3 -> keywords
# test-baseline-1 (Erik) -> keywords