In [1]:
!pip install pyserini==0.13.0
!pip install srt==3.5.0
!pip install pyNTCIREVAL==0.0.3



In [2]:
from pathlib import Path
import json
import srt
from pyserini.search import SimpleSearcher
from pyserini.analysis import Analyzer, get_lucene_analyzer
from IPython.display import display
from typing import List, Dict

import pandas as pd

from pyNTCIREVAL import Labeler
from pyNTCIREVAL.metrics import MSnDCG

from tqdm.notebook import tqdm

In [3]:
BASE_DIR = Path()
DATA_DIR = BASE_DIR.joinpath("lolcs")
assert BASE_DIR.exists()
assert DATA_DIR.exists()

In [4]:
!head -n2 lolcs/twitch_comments_anonymized.jsonl

{"file_id": "001", "title": "EDG vs.WE | LNG vs. SN - Week 1 Day 1 | LPL Summer Split (2020)", "url": "https://www.twitch.tv/lpl/video/641800502", "start": "03:11:16", "end": "03:46:38", "youtube_start": "0:00:00", "youtube_end": "0:35:22", "comments": [{"id": "001-0000", "name": "648313f9a58f9c60060590a74ee5bf9a", "content": "imagine being lng management weirdchamp", "twitch_time": "3:11:16", "youtube_time": "0:00:00"}, {"id": "001-0001", "name": "7af16905686de26adfb34a1a596f3411", "content": "wp in the eu face off sad deadius was too heavy pepehands", "twitch_time": "3:11:16", "youtube_time": "0:00:00"}, {"id": "001-0002", "name": "3ce231eab308630a056199990313264e", "content": "open the mod", "twitch_time": "3:11:16", "youtube_time": "0:00:00"}, {"id": "001-0003", "name": "7590ae09cf5f21906d27ba80a2b2f18f", "content": "next friday", "twitch_time": "3:11:21", "youtube_time": "0:00:05"}, {"id": "001-0004", "name": "5e92d82294df3391ced66c81f3ef6c2b", "content": "which one did you bet on

In [5]:
twitch_videos = []
with open(DATA_DIR.joinpath("twitch_comments_anonymized.jsonl")) as jsonfile:
  for line in jsonfile:
    twitch_videos.append(json.loads(line))
len(twitch_videos)

20

In [6]:
# display comment data
for key, value in twitch_videos[0].items():
  if key != "comments":
    print(key, value)
print()
for comment in twitch_videos[0]["comments"][:5]:
  display(comment)

file_id 001
title EDG vs.WE | LNG vs. SN - Week 1 Day 1 | LPL Summer Split (2020)
url https://www.twitch.tv/lpl/video/641800502
start 03:11:16
end 03:46:38
youtube_start 0:00:00
youtube_end 0:35:22



{'content': 'imagine being lng management weirdchamp',
 'id': '001-0000',
 'name': '648313f9a58f9c60060590a74ee5bf9a',
 'twitch_time': '3:11:16',
 'youtube_time': '0:00:00'}

{'content': 'wp in the eu face off sad deadius was too heavy pepehands',
 'id': '001-0001',
 'name': '7af16905686de26adfb34a1a596f3411',
 'twitch_time': '3:11:16',
 'youtube_time': '0:00:00'}

{'content': 'open the mod',
 'id': '001-0002',
 'name': '3ce231eab308630a056199990313264e',
 'twitch_time': '3:11:16',
 'youtube_time': '0:00:00'}

{'content': 'next friday',
 'id': '001-0003',
 'name': '7590ae09cf5f21906d27ba80a2b2f18f',
 'twitch_time': '3:11:21',
 'youtube_time': '0:00:05'}

{'content': 'which one did you bet on',
 'id': '001-0004',
 'name': '5e92d82294df3391ced66c81f3ef6c2b',
 'twitch_time': '3:11:22',
 'youtube_time': '0:00:06'}

In [7]:
# save for anserini indexing
collection_dir = Path("collection")
collection_dir.mkdir(exist_ok=True)
MIN_LENGTH = 3
with open(collection_dir.joinpath("comments.jsonl"), "w") as jsonfile:
  for video in twitch_videos:
    file_id = video["file_id"]
    for comment_id, comment in enumerate(video["comments"]):
      # filter too short comment
      if len(comment['content'].split()) < MIN_LENGTH:
        continue
      comment["contents"] = f"{video['title']}\n{comment['content']}"
      jsonfile.write(json.dumps(comment))
      jsonfile.write("\n")

In [8]:
!wc -l collection/comments.jsonl
!head -n2 collection/comments.jsonl

23411 collection/comments.jsonl
{"id": "001-0000", "name": "648313f9a58f9c60060590a74ee5bf9a", "content": "imagine being lng management weirdchamp", "twitch_time": "3:11:16", "youtube_time": "0:00:00", "contents": "EDG vs.WE | LNG vs. SN - Week 1 Day 1 | LPL Summer Split (2020)\nimagine being lng management weirdchamp"}
{"id": "001-0001", "name": "7af16905686de26adfb34a1a596f3411", "content": "wp in the eu face off sad deadius was too heavy pepehands", "twitch_time": "3:11:16", "youtube_time": "0:00:00", "contents": "EDG vs.WE | LNG vs. SN - Week 1 Day 1 | LPL Summer Split (2020)\nwp in the eu face off sad deadius was too heavy pepehands"}


# Indexing using Pyserini

In [9]:
%%time
!mkdir -p index
!python -m pyserini.index \
  -collection JsonCollection \
  -generator DefaultLuceneDocumentGenerator \
  -threads 1 \
  -input collection \
  -index index \
  -storePositions -storeDocvectors -storeRaw -language en

2021-11-01 08:50:41,931 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Setting log level to INFO
2021-11-01 08:50:41,934 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2021-11-01 08:50:41,940 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: collection
2021-11-01 08:50:41,940 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2021-11-01 08:50:41,941 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2021-11-01 08:50:41,941 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 1
2021-11-01 08:50:41,942 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Stemmer: porter
2021-11-01 08:50:41,942 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Keep stopwords? false
2021-11-01 08:50:41,943 INFO  [main] index.IndexCollection (IndexCollection.java:654) - Sto

# Try to Retrieve Comments using Pyserini

In [10]:
youtube_videos = []
with open(DATA_DIR.joinpath("youtube_descriptions.jsonl")) as jsonfile:
  for line in jsonfile:
    youtube_videos.append(json.loads(line))
len(youtube_videos)

20

In [11]:
# display description data
print("key:", list(youtube_videos[0].keys()))
print("num of max descs in one video:", max(len(video["descriptions"]) for video in youtube_videos))
print("file_id:", youtube_videos[0]["file_id"])
print("title:", youtube_videos[0]["title"])
for desc in youtube_videos[0]["descriptions"][:5]:
  print(desc)

key: ['file_id', 'title', 'descriptions']
num of max descs in one video: 2017
file_id: 001
title: EDG vs. WE - Game 1 _ LPL Summer Split 2020 Week 1 _ Edward Gaming vs. Team WE
{'id': '001-0001', 'start': '0:00:00', 'end': '0:00:02', 'content': 'I think this is again w mean looking to'}
{'id': '001-0002', 'start': '0:00:02', 'end': '0:00:04', 'content': 'just get this ball rolling nice and'}
{'id': '001-0003', 'start': '0:00:04', 'end': '0:00:07', 'content': "early against EDG we'll see if they can"}
{'id': '001-0004', 'start': '0:00:07', 'end': '0:00:09', 'content': 'get that ball rolling or if the EDG are'}
{'id': '001-0005', 'start': '0:00:09', 'end': '0:00:10', 'content': 'gonna be able to stop them in their'}


In [12]:
def to_seconds(time_str: str) -> int:
    # A:BB:CC,DDD -> A*3600+BB*60+CC.DDD
    if time_str.count(":") == 2:
        hour_str, minute_str, second_str = time_str.split(":")
    elif time_str.count(":") == 1:
        minute_str, second_str = time_str.split(":")
        hour_str = "0"

    if "," in second_str:
        second_str = second_str.replace(",", ".")
    return int(hour_str) * 3600 + int(minute_str) * 60 + int(float(second_str)+0.5)


MIN_COMMENT_WORDS = 3
MIN_CAND_REL_COMMENTS = 20


def comment_rel_score(tw_time: int, yt_time: int,
                      tw_comment: str, wpm: int = 44) -> float:
    # relevance score of twitch comment
    # see eq. (1) in section 3.2 in detail 
    return tw_time - yt_time - wpm * len(tw_comment.split()) / 60


def filter_and_annotate_description(twitch, youtube) -> None:
    # filter youtube description by twitch time
    # and annotate relevant comment of the description
    descs = []

    twitch_start = to_seconds(twitch["youtube_start"])

    for desc in youtube["descriptions"]:
        desc_end = to_seconds(desc["end"])
        if desc_end < twitch_start:
            continue
        # (relevance, comment_id)
        rel_comments = []
        for comment in twitch["comments"]:
            comment_start = to_seconds(comment["youtube_time"])
            if len(comment["content"].split()) < MIN_COMMENT_WORDS:
                continue
            rel_score = comment_rel_score(
                comment_start, desc_end, comment["content"])
            if rel_score >= 0.0:
                rel_comments.append((rel_score, comment["id"]))
        if len(rel_comments) >= MIN_CAND_REL_COMMENTS:
            rel_comments.sort()
            rel_ids = {}
            rel_comments = rel_comments[:MIN_CAND_REL_COMMENTS]
            for i, (_, comment_id) in enumerate(rel_comments):
                rel_ids[comment_id] = MIN_CAND_REL_COMMENTS - i
            desc["relevant_comments"] = rel_ids
            descs.append(desc)

    youtube["descriptions"] = descs

In [13]:
# filter youtube description by twitch time
# and annotate relevant comments of the descprition
for twitch, youtube in zip(twitch_videos, youtube_videos):
    assert twitch["file_id"] == youtube["file_id"]
    n_descs = len(youtube["descriptions"])
    filter_and_annotate_description(twitch, youtube)
    print("file_id:", twitch["file_id"], "before:", n_descs, "after:", len(youtube["descriptions"]))

file_id: 001 before: 971 after: 962
file_id: 002 before: 929 after: 898
file_id: 003 before: 965 after: 933
file_id: 004 before: 831 after: 809
file_id: 005 before: 815 after: 777
file_id: 010 before: 919 after: 888
file_id: 012 before: 1132 after: 1107
file_id: 030 before: 812 after: 770
file_id: 034 before: 791 after: 752
file_id: 041 before: 1000 after: 780
file_id: 071 before: 1162 after: 956
file_id: 099 before: 1038 after: 903
file_id: 262 before: 1371 after: 1156
file_id: 273 before: 1562 after: 1224
file_id: 275 before: 1715 after: 1352
file_id: 276 before: 1951 after: 1068
file_id: 279 before: 1636 after: 998
file_id: 284 before: 1999 after: 1362
file_id: 998 before: 1154 after: 1007
file_id: 999 before: 2017 after: 1023


In [14]:
# display relevant comment ids and their relevance scores
youtube_videos[0]["descriptions"][0]

{'content': 'I think this is again w mean looking to',
 'end': '0:00:02',
 'id': '001-0001',
 'relevant_comments': {'001-0006': 20,
  '001-0011': 19,
  '001-0014': 18,
  '001-0015': 16,
  '001-0016': 15,
  '001-0018': 17,
  '001-0022': 14,
  '001-0023': 12,
  '001-0025': 11,
  '001-0026': 13,
  '001-0027': 10,
  '001-0028': 9,
  '001-0031': 8,
  '001-0032': 7,
  '001-0034': 6,
  '001-0038': 5,
  '001-0046': 3,
  '001-0047': 1,
  '001-0048': 4,
  '001-0049': 2},
 'start': '0:00:00'}

In [15]:
# let's try to retrieve comments for above description
searcher = SimpleSearcher('index')
query = youtube_videos[0]["descriptions"][10]["content"]
hits = searcher.search(query, k=5)

print("Query:", query)
print()

for i in range(len(hits)):
  doc = searcher.doc(hits[i].docid)
  comment = json.loads(doc.raw())
  print(f'{i+1:02} {hits[i].score:.5f}')
  display(comment)
  print()

Query: go we're on to the rift against team w E

01 5.88690


{'content': 'e w slow xd',
 'contents': 'IG vs. FPX - 2020 LPL Regional Qualifier | LoL China 9th Anniversary (2020)\ne w slow xd',
 'id': '279-0754',
 'name': 'a5470fd7b832156ea814c07316176080',
 'twitch_time': '5:05:23',
 'youtube_time': '0:18:09'}


02 5.82240


{'content': 'this match is w/e shrugs',
 'contents': '2021 LPL Regional Qualifier | RNG vs WE | League of Legends CN 10th Anniversary Day2\nthis match is w/e shrugs',
 'id': '998-2251',
 'name': 'fc737fdded9e01336bb569a24d96eb54',
 'twitch_time': '4:07:09',
 'youtube_time': '0:31:43'}


03 5.75930


{'content': 'swordart missed his e and w',
 'contents': 'EDG vs.WE | LNG vs. SN - Week 1 Day 1 | LPL Summer Split (2020)\nswordart missed his e and w',
 'id': '001-1194',
 'name': '73572420454f2dbb8d61754069ba5dbc',
 'twitch_time': '3:30:25',
 'youtube_time': '0:19:09'}


04 5.75930


{'content': 'swordart missed his e and w',
 'contents': 'EDG vs.WE | LNG vs. SN - Week 1 Day 1 | LPL Summer Split (2020)\nswordart missed his e and w',
 'id': '003-1192',
 'name': '73572420454f2dbb8d61754069ba5dbc',
 'twitch_time': '3:30:25',
 'youtube_time': '0:19:09'}


05 5.27920


{'content': 'lpl 4 team go to world na also 4 team go to world korea also 4 team go to world',
 'contents': 'SN vs. LGD - 2020 LPL Regional Qualifier | LoL China 9th Anniversary (2020)\nlpl 4 team go to world na also 4 team go to world korea also 4 team go to world',
 'id': '273-1324',
 'name': '5d5e9acb29a7852cc1dc7ec17228e5cc',
 'twitch_time': '1:39:05',
 'youtube_time': '0:25:01'}




# Comment Retrieval using Anserini

## Installing Anserini

In [16]:
%%capture
!apt-get install maven -qq

In [17]:
%%capture
!git clone --recurse-submodules https://github.com/castorini/anserini.git
%cd anserini
!mvn clean package appassembler:assemble -DskipTests -Dmaven.javadoc.skip=true
%cd ..

## Create Topic (Query) File

In [18]:
topic_path = Path("topics.tsv")
analyzer = Analyzer(get_lucene_analyzer())
with open(topic_path, "w") as tsvfile:
  for video in youtube_videos:
    file_id = video["file_id"]
    for desc in video["descriptions"]:
      if len(desc["content"].split()) < 2:
        continue
      # filter queries whose length is zero when parsed
      if len(analyzer.analyze(desc["content"])) == 0:
        continue      
      tsvfile.write(f"{desc['id']}\t{desc['content']}\n")
!wc -l topics.tsv
!head -n10 topics.tsv
!tail -n10 topics.tsv

19147 topics.tsv
001-0001	I think this is again w mean looking to
001-0002	just get this ball rolling nice and
001-0003	early against EDG we'll see if they can
001-0004	get that ball rolling or if the EDG are
001-0005	gonna be able to stop them in their
001-0006	tracks and control the snowball it's not
001-0007	again it's a game number one of this
001-0008	series as we see the aurelion sol come
001-0009	through for T Jamar I'm looking at him
001-0010	against scout in the middle a so here we
999-1638	heading towards this top lane <eos>
999-1639	trying to finish this one off and it
999-1640	looks like we will have some extra money
999-1641	going over to charity as the underdogs
999-1642	already looking like the top dogs here
999-1643	in our first game <eos>
999-1644	<eos> <com2> yep we get something good out of it a
999-1645	pretty cool cup
999-1646	from the side of pcs and also some nice
999-1647	charity money


## Searching by Basic Retrieval Models

In [19]:
%%time
!mkdir -p runs
# BM25 (k1=0.9, b=0.4)
!anserini/target/appassembler/bin/SearchCollection -topicreader TsvString -index index \
 -topics topics.tsv -output runs/run.bm25.txt -bm25 -hits 100

2021-11-01 08:54:20,047 INFO  [main] search.SearchCollection (SearchCollection.java:288) - Index: index
2021-11-01 08:54:20,260 INFO  [main] search.SearchCollection (SearchCollection.java:368) - Language: en
2021-11-01 08:54:20,261 INFO  [main] search.SearchCollection (SearchCollection.java:369) - Stemmer: porter
2021-11-01 08:54:20,262 INFO  [main] search.SearchCollection (SearchCollection.java:370) - Keep stopwords? false
2021-11-01 08:54:20,262 INFO  [main] search.SearchCollection (SearchCollection.java:371) - Stopwords file null
2021-11-01 08:54:20,581 INFO  [main] search.SearchCollection (SearchCollection.java:582) - runtag: Anserini
2021-11-01 08:54:20,601 INFO  [pool-2-thread-1] search.SearchCollection$SearcherThread (SearchCollection.java:182) - [Start] ranker: bm25(k1=0.9,b=0.4), reranker: default
2021-11-01 08:54:21,876 INFO  [pool-2-thread-1] search.SearchCollection$SearcherThread (SearchCollection.java:263) - 100 queries processed
2021-11-01 08:54:22,580 INFO  [pool-2-threa

In [20]:
# Query Likelihood with Dirichlet Smoothing (mu=1000)
!anserini/target/appassembler/bin/SearchCollection -topicreader TsvString -index index \
 -topics topics.tsv -output runs/run.qld.txt -qld -hits 100

2021-11-01 08:55:20,093 INFO  [main] search.SearchCollection (SearchCollection.java:288) - Index: index
2021-11-01 08:55:20,458 INFO  [main] search.SearchCollection (SearchCollection.java:368) - Language: en
2021-11-01 08:55:20,459 INFO  [main] search.SearchCollection (SearchCollection.java:369) - Stemmer: porter
2021-11-01 08:55:20,460 INFO  [main] search.SearchCollection (SearchCollection.java:370) - Keep stopwords? false
2021-11-01 08:55:20,461 INFO  [main] search.SearchCollection (SearchCollection.java:371) - Stopwords file null
2021-11-01 08:55:20,700 INFO  [main] search.SearchCollection (SearchCollection.java:582) - runtag: Anserini
2021-11-01 08:55:20,729 INFO  [pool-2-thread-1] search.SearchCollection$SearcherThread (SearchCollection.java:182) - [Start] ranker: qld(mu=1000), reranker: default
2021-11-01 08:55:22,118 INFO  [pool-2-thread-1] search.SearchCollection$SearcherThread (SearchCollection.java:263) - 100 queries processed
2021-11-01 08:55:22,766 INFO  [pool-2-thread-1] s

In [21]:
# Query Likelihood with Jelinek-Mercer Smoothing (lambda=0.1)
!anserini/target/appassembler/bin/SearchCollection -topicreader TsvString -index index \
 -topics topics.tsv -output runs/run.qljm.txt -qljm -hits 100

2021-11-01 08:56:18,222 INFO  [main] search.SearchCollection (SearchCollection.java:288) - Index: index
2021-11-01 08:56:18,439 INFO  [main] search.SearchCollection (SearchCollection.java:368) - Language: en
2021-11-01 08:56:18,440 INFO  [main] search.SearchCollection (SearchCollection.java:369) - Stemmer: porter
2021-11-01 08:56:18,440 INFO  [main] search.SearchCollection (SearchCollection.java:370) - Keep stopwords? false
2021-11-01 08:56:18,442 INFO  [main] search.SearchCollection (SearchCollection.java:371) - Stopwords file null
2021-11-01 08:56:18,674 INFO  [main] search.SearchCollection (SearchCollection.java:582) - runtag: Anserini
2021-11-01 08:56:18,694 INFO  [pool-2-thread-1] search.SearchCollection$SearcherThread (SearchCollection.java:182) - [Start] ranker: qljm(lambda=0.1), reranker: default
2021-11-01 08:56:19,983 INFO  [pool-2-thread-1] search.SearchCollection$SearcherThread (SearchCollection.java:263) - 100 queries processed
2021-11-01 08:56:20,635 INFO  [pool-2-thread-

In [22]:
# Sequential Dependence Model (w/ QLD)
!anserini/target/appassembler/bin/SearchCollection -topicreader TsvString -index index \
 -topics topics.tsv -output runs/run.sdm.txt -qld -sdm -hits 100

2021-11-01 08:57:15,201 INFO  [main] search.SearchCollection (SearchCollection.java:288) - Index: index
2021-11-01 08:57:15,408 INFO  [main] search.SearchCollection (SearchCollection.java:368) - Language: en
2021-11-01 08:57:15,409 INFO  [main] search.SearchCollection (SearchCollection.java:369) - Stemmer: porter
2021-11-01 08:57:15,410 INFO  [main] search.SearchCollection (SearchCollection.java:370) - Keep stopwords? false
2021-11-01 08:57:15,410 INFO  [main] search.SearchCollection (SearchCollection.java:371) - Stopwords file null
2021-11-01 08:57:15,658 INFO  [main] search.SearchCollection (SearchCollection.java:582) - runtag: Anserini
2021-11-01 08:57:15,680 INFO  [pool-2-thread-1] search.SearchCollection$SearcherThread (SearchCollection.java:182) - [Start] ranker: qld(mu=1000), reranker: default
2021-11-01 08:57:17,428 INFO  [pool-2-thread-1] search.SearchCollection$SearcherThread (SearchCollection.java:263) - 100 queries processed
2021-11-01 08:57:18,264 INFO  [pool-2-thread-1] s

In [23]:
!wc -l runs/*.txt

  1825823 runs/run.bm25.txt
  1825823 runs/run.qld.txt
  1825823 runs/run.qljm.txt
  1825823 runs/run.sdm.txt
  7303292 total


# Compute Metrics

## Compute nDCG@$k$ ($k=3, 5, 10, 20)$

In [24]:
def read_run_file(run_path: Path) -> Dict[str, List[str]]:
    # read run file then return ranked list for each query
    query_to_ranking = {}
    with open(run_path) as f:
        for line in f:
            qid, _, doc_id, *_ = line.split()
            ranking = query_to_ranking.get(qid)
            if ranking is None:
                query_to_ranking[qid] = ranking = []
            ranking.append(doc_id)
    return query_to_ranking


def compute_topic_metrics(ranked_list, relevant_comments) -> Dict[str, float]:
    labeler = Labeler(relevant_comments)
    labeled_ranked_list = labeler.label(ranked_list)
    rel_level_num = MIN_CAND_REL_COMMENTS + 1
    xrelnum = labeler.compute_per_level_doc_num(rel_level_num)

    grades = list(range(MIN_CAND_REL_COMMENTS+1))
    cutoffs = [3, 5, 10, 20]
    metrics = [MSnDCG(xrelnum, grades, cutoff) for cutoff in cutoffs]

    topic_result = {}
    for metric in metrics:
        score = metric.compute(labeled_ranked_list)
        topic_result[str(metric)] = score
    return topic_result


def compute_metrics(youtube_videos, topic_to_ranking) -> Dict[str, Dict[str, float]]:
    topic_results = {}
    for youtube in tqdm(youtube_videos):
        for desc in youtube["descriptions"]:
            topic_id = desc["id"]
            if topic_id not in topic_to_ranking:
                continue
            ranked_list = topic_to_ranking[topic_id]
            relevant_comments = desc["relevant_comments"]
            topic_results[topic_id] = compute_topic_metrics(
                ranked_list, relevant_comments)
    return topic_results


def compute_comment_overlap(youtube_videos, twitch_videos,
                            topic_to_ranking, cutoff: int) -> Dict[str, Dict[str, float]]:
    id_to_comments = {}
    for twitch in twitch_videos:
        for comment in twitch["comments"]:
            assert comment["id"] not in id_to_comments
            id_to_comments[comment["id"]] = comment["content"]
    topic_results = {}
    for youtube in tqdm(youtube_videos):
        for desc in youtube["descriptions"]:
            topic_id = desc["id"]
            if topic_id not in topic_to_ranking:
                continue
            retrieved_comment_words = []
            for doc_id in topic_to_ranking[topic_id][:cutoff]:
                retrieved_comment_words.extend(id_to_comments[doc_id].split())
            relevant_comment_words = []
            assert len(desc["relevant_comments"]) == MIN_CAND_REL_COMMENTS
            for doc_id in list(desc["relevant_comments"])[:cutoff]:
                relevant_comment_words.extend(id_to_comments[doc_id])
            retrieved_set = set(retrieved_comment_words)
            relevant_set = set(relevant_comment_words)

            precision = len(retrieved_set & relevant_set) / len(retrieved_set)
            recall = len(retrieved_set & relevant_set) / len(relevant_set)
            if precision + recall == 0.0:
                f1 = 0.0
            else:
                f1 = 2.0 * precision * recall / (precision + recall)
            topic_results[topic_id] = {
                f"Precision@{cutoff}": precision,
                f"Recall@{cutoff}": recall,
                f"F1@{cutoff}": f1,
            }
    return topic_results

In [25]:
run_dir = Path("runs")
run_ranking_results = []
for run_path in sorted(run_dir.glob("*.txt")):
  print(run_path.stem)
  topic_to_ranking = read_run_file(run_path)
  results = compute_metrics(youtube_videos, topic_to_ranking)
  run_ranking_results.append(pd.DataFrame(results).T.mean().to_frame(run_path.stem.split(".")[1].upper()))

run.bm25


  0%|          | 0/20 [00:00<?, ?it/s]

run.qld


  0%|          | 0/20 [00:00<?, ?it/s]

run.qljm


  0%|          | 0/20 [00:00<?, ?it/s]

run.sdm


  0%|          | 0/20 [00:00<?, ?it/s]

In [26]:
result_df = pd.concat(run_ranking_results, axis=1).T
result_df.round(5)

Unnamed: 0,MSnDCG@0003,MSnDCG@0005,MSnDCG@0010,MSnDCG@0020
BM25,0.00193,0.00182,0.00175,0.00209
QLD,0.00211,0.00196,0.00192,0.00221
QLJM,0.00205,0.00181,0.00178,0.00209
SDM,0.00209,0.00196,0.00192,0.0022


## Word Matching between Retrieval Comments & Relevant Comments

In [27]:
run_dir = Path("runs")
cutoffs = [3, 5, 10, 20]
run_word_results = {cutoff: [] for cutoff in cutoffs}
for run_path in sorted(run_dir.glob("*.txt")):
  print(run_path.stem)
  topic_to_ranking = read_run_file(run_path)
  for cutoff in cutoffs:
    print(f"cutoff={cutoff}")
    results = compute_comment_overlap(youtube_videos, twitch_videos, topic_to_ranking, cutoff)
    run_word_results[cutoff].append(pd.DataFrame(results).T.mean().to_frame(run_path.stem.split(".")[1].upper()))

run.bm25
cutoff=3


  0%|          | 0/20 [00:00<?, ?it/s]

cutoff=5


  0%|          | 0/20 [00:00<?, ?it/s]

cutoff=10


  0%|          | 0/20 [00:00<?, ?it/s]

cutoff=20


  0%|          | 0/20 [00:00<?, ?it/s]

run.qld
cutoff=3


  0%|          | 0/20 [00:00<?, ?it/s]

cutoff=5


  0%|          | 0/20 [00:00<?, ?it/s]

cutoff=10


  0%|          | 0/20 [00:00<?, ?it/s]

cutoff=20


  0%|          | 0/20 [00:00<?, ?it/s]

run.qljm
cutoff=3


  0%|          | 0/20 [00:00<?, ?it/s]

cutoff=5


  0%|          | 0/20 [00:00<?, ?it/s]

cutoff=10


  0%|          | 0/20 [00:00<?, ?it/s]

cutoff=20


  0%|          | 0/20 [00:00<?, ?it/s]

run.sdm
cutoff=3


  0%|          | 0/20 [00:00<?, ?it/s]

cutoff=5


  0%|          | 0/20 [00:00<?, ?it/s]

cutoff=10


  0%|          | 0/20 [00:00<?, ?it/s]

cutoff=20


  0%|          | 0/20 [00:00<?, ?it/s]

In [28]:
result_dfs = []
for cutoff in cutoffs:
  result_dfs.append(pd.concat(run_word_results[cutoff], axis=1).T)
display(pd.concat(result_dfs, axis=1).round(4))

Unnamed: 0,Precision@3,Recall@3,F1@3,Precision@5,Recall@5,F1@5,Precision@10,Recall@10,F1@10,Precision@20,Recall@20,F1@20
BM25,0.0268,0.0321,0.0271,0.0271,0.0421,0.0311,0.0269,0.0618,0.036,0.0263,0.0903,0.0397
QLD,0.0254,0.0313,0.0257,0.0266,0.042,0.0305,0.0266,0.0625,0.0358,0.0261,0.0925,0.0396
QLJM,0.0257,0.0324,0.0266,0.0264,0.0422,0.0305,0.0263,0.0609,0.0352,0.0258,0.0882,0.0389
SDM,0.0254,0.0312,0.0257,0.0265,0.0418,0.0304,0.0265,0.0622,0.0357,0.026,0.0922,0.0395
