In [2]:
import pandas as pd
import csv
import json
import numpy as np
from tqdm import tqdm
from pyserini.index import IndexReader
from collections import Counter


In [4]:
!python -m pyserini.index -collection JsonCollection \
                         -generator DefaultLuceneDocumentGenerator \
                         -threads 1 \
                         -input transcripts/resources \
                         -index transcripts/indexes/ \
                         -storePositions -storeDocvectors -storeRaw

2021-11-02 10:24:51,660 INFO  [main] index.IndexCollection (IndexCollection.java:643) - Setting log level to INFO
2021-11-02 10:24:51,661 INFO  [main] index.IndexCollection (IndexCollection.java:646) - Starting indexer...
2021-11-02 10:24:51,662 INFO  [main] index.IndexCollection (IndexCollection.java:648) - DocumentCollection path: transcripts/resources
2021-11-02 10:24:51,662 INFO  [main] index.IndexCollection (IndexCollection.java:649) - CollectionClass: JsonCollection
2021-11-02 10:24:51,662 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Generator: DefaultLuceneDocumentGenerator
2021-11-02 10:24:51,662 INFO  [main] index.IndexCollection (IndexCollection.java:651) - Threads: 1
2021-11-02 10:24:51,662 INFO  [main] index.IndexCollection (IndexCollection.java:652) - Stemmer: porter
2021-11-02 10:24:51,663 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Keep stopwords? false
2021-11-02 10:24:51,663 INFO  [main] index.IndexCollection (IndexCollection.java

In [6]:
index_reader = IndexReader('transcripts/indexes')


In [7]:
idx = range(54936)
len(idx)

54936

In [8]:
terms = {term.term: term.df for term in index_reader.terms()}
len(terms)

15317

In [10]:
dl = {i: len(json.loads(index_reader.doc(str(i)).raw())['contents']) for i in idx if index_reader.doc(str(i))}
len(dl)

54936

In [12]:
avg_dl = np.mean([len(json.loads(index_reader.doc(str(i)).raw())['contents']) for i in idx if index_reader.doc(str(i))])
avg_dl

175.3736347750109

In [13]:
df = {term: (index_reader.get_term_counts(term, analyzer=None))[0] for term in terms.keys()}
df

{'0': 232,
 '0.18': 1,
 '00': 5,
 '01100111': 1,
 '0110111001101111': 1,
 '03': 1,
 '04': 1,
 '0400': 2,
 '05': 3,
 '06': 1,
 '0700': 1,
 '08': 1,
 '0about': 1,
 '1': 6842,
 '1,000': 3,
 '1.1': 1,
 '1.3': 1,
 '1.8': 1,
 '10': 7987,
 '10,000': 1,
 '100': 241,
 '1000': 1,
 '101': 235,
 '102': 232,
 '102.2': 1,
 '103': 231,
 '104': 234,
 '105': 231,
 '106': 231,
 '107': 231,
 '108': 231,
 '109': 231,
 '10pm': 1,
 '10th': 3,
 '11': 2927,
 '11,446': 1,
 '110': 232,
 '111': 231,
 '112': 232,
 '113': 231,
 '114': 232,
 '115': 231,
 '1150': 1,
 '116': 231,
 '117': 231,
 '1175': 2,
 '118': 231,
 '118.31': 1,
 '119': 231,
 '11pm': 1,
 '11th': 1,
 '12': 2591,
 '12.5': 1,
 '12.50': 1,
 '120': 237,
 '121': 231,
 '122': 231,
 '123': 233,
 '124': 231,
 '125': 232,
 '126': 231,
 '127': 231,
 '128': 235,
 '129': 231,
 '12th': 3,
 '13': 2626,
 '130': 235,
 '131': 231,
 '132': 231,
 '133': 231,
 '134': 231,
 '135': 231,
 '136': 231,
 '137': 231,
 '138': 231,
 '139': 231,
 '14': 2598,
 '140': 232,
 '141':

In [15]:
tf = {i: index_reader.get_document_vector(str(i)) for i in idx if index_reader.doc(str(i))}
tf

{0: {'dress': 1,
  'line': 1,
  'howard': 1,
  'scene': 1,
  'topolog': 1,
  'knight': 1,
  'raj': 1,
  'enter': 1,
  'titl': 1,
  'episod': 1,
  'sheldon': 1,
  'lobbi': 1,
  'codpiec': 1,
  'court': 1,
  'line_idx': 1,
  'monk': 1,
  'entranc': 1,
  'jester': 1,
  '0': 1,
  'actor': 1,
  'mediev': 2,
  'leonard': 1,
  '2': 2,
  'build': 1,
  'gui': 1,
  'seri': 1,
  'gentleman': 1},
 1: {'sheldon': 1,
  'line': 1,
  'codpiec': 1,
  'fair': 1,
  'line_idx': 1,
  'actor': 1,
  '1': 1,
  'ever': 1,
  'topolog': 1,
  '2': 2,
  'renaiss': 1,
  'worst': 1,
  'titl': 1,
  'seri': 1,
  'episod': 1},
 2: {'sheldon': 1,
  'line': 1,
  'codpiec': 1,
  'pleas': 1,
  'go': 1,
  'line_idx': 1,
  'actor': 1,
  'leonard': 1,
  'topolog': 1,
  '2': 3,
  'let': 1,
  'titl': 1,
  'seri': 1,
  'episod': 1},
 3: {'german': 1,
  'some': 1,
  'rife': 1,
  'line': 1,
  'spice': 1,
  'mead': 2,
  'best': 1,
  'had': 1,
  'rhineheitsgebot': 1,
  'topolog': 1,
  'her': 1,
  'would': 1,
  'sever': 1,
  'now': 1

In [16]:
N = index_reader.stats()['documents']
N

54936

In [17]:
import pickle


# Open a file and use dump()
with open('file.pickle', 'wb') as file:
    
    # A new file will be created
    pickle.dump(tf, file)
    pickle.dump(N, file)
    pickle.dump(df, file)
    pickle.dump(dl, file)
    pickle.dump(avg_dl, file)
    
    
# Open the file in binary mode
with open('file.pickle', 'rb') as file:

    # Call load method to deserialze
    var_tf = pickle.load(file)
    var_N = pickle.load(file)
    var_df = pickle.load(file)
    var_dl = pickle.load(file)
    var_avg_dl = pickle.load(file)
  

In [18]:
print(var_tf)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [10]:
# k1=1.5, b=0.5, k3=1.2

In [11]:
# k1=1.5, b=0.45, k3=1.2

In [12]:
# k1=1.4, b=0.45, k3=1.1 *

In [19]:
def score_bm25(query, doc_id, k1=1.5, b=0.5, k3=1.2):
    rank_score = 0

    # get query term frequency
    qtf = Counter(query)

    for word in query:
        if tf[doc_id].get(word):
            print(word, df.get(word,0), tf[doc_id][word], dl[doc_id])
            rank_score += np.log((N - df.get(word,0) + 0.5)/(df.get(word,0) + 0.5)) * (k1 + 1) * tf[doc_id][word] / (
                        k1 * (1 - b + b * dl[doc_id] / avg_dl) + tf[doc_id][word]) * (k3 + 1) * qtf[word] / (
                        k3 + qtf[word])

#     print(i,rank_score)
    return rank_score

In [21]:
query_data = pd.read_csv('query.csv')

In [22]:
QueryId = []
DocumentId = []
Score = []
Raw = []
# for qid in tqdm(range(query_data.shape[0])):
for qid in tqdm(range(1)):
    analyzed_query = index_reader.analyze(query_data.loc[qid,'Query Description'])
    rel_score = []
    for docid in idx:
        rel_score.append(score_bm25(analyzed_query, docid))
    rel_score = np.array(rel_score)
    relevant = np.argsort(rel_score)[::-1][:5]  
    Score += [rel_score[r] for r in relevant]
    Raw += [json.loads(index_reader.doc(str(idx[r])).raw())['contents'] for r in relevant]
    QueryId += [query_data.loc[qid,'QueryId']]*5
    DocumentId += [idx[r] for r in relevant]

  0%|          | 0/1 [00:00<?, ?it/s]

what 5253 1 198
what 5253 1 251
big 903 1 251
what 5253 1 154
what 5253 1 210
what 5253 1 183
what 5253 1 157
what 5253 1 162
what 5253 1 169
what 5253 1 130
what 5253 1 130
what 5253 1 267
what 5253 1 130
what 5253 1 215
what 5253 1 161
what 5253 1 172
what 5253 1 136
what 5253 1 184
what 5253 1 128
what 5253 1 225
what 5253 1 132
what 5253 1 171
what 5253 1 307
what 5253 1 112
what 5253 1 201
big 903 1 201
what 5253 1 138
what 5253 2 179
what 5253 1 140
what 5253 1 303
what 5253 1 766
what 5253 1 187
what 5253 1 145
what 5253 1 127
what 5253 1 138
what 5253 1 133
what 5253 1 132
what 5253 1 275
what 5253 1 176
what 5253 1 272
what 5253 1 148
what 5253 1 122
what 5253 1 394
what 5253 1 153
what 5253 2 152
what 5253 1 175
what 5253 1 332
what 5253 1 135
what 5253 1 253
what 5253 1 579
what 5253 2 206
what 5253 1 334
what 5253 1 189
what 5253 1 133
big 903 1 249
what 5253 1 144
what 5253 1 150
what 5253 1 134
what 5253 1 155
what 5253 2 139
big 903 1 137
what 5253 1 168
what 5253 1 188


what 5253 2 139
what 5253 1 141
what 5253 1 196
what 5253 1 187
what 5253 1 296
what 5253 1 146
what 5253 1 153
what 5253 1 149
what 5253 1 150
what 5253 1 147
what 5253 1 277
what 5253 1 199
what 5253 1 226
what 5253 2 258
what 5253 1 209
what 5253 1 197
what 5253 1 231
what 5253 1 153
what 5253 1 218
what 5253 1 138
what 5253 1 154
what 5253 1 263
what 5253 1 207
what 5253 1 132
what 5253 1 189
what 5253 1 160
what 5253 1 127
what 5253 1 168
what 5253 1 132
what 5253 1 166
what 5253 1 282
what 5253 1 170
what 5253 1 174
what 5253 1 138
what 5253 1 145
what 5253 1 125
what 5253 1 333
what 5253 1 180
big 903 1 134
what 5253 1 198
what 5253 1 246
what 5253 1 129
what 5253 1 182
what 5253 1 165
what 5253 1 239
what 5253 1 150
what 5253 1 212
what 5253 1 196
what 5253 1 130
what 5253 1 313
what 5253 1 174
what 5253 1 150
what 5253 1 179
what 5253 1 151
what 5253 1 272
what 5253 1 154
what 5253 1 242
what 5253 1 275
what 5253 1 183
what 5253 1 124
what 5253 1 187
what 5253 1 124
what 5253 

what 5253 2 373
what 5253 2 243
what 5253 1 182
what 5253 1 137
what 5253 1 143
what 5253 1 260
what 5253 1 210
what 5253 1 133
what 5253 1 144
what 5253 1 310
what 5253 1 343
what 5253 1 137
what 5253 1 192
big 903 1 145
big 903 1 218
what 5253 1 144
what 5253 1 134
what 5253 1 173
what 5253 1 122
what 5253 1 164
what 5253 1 144
what 5253 2 163
what 5253 1 187
what 5253 1 214
what 5253 1 197
what 5253 2 303
what 5253 1 255
what 5253 1 242
what 5253 1 282
what 5253 1 557
what 5253 1 137
what 5253 1 130
what 5253 1 227
what 5253 1 150
what 5253 1 143
what 5253 1 174
what 5253 1 233
what 5253 1 166
what 5253 1 136
what 5253 1 203
what 5253 1 129
what 5253 1 264
what 5253 1 133
what 5253 1 233
what 5253 1 142
what 5253 1 133
what 5253 1 143
what 5253 1 399
what 5253 1 139
what 5253 1 212
what 5253 1 123
what 5253 1 134
what 5253 1 153
what 5253 3 253
what 5253 1 194
what 5253 1 227
what 5253 1 148
what 5253 1 133
what 5253 1 122
what 5253 1 124
what 5253 1 137
what 5253 1 192
what 5253 1 

what 5253 1 138
what 5253 1 147
what 5253 1 176
what 5253 1 189
what 5253 1 163
what 5253 1 164
what 5253 1 197
what 5253 1 213
what 5253 1 150
what 5253 1 214
what 5253 1 133
what 5253 1 256
big 903 1 235
what 5253 2 185
what 5253 1 190
what 5253 1 146
what 5253 1 170
what 5253 1 156
what 5253 1 228
what 5253 1 197
what 5253 1 177
what 5253 1 135
what 5253 1 125
what 5253 1 170
what 5253 1 135
what 5253 1 165
what 5253 1 115
what 5253 1 138
what 5253 1 148
what 5253 1 388
what 5253 1 148
what 5253 1 212
what 5253 1 135
what 5253 1 132
what 5253 1 148
what 5253 1 138
what 5253 1 131
what 5253 1 165
what 5253 1 202
what 5253 1 141
what 5253 1 146
what 5253 1 141
what 5253 1 140
what 5253 1 167
big 903 1 167
what 5253 1 161
big 903 1 161
what 5253 1 115
what 5253 1 153
what 5253 1 180
what 5253 1 132
what 5253 1 156
what 5253 1 121
what 5253 1 126
what 5253 1 137
big 903 1 382
what 5253 1 200
what 5253 1 116
what 5253 1 260
what 5253 1 165
what 5253 1 143
what 5253 1 136
what 5253 1 136


what 5253 1 122
what 5253 1 114
what 5253 1 123
what 5253 1 132
what 5253 1 207
what 5253 1 236
what 5253 1 157
what 5253 1 183
big 903 1 133
what 5253 1 231
what 5253 1 113
what 5253 1 143
what 5253 1 134
what 5253 1 113
what 5253 1 234
what 5253 2 480
what 5253 1 124
what 5253 1 205
big 903 1 167
what 5253 1 137
what 5253 1 140
what 5253 1 333
big 903 1 217
big 903 1 209
what 5253 1 233
what 5253 2 229
what 5253 1 194
what 5253 1 154
what 5253 1 134
what 5253 1 231
what 5253 1 174
what 5253 1 143
what 5253 1 137
what 5253 1 207
what 5253 1 149
what 5253 1 164
what 5253 1 124
big 903 1 162
what 5253 1 167
what 5253 1 151
what 5253 1 152
what 5253 1 165
what 5253 1 181
what 5253 1 171
what 5253 1 306
what 5253 1 130
what 5253 1 151
what 5253 1 148
what 5253 1 170
what 5253 1 115
what 5253 1 124
big 903 1 339
what 5253 1 135
what 5253 1 128
what 5253 1 209
what 5253 1 128
what 5253 1 119
what 5253 1 140
what 5253 1 270
what 5253 1 151
what 5253 1 198
what 5253 3 187
big 903 1 231
what 5

what 5253 1 188
what 5253 1 147
what 5253 2 425
what 5253 1 130
what 5253 1 167
what 5253 1 161
what 5253 1 162
what 5253 1 175
what 5253 1 213
what 5253 1 164
what 5253 2 220
what 5253 1 118
what 5253 1 142
what 5253 1 233
what 5253 1 118
what 5253 1 148
what 5253 1 133
what 5253 1 134
what 5253 1 354
what 5253 1 135
what 5253 1 143
what 5253 1 297
what 5253 1 148
what 5253 1 180
what 5253 1 232
what 5253 1 457
what 5253 1 158
what 5253 1 173
what 5253 1 177
what 5253 1 291
what 5253 1 366
what 5253 1 241
big 903 1 194
what 5253 1 135
big 903 1 161
big 903 1 143
what 5253 1 255
big 903 1 294
what 5253 1 140
what 5253 1 157
what 5253 1 132
what 5253 1 260
what 5253 1 339
big 903 1 301
what 5253 1 118
what 5253 1 164
what 5253 1 244
what 5253 1 116
what 5253 1 309
big 903 1 309
what 5253 1 147
what 5253 1 223
what 5253 1 143
what 5253 1 209
what 5253 1 150
what 5253 1 154
big 903 1 133
what 5253 1 121
what 5253 1 129
what 5253 1 146
what 5253 1 209
what 5253 1 156
what 5253 1 267
what 5

what 5253 1 143
what 5253 1 153
what 5253 1 144
what 5253 1 226
what 5253 1 333
what 5253 1 152
what 5253 1 180
what 5253 1 114
what 5253 1 178
what 5253 2 141
what 5253 1 161
what 5253 1 147
what 5253 1 165
big 903 1 165
what 5253 1 179
what 5253 1 167
what 5253 1 135
what 5253 1 128
what 5253 1 142
what 5253 1 134
what 5253 1 165
what 5253 1 125
what 5253 1 221
what 5253 2 307
what 5253 1 134
what 5253 1 179
what 5253 1 161
what 5253 1 200
what 5253 1 144
what 5253 1 197
what 5253 1 234
what 5253 1 125
what 5253 1 224
what 5253 1 138
what 5253 1 140
big 903 1 173
what 5253 1 155
what 5253 1 208
what 5253 1 158
what 5253 1 135
what 5253 1 175
what 5253 1 188
what 5253 1 185
what 5253 1 184
what 5253 1 164
what 5253 1 145
what 5253 1 136
what 5253 1 201
what 5253 1 329
what 5253 1 130
what 5253 1 377
what 5253 1 251
what 5253 1 191
what 5253 1 151
what 5253 1 235
what 5253 1 144
what 5253 1 168
what 5253 1 136
what 5253 1 152
what 5253 1 143
what 5253 1 228
what 5253 1 123
what 5253 1 

100%|██████████| 1/1 [00:03<00:00,  3.17s/it]

 5253 1 218
what 5253 1 182
what 5253 1 501
what 5253 1 121
what 5253 1 131
what 5253 1 144
what 5253 1 202
what 5253 1 178
what 5253 1 189
what 5253 1 305
what 5253 1 167
what 5253 1 161
what 5253 1 317
what 5253 1 142
what 5253 1 229
what 5253 1 124
what 5253 1 147
what 5253 1 168
what 5253 1 166
what 5253 2 190
what 5253 1 178
what 5253 1 123
what 5253 1 152
what 5253 1 192
what 5253 1 123
what 5253 1 139
what 5253 1 146
what 5253 1 140
what 5253 1 173
what 5253 1 134
what 5253 1 145
what 5253 1 133
what 5253 1 158
what 5253 1 217
big 903 1 138
what 5253 1 232
what 5253 1 148
what 5253 1 129
big 903 1 261
what 5253 1 180
what 5253 1 226
what 5253 1 179
what 5253 1 131
what 5253 1 166
what 5253 1 165
what 5253 1 184
what 5253 1 216
what 5253 1 159
big 903 1 167
what 5253 1 133
what 5253 1 152
big 903 1 322
what 5253 1 152
big 903 1 161
what 5253 1 147
what 5253 1 161
what 5253 1 136
what 5253 1 122
what 5253 1 121
what 5253 1 152
what 5253 1 124
what 5253 1 144
what 5253 1 149
what 5




In [23]:
result = pd.DataFrame({'QueryId':QueryId, 'DocumentId':DocumentId, 'Score':Score, 'Raw':Raw})
result

Unnamed: 0,QueryId,DocumentId,Score,Raw
0,0,15225,12.822136,"{""line"": ""THE BIG BANG THEORY"", ""actor"": ""Capt..."
1,0,38444,11.985246,"{""line"": ""Following a \u201cPreviously on The ..."
2,0,5707,11.944216,"{""line"": ""Following a \u201cPreviously on The ..."
3,0,46936,11.835083,"{""line"": ""Ay-yi-yi, bang-bang."", ""actor"": ""Jam..."
4,0,12068,10.740316,"{""line"": ""The blogosphere is a-buzzing with ne..."


In [24]:
result_final = result[['QueryId','Raw']]
result_final.head(10)

Unnamed: 0,QueryId,Raw
0,0,"{""line"": ""THE BIG BANG THEORY"", ""actor"": ""Capt..."
1,0,"{""line"": ""Following a \u201cPreviously on The ..."
2,0,"{""line"": ""Following a \u201cPreviously on The ..."
3,0,"{""line"": ""Ay-yi-yi, bang-bang."", ""actor"": ""Jam..."
4,0,"{""line"": ""The blogosphere is a-buzzing with ne..."


In [25]:
result_final.to_csv('sample_result.csv',index=False)