In [1]:
# coding: utf-8
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import nltk
import re
import os
import math
import json
import sys
import time
import json
from PorterStemmer import *
#from nltk.stem import PorterStemmer
#from nltk.tokenize import word_tokenize
ps = PorterStemmer()
from tqdm.notebook import tqdm
import random
random.seed(42)

In [2]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [3]:
delim = '''[ ',(){}.:;"`\n]'''

In [4]:
data_dir = "C:\Files\col764-a2-release\col764-ass2-release\data"

In [5]:
metadata = os.path.join(data_dir,"metadata.csv")

In [6]:
meta_df = pd.read_csv(metadata)
meta_df = meta_df[['cord_uid','title','abstract','pdf_json_files','pmc_json_files']]
#meta_df = meta_df.head(1000)
meta_df['pdf_json_files'] = meta_df['pdf_json_files'].astype(str)
meta_df['pmc_json_files'] = meta_df['pmc_json_files'].astype(str)

  interactivity=interactivity, compiler=compiler, result=result)


In [7]:
def getTokensFromText(text):
    words = re.split(delim, text)
    res = []
    for w in words:
        if(len(w) > 2):
            #temp = ps.stem(w.lower(), 0, len(w)-1)
            temp = w.lower()
            if temp not in stop_words and not re.search('[0-9]+',temp):
                res.append(temp)
    return res

In [8]:
topics_file = "covid19-topics.xml"
top100_file= "t40-top-100.txt"
relevance_file = "t40-qrels.txt"

In [9]:
def get_queries():
    with open(os.path.join(data_dir,topics_file), "r") as file:
        content = file.readlines()
        content = "".join(content)
        bs_content = BeautifulSoup(content, "lxml")
        queries = bs_content.find_all("query")
        all_text = []
        for q in queries:
            all_text.append(q.text)
        return all_text

In [10]:
def get_relevances():
    rel_dict = {}
    with open(os.path.join(data_dir,relevance_file),"r") as file:
        for x in file:
            line = x.split(" ")
            qid = int(line[0])
            docid = line[2]
            rel = int(line[3].replace("\n",""))
            if qid not in rel_dict:
                rel_dict[qid] = {docid:rel}
            else:
                rel_dict[qid][docid] = rel
    return rel_dict

In [11]:
def get_top100():
    top100_dict = {}
    with open(os.path.join(data_dir,top100_file),"r") as file:
        for x in file:
            line = x.split(" ")
            if(len(line)>1):
                qid = int(line[0])
                docid = line[2]
                if qid not in top100_dict:
                    top100_dict[qid] = [docid]
                else:
                    top100_dict[qid].append(docid)
    return top100_dict

In [12]:
top100_dict = get_top100()
rel_dict = get_relevances()

In [13]:
def get_tf_dict(docid):
    tf_dict = {}
    paths = filemap[docid]
    alltext = ""
    for path in paths.split("; "):
        if(path == "nan"):
            continue
        datapath = os.path.join(data_dir,path)
        with open(datapath) as data:
            data_dict = json.load(data)
            for section in data_dict['body_text']:
                section_text = section['text']
                alltext+=section_text
                alltext+=" "
    tokens = getTokensFromText(alltext)
    for token in tokens:
        if token not in tf_dict:
            tf_dict[token]=1
        else:
            tf_dict[token]+=1
    return tf_dict

In [14]:
with open("filemap.json", "r") as jsonfile:
    filemap = json.load(jsonfile)

In [15]:
def getFreqMap(qno):
    # tf map from word->doc-> freq
    top100docs = top100_dict[qno]
    freq_map = {}
    doclen_map ={}
    for doc in (top100docs):
        tf_dict = get_tf_dict(doc)
        total = 0
        for entry in tf_dict.items():
            token = entry[0]
            freq = entry[1]
            total +=freq
            if token not in freq_map:
                freq_map[token] = {}
            freq_map[token][doc] = freq
        doclen_map[doc] = total
    vocab = list(freq_map.keys())
    
    return freq_map,doclen_map

In [16]:
freq_map,doclen_map = getFreqMap(2)
len(list(freq_map.keys()))

26343

In [17]:
def getProb_dict(qno,mu):
    freq_map,doclen_map = getFreqMap(qno)
    vocab = list(freq_map.keys())
    # map word->doc->prob
    prob_map = {}
    tf_dict = {}
    for w in vocab:
        tot = 0
        for doc in freq_map[w].keys():
            t1 = freq_map[w][doc]
            tot+=t1
        tf_dict[w] = tot
    for w in vocab:
        for doc in freq_map[w].keys():
            t1 = freq_map[w][doc]
            t2 = tf_dict[w]
            prob = (t1+mu*t2)/(doclen_map[doc] + mu)
            if w not in prob_map:
                prob_map[w] = {}
            prob_map[w][doc] = prob
    return prob_map

In [165]:
getProb_dict(1,0.6)['weather']

{'jm18lj5t': 0.0012555342628692262, '8ywd6j2b': 0.0011781536648994362}

In [18]:
q_map = {}
for i,q_orig in (enumerate(get_queries())):
    q = q_orig.lower()
    q_map[i+1] = q

In [19]:
def getTopkExp(qno,k,mu):
    prob_map = getProb_dict(qno,mu)
    q = q_map[qno]
    top100docs = top100_dict[qno]
    qtokens = getTokensFromText(q)
    # map from word -> finProb
    finmap = {}
    vocab = list(prob_map.keys())
    for w in vocab:
        total = 0
        for doc in top100docs:
            prob = 1
            for qi in qtokens:
                if doc in prob_map[qi]:
                    prob*=prob_map[qi][doc]
                else:
                    prob = 0
            if doc in prob_map[w]:
                prob*=prob_map[w][doc]
            else:
                prob = 0
            total += prob
        finmap[w] = total
    sorted_tokens = dict(sorted(finmap.items(), key=lambda x: x[1],reverse = True))
    sorted_tokens = list(sorted_tokens.keys())
    return sorted_tokens[:k]

In [20]:
def getTopkExpR2(qno,k,mu):
    prob_map = getProb_dict(qno,mu)
    q = q_map[qno]
    top100docs = top100_dict[qno]
    qtokens = getTokensFromText(q)
    # map from word -> finProb
    finmap = {}
    vocab = list(prob_map.keys())
    for w in vocab:
        total = 0
        for doc in top100docs:
            if doc in prob_map[w]:
                total+=prob_map[w][doc]
        p_w = total
        prob = p_w * p_w
        
        for qi in qtokens:
            total = 0
            for doc in top100docs:
                temp = 1
                if doc in prob_map[w]:
                    temp*= prob_map[w][doc]
                else:
                    temp*=0
                if doc in prob_map[qi]:
                    temp*=prob_map[qi][doc]
                else:
                    temp*=0
                total+=temp
            prob*=total
        finmap[w] = total
    sorted_tokens = dict(sorted(finmap.items(), key=lambda x: x[1],reverse = True))
    sorted_tokens = list(sorted_tokens.keys())
    return sorted_tokens[:k]

In [120]:
getTopkExp(30,20,0.6)

HBox(children=(FloatProgress(value=0.0), HTML(value='')))




['patients',
 'remdesivir',
 'treatment',
 'clinical',
 'virus',
 'viral',
 'rna',
 'antiviral',
 'coronavirus',
 'drug',
 'respiratory',
 'drugs',
 'disease',
 'severe',
 'infection',
 'used',
 'also',
 'use',
 'activity',
 'results']

In [182]:
getTopkExpR2(2,20,0.6)

['virus',
 'temperature',
 'may',
 'weather',
 'cases',
 'also',
 'data',
 'health',
 'disease',
 'study',
 'number',
 'time',
 'climate',
 'conditions',
 'new',
 'transmission',
 'change',
 'population',
 'respiratory',
 'used']

In [112]:
q_map

{1: 'coronavirus origin',
 2: 'coronavirus response to weather changes',
 3: 'coronavirus immunity',
 4: 'how do people die from the coronavirus',
 5: 'animal models of covid-19',
 6: 'coronavirus test rapid testing',
 7: 'serological tests for coronavirus',
 8: 'coronavirus under reporting',
 9: 'coronavirus in canada',
 10: 'coronavirus social distancing impact',
 11: 'coronavirus hospital rationing',
 12: 'coronavirus quarantine',
 13: 'how does coronavirus spread',
 14: 'coronavirus super spreaders',
 15: 'coronavirus outside body',
 16: 'how long does coronavirus survive on surfaces',
 17: 'coronavirus clinical trials',
 18: 'masks prevent coronavirus',
 19: 'what alcohol sanitizer kills coronavirus',
 20: 'coronavirus and ace inhibitors',
 21: 'coronavirus mortality',
 22: 'coronavirus heart impacts',
 23: 'coronavirus hypertension',
 24: 'coronavirus diabetes',
 25: 'coronavirus biomarkers',
 26: 'coronavirus early symptoms',
 27: 'coronavirus asymptomatic',
 28: 'coronavirus hy

In [21]:
def getQueryVector(query,qno,k,mu,vocab):
    tokens = getTokensFromText(query)
    tokens.extend(getTopkExp(qno,k,mu))
    query_tf_dict = {}
    for token in tokens:
        if token not in query_tf_dict:
            query_tf_dict[token]=1
        else:
            query_tf_dict[token]+=1
    vec = np.zeros(len(vocab))
    freq_map,doclen_map = getFreqMap(qno)
    for word,tf in query_tf_dict.items():
        if word in vocab.keys():
            pos = vocab[word]
            normtf = 1 + math.log(tf,2)
            #normidf = get_idf(word)
            idf = len(freq_map[word])
            normidf = math.log(1+100/idf,2)
            tfidf = normtf * normidf
            vec[pos] = tfidf
    return vec

In [22]:
vocab_dict = {}
for i,q_orig in tqdm(enumerate(get_queries())):
    qno = i+1
    freq_map,doclen_map = getFreqMap(qno)
    vocab = {}
    for i,word in enumerate(freq_map.keys()):
        vocab[word] = i
    vocab_dict[qno] = vocab

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [170]:
def LM_updateQueries(k,mu):
    updated_queries = []
    for i,q_orig in tqdm(enumerate(get_queries())):
        q = q_orig.lower()
        qno = i+1
        vocab = vocab_dict[qno]
        qv0 = getQueryVector(q,qno,k,mu,vocab)
        updated_queries.append(qv0)
    return updated_queries

In [173]:
updated_queries = LM_updateQueries(20,0.6)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




In [172]:
def getDocVector(docid,vocab,freq_map):
    tf_dict = get_tf_dict(docid)
    vec = np.zeros(len(vocab))
    for word,tf in tf_dict.items():
        if word in vocab:
            pos = vocab[word]
            normtf = 1 + math.log(tf,2)
            idf = len(freq_map[word])
            normidf = math.log(1+100/idf,2)
            tfidf = normtf * normidf
            vec[pos] = tfidf
    return vec

In [185]:
def getNDCG(qno,docs):
    def discount_helper(arr):
        score = 0
        for i,e in enumerate(arr):
            score += e/math.log(i+2,2)
        return score
    scores = []
    for doc in docs:
        if doc in rel_dict[qno]:
            scores.append(rel_dict[qno][doc])
        else:
            scores.append(0)
    origscore = discount_helper(scores)
    sortedscore = discount_helper(sorted(scores,reverse=True))
    if sortedscore == 0:
        return 0
    return origscore/sortedscore

In [177]:
def getSim(queryVec,docVec):
    if np.linalg.norm(queryVec) == 0 or np.linalg.norm(docVec)==0:
        return 0.0
    return np.dot(queryVec,docVec)/(np.linalg.norm(queryVec) * np.linalg.norm(docVec))

In [174]:
def LM_getNDCG(updated_queries):
    total = 0
    allscores = []
    for i,q in tqdm(enumerate(updated_queries)):
        qno = i+1
        freq_map,doclen_map = getFreqMap(qno)
        doc_score_dict = {}
        top100docs = top100_dict[qno]
        vocab = vocab_dict[qno]
        for doc in top100docs:
            score = getSim(q,getDocVector(doc,vocab,freq_map))
            doc_score_dict[doc] = score
        sorted_docs = dict(sorted(doc_score_dict.items(), key=lambda x: x[1],reverse = True))
        sorted_docs = list(sorted_docs.keys())
        ndcg = getNDCG(qno,sorted_docs)
        allscores.append(ndcg)
        total +=ndcg
    return allscores,total/len(updated_queries)

In [184]:
LM_getNDCG(updated_queries)

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




([0.8364713928251899,
  0.8490650495555893,
  0.7459413854057952,
  0.25711046737534016,
  0.6642047612890934,
  0.9241256794184907,
  0.9305632475767501,
  0.38097179678159365,
  0,
  0.8151962527342926,
  0.5099261368517586,
  0.7199773775799176,
  0.4583099850885371,
  0.7780650091691769,
  0.5138749241456431,
  0.7845182542274431,
  0.8599859952763825,
  0.9282851645109501,
  0.6549040069641665,
  0.9655616430301298,
  0.8107292573564864,
  0.8928692285678204,
  0.8834844738000237,
  0.9273835820616646,
  0.8205136415143893,
  0.8503490718309931,
  0.932127583573335,
  0.9428005952239759,
  0.8516524622049824,
  0.8831539646074478,
  0.3520754855748004,
  0.2839494431967705,
  0.7196372486008927,
  0.6190450953833722,
  0.6640565901693796,
  0.9767514386170004,
  0.7783355860841272,
  0.9453382783356529,
  0.9664697328709919,
  0.8378654121803345],
 0.7378911675390171)

In [23]:
a = [['a','b'],['c','d']]

In [26]:
for i,ex in enumerate(a):
    ll = ""
    for e in ex:
        ll+=e
        ll+=","
    l = str(i+1) + " : "+ ll[:-1]
    print(l)

1 : a,b
2 : c,d
