<h1>EECS 549 Homework 5</h1>

Haley Johnson

In [1]:
import os
import gzip
import json
import pickle
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from utils import load_person_attributes, get_docid_to_categories, attribute_eval_pipeline, make_df
#, make_df
from relevance import *

from ranker import BM25, CrossEncoderScorer, Ranker
from vector_ranker import VectorRanker
from document_preprocessor import RegexTokenizer
from l2r import L2RFeatureExtractor, L2RRanker

In [2]:
# NOTE(review): this definition shadows the `make_df` imported from `utils`
# in the import cell; keep only one of the two once they are confirmed equal.
def make_df(results, name: str):
    """Turn a ranker's result list into a tidy DataFrame.

    Args:
        results: Two-column data accepted by ``pd.DataFrame`` (for example a
            list of ``(docid, score)`` pairs). May be empty or ``None``.
        name: Label written to every row of the ``ranker`` column.

    Returns:
        DataFrame with columns ``docid``, ``score``, ``rank`` and ``ranker``.
        ``rank`` is the dense rank of ``score`` in descending order, so the
        highest score gets 1.0 and tied scores share a rank.
    """
    df = pd.DataFrame(results)
    if len(df) == 0 and df.shape[1] == 0:
        # An empty result list produces a frame with zero columns, and
        # assigning two labels to it raises a length-mismatch ValueError.
        # Build the empty frame with the expected schema instead.
        df = pd.DataFrame({'docid': pd.Series(dtype=object),
                           'score': pd.Series(dtype=float)})
    else:
        df.columns = ['docid', 'score']
    df['rank'] = df['score'].rank(method = 'dense', ascending = False)
    df['ranker'] = name
    return df

<h2>Load Data</h2>

In [3]:
# Load the gzipped JSON-lines dataset into a DataFrame (one record per line).
# NOTE(review): `df` is not referenced again in the visible cells, and the next
# cell re-reads the same file into `docs` — confirm this load is still needed.
df = pd.read_json("../wikipedia_200k_dataset.jsonl.gz", lines=True)

In [4]:
# Stream the gzipped JSON-lines file one record at a time and keep only the
# fields used below, stored as (docid, title, text) tuples.
docs = []
with gzip.open("../wikipedia_200k_dataset.jsonl.gz") as dataset_file:
    for raw_line in dataset_file:
        record = json.loads(raw_line)
        docs.append((record['docid'], record['title'], record['text']))

In [5]:
# Tokenizer shared by the cells below: regex pattern \w+ with lowercase=True.
document_preprocessor = RegexTokenizer("\\w+", lowercase = True)

In [6]:
# Document vectors loaded from disk; later handed to VectorRanker together
# with row_to_docid.
encoded_docs = np.load("../wiki-200k-vecs.msmarco-MiniLM-L12-cos-v5.npy")

# Docids in the order the documents were read.
# assumes row i of encoded_docs corresponds to docs[i] — TODO confirm
row_to_docid = [docid for docid, _title, _text in docs]

# docid -> first 500 tokens of the document text, re-joined with single spaces
# (this mapping is what CrossEncoderScorer is constructed with).
raw_text_dict = {}
for docid, _title, text in docs:
    tokens = document_preprocessor.tokenize(text)
    raw_text_dict[docid] = " ".join(tokens[:500])

<h2>Problem 5</h2>

In [7]:
# Load the per-person attribute mapping, then reshape it into one row per
# entry: from_dict puts the mapping's keys on the columns, .T flips them onto
# the index, and reset_index moves them into the first column.
# presumably the mapping is keyed by docid — verify against load_person_attributes
person_attribute_map = load_person_attributes("../person-attributes.csv", "../eval/common_attributes.csv")
doc_attributes = pd.DataFrame.from_dict(person_attribute_map).T.reset_index()
doc_attributes.columns = ['docid', 'title', 'ethnicity', 'gender', 'religion', 'politics']

In [8]:
# Read the common-attributes table and drop every row whose attribute_label
# is the literal string 'attribute_label'.
# presumably these are header lines repeated inside the CSV — TODO confirm
attribute_table = pd.read_csv("../eval/common_attributes.csv")
is_header_row = attribute_table['attribute_label'] == 'attribute_label'
common_attributes = attribute_table[~is_header_row]

In [9]:
# Sorted list of the distinct attribute_label values (np.unique returns the
# unique values in sorted order).
attributes = list(np.unique(common_attributes['attribute_label'].values))

<h3>IR System Set Up</h3>

In [10]:
# Stopwords: every whitespace-separated token in the stopword file.
with open("../stopwords.txt") as f:
    words = f.read()
    stopwords = words.split()

# NOTE(review): unlike the other inputs this path has no "../" prefix, so it is
# resolved relative to the notebook's working directory — confirm intended.
network_features = pd.read_csv("network_stats.csv")

# One category per line. readlines() would keep the trailing "\n" on every
# entry, so the names could never match a document's categories and would
# leak newlines into all_feature_names; strip them and skip blank lines.
with open("../recognized_categories.csv") as f:
    recognized_categories = [line.strip() for line in f if line.strip()]

# NOTE(review): this reads the uncompressed ".jsonl" while the other cells read
# "../wikipedia_200k_dataset.jsonl.gz" — confirm the file exists / is intended.
docid_to_categories = get_docid_to_categories("../wikipedia_200k_dataset.jsonl")

# docid -> {column name: value} for every remaining column of network_stats.csv.
docid_to_network_features = network_features.set_index('docid').to_dict(orient = 'index')

# Human-readable names of the hand-built features, followed by one name per
# recognized category.
# presumably in the same order the feature extractor emits them — TODO confirm
feature_names =  ['doc length', 'title length', 'query length', 'doc term frequency', 'doc tf-idf',
                  'title term frequency', 'title tf-idf', 'document BM25', 'document pivoted normalization',
                  'document pagerank', 'document hub score', 'document authority score', 'uniqueness ratio']

all_feature_names = feature_names + recognized_categories

# Cross-encoder scorer built over the truncated document texts.
cross_encoder = CrossEncoderScorer(raw_text_dict)

In [None]:
# Restore the pickled document-body and title indexes.
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# load index files that were produced locally / come from a trusted source.
with open("../index", "rb") as index_file:
    index = pickle.load(index_file)

with open("../title_index", "rb") as title_index_file:
    title_index = pickle.load(title_index_file)

In [None]:
# Feature extractor for the learning-to-rank model, wired to both indexes, the
# category and network-feature lookups, the tokenizer, the stopword list and
# the cross-encoder scorer created above.
feature_extractor = L2RFeatureExtractor(index, title_index, docid_to_categories, document_preprocessor, stopwords, 
                                        recognized_categories, docid_to_network_features, cross_encoder)

In [None]:
# Vector ranker over the loaded document vectors and their row -> docid mapping.
# presumably the first argument names the bi-encoder model that produced the
# vectors in the .npy file — TODO confirm
vector_ranker = VectorRanker('sentence-transformers/msmarco-MiniLM-L12-cos-v5', encoded_docs, row_to_docid)

In [None]:
# Build the learning-to-rank ranker from the indexes, tokenizer, stopwords,
# vector ranker and feature extractor, then train it on the HW5 training file.
l2r = L2RRanker(index, title_index, document_preprocessor, stopwords, vector_ranker, feature_extractor)
l2r.train("../hw5_relevance.train.csv")

<h3>Query 'person'</h3>

In [None]:
# Run the attribute evaluation pipeline (imported from utils) for the query
# 'person' using the trained L2R ranker; the result is kept in person_df.
person_df = attribute_eval_pipeline('person', l2r, doc_attributes, attributes, common_attributes)

<h3>Query 'woman'</h3>

In [None]:
# Same attribute evaluation pipeline as for 'person', run for the query 'woman'.
woman_df = attribute_eval_pipeline('woman', l2r, doc_attributes, attributes, common_attributes)

<h2>Problem 6</h2>

In [None]:
# Load the held-out relevance judgments used by the Problem 6 runs.
# load_true_relevance is not imported by name; presumably it comes from
# `from relevance import *` — verify.
# NOTE(review): the ranker is trained on ../hw5_relevance.train.csv but this
# reads an hw4 test file — confirm this is the intended test set.
queries_to_judgements = load_true_relevance("../hw4_relevance.test.csv")

In [None]:
# Relevance evaluation with mmr_lambda = 0.3.
# presumably the CSV path is where results are written and 10 is a rank
# cutoff — TODO confirm against run_relevance_tests
lambda_03 = run_relevance_tests(queries_to_judgements, '../eval/relevance_lambda_03.csv', 10, mmr_lambda = 0.3)

In [None]:
# Relevance evaluation with mmr_lambda = 0.5. The original cell was a copy of
# the lambda = 0.3 run (same CSV path, same mmr_lambda), so the 0.5 setting was
# never evaluated; both the path and the lambda now match the variable name.
# presumably the CSV path is where results are written and 10 is a rank
# cutoff — TODO confirm against run_relevance_tests
lambda_05 = run_relevance_tests(queries_to_judgements, '../eval/relevance_lambda_05.csv', 10, mmr_lambda = 0.5)