# Get QREL dictionary for reference

In [1]:
# Path to the qrels (relevance judgement) file that is passed to retrieve_qrel.
QREL_FILE = '../HW1/trec_eval/qrels.adhoc.51-100.AP89.txt'

def retrieve_qrel(QREL_FILE):
    """Parse a qrels file into a per-query document map and a relevant-doc list.

    Every non-blank line is split on whitespace: field 0 is the query number,
    field 2 the document id and field 3 the relevance judgement.

    Args:
        QREL_FILE: Path to the qrels file (decoded as ISO-8859-1).

    Returns:
        A ``(qrel_dict, relevants)`` tuple.

        ``qrel_dict`` maps each query number (int) to the list of every
        document id judged for that query, whatever the judgement was.

        ``relevants`` is a flat list of the document ids judged exactly 1 for
        one of the 25 selected queries.  It is not keyed by query, so an id
        judged relevant for several queries appears several times.

    Side effects:
        Prints the number of selected queries.
    """
    queryno_list = [59, 77, 94, 85, 95, 91, 56, 71, 64, 62, 93, 99, 58, 54, 87, 100, 89, 61, 68, 63, 57, 97, 98, 60, 80]
    print(len(queryno_list))
    # Set lookup keeps the per-line query filter O(1).
    selected_queries = set(queryno_list)

    qrel_dict = {}
    relevants = []
    with open(QREL_FILE, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            items = line.split()
            if not items:
                # A blank line (e.g. a trailing newline at end of file) would
                # otherwise raise IndexError on items[0].
                continue
            queryno, docno, relevance = int(items[0]), items[2], float(items[3])
            if relevance == 1 and queryno in selected_queries:
                relevants.append(docno)
            qrel_dict.setdefault(queryno, []).append(docno)
    return qrel_dict, relevants

# Get relevant documents

In [2]:
# Score files, one per retrieval model, that are passed to relevant_docs.
OKAPI_BM25 = './rel_results/okapi-bm25.txt'
OKAPI_TF = './rel_results/okapi-tf.txt'
TF_IDF = './rel_results/tf-idf.txt'

def file_info(TEST_FILE, qnum):
    """Collect the scored documents of one query from a result file.

    Every non-blank line is split on whitespace: field 0 is the query number,
    field 2 the document id and field 4 the score.

    Args:
        TEST_FILE: Path to the result file (decoded as ISO-8859-1).
        qnum: Query number (int) whose lines should be kept.

    Returns:
        A ``(id_pairs, scores)`` tuple.  ``id_pairs`` lists the
        ``"<queryno>-<docno>"`` keys in file order; ``scores`` maps each key
        to its float score (the last occurrence wins if a key repeats).
    """
    id_pairs, scores = [], {}
    with open(TEST_FILE, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            items = line.split()
            if not items:
                # Skip blank lines instead of failing on items[0].
                continue
            queryno, docno, score = int(items[0]), items[2], float(items[4])
            if queryno == qnum:
                # format() rather than str(): later cells of this notebook
                # rebind the name ``str`` at module level.
                id_pair = '{}-{}'.format(queryno, docno)
                id_pairs.append(id_pair)
                scores[id_pair] = score
    return id_pairs, scores

def relevant_docs(OKAPI_BM25, OKAPI_TF, TF_IDF, relevance):
    """Build labelled feature tuples for the documents in the three score files.

    For each of the 20 training queries, every document listed in the BM25
    file gets a ``(bm25, tf, tfidf, label)`` tuple, where ``label`` is 1 when
    the document id occurs in ``relevance`` and 0 otherwise.

    Args:
        OKAPI_BM25: Path to the Okapi BM25 score file.
        OKAPI_TF: Path to the Okapi TF score file.
        TF_IDF: Path to the TF-IDF score file.
        relevance: Iterable of document ids considered relevant.

    Returns:
        A ``(len_map, rel_docs)`` tuple.  ``len_map`` maps each query number
        to the number of documents found for it in the BM25 file; ``rel_docs``
        maps ``"<queryno>-<docno>"`` to its feature tuple.

    Raises:
        KeyError: If a document of the BM25 file is missing from the TF or
            TF-IDF file for the same query.
    """
    queryno_list = [59, 77, 85, 91, 56, 71, 64, 62, 93, 58, 54, 87, 100, 89, 61, 68, 63, 57, 60, 80]
    # Build the lookup once; testing membership in a list is O(n) per document.
    relevant_ids = set(relevance)
    rel_docs, len_map = {}, {}

    for qnum in queryno_list:
        bm25_list, bm25_dict = file_info(OKAPI_BM25, qnum)
        _, tf_dict = file_info(OKAPI_TF, qnum)
        _, idf_dict = file_info(TF_IDF, qnum)
        len_map[qnum] = len(bm25_list)
        for id_pair in bm25_list:
            # Strip the "<queryno>-" prefix instead of assuming that every
            # document id is exactly 13 characters long.
            docno = id_pair.split('-', 1)[1]
            label = 1 if docno in relevant_ids else 0
            rel_docs[id_pair] = (bm25_dict[id_pair], tf_dict[id_pair], idf_dict[id_pair], label)
    return len_map, rel_docs

# Get non-relevant documents

In [3]:
import random

# Score files, one per retrieval model, that are passed to non_relevant_docs.
NON_OKAPI_BM25 = './non_rel_results/okapi-bm25.txt'
NON_OKAPI_TF = './non_rel_results/okapi-tf.txt'
NON_TF_IDF = './non_rel_results/tf-idf.txt'

def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    The order (and any duplicates) of ``lst1`` is preserved.

    Args:
        lst1: Sequence whose items are filtered.
        lst2: Collection the items are looked up in.

    Returns:
        A new list with the common items.
    """
    try:
        # A set makes each lookup O(1) instead of scanning lst2 every time.
        lookup = set(lst2)
        return [value for value in lst1 if value in lookup]
    except TypeError:
        # Unhashable items cannot go through a set; fall back to a list scan.
        return [value for value in lst1 if value in lst2]

def non_relevant_docs(len_map, NON_OKAPI_BM25, NON_OKAPI_TF, NON_TF_IDF, docs_per_query=1000):
    """Sample label-0 documents so each query reaches ``docs_per_query`` rows.

    For each of the 20 training queries, the documents present in all three
    score files are collected and ``docs_per_query - len_map[qnum]`` of them
    are drawn at random (without replacement).

    Args:
        len_map: Maps a query number to the number of documents already
            collected for it.
        NON_OKAPI_BM25: Path to the Okapi BM25 score file.
        NON_OKAPI_TF: Path to the Okapi TF score file.
        NON_TF_IDF: Path to the TF-IDF score file.
        docs_per_query: Target number of documents per query (default 1000).

    Returns:
        A dict mapping ``"<queryno>-<docno>"`` to ``(bm25, tf, tfidf, 0)``.

    Raises:
        KeyError: If a query number is missing from ``len_map``.
    """
    queryno_list = [59, 77, 85, 91, 56, 71, 64, 62, 93, 58, 54, 87, 100, 89, 61, 68, 63, 57, 60, 80]
    non_rel_docs = {}

    for qnum in queryno_list:
        bm25_list, bm25_dict = file_info(NON_OKAPI_BM25, qnum)
        tf_list, tf_dict = file_info(NON_OKAPI_TF, qnum)
        idf_list, idf_dict = file_info(NON_TF_IDF, qnum)

        # Only documents scored by all three models have a full feature tuple.
        candidates = intersection(intersection(bm25_list, tf_list), idf_list)

        # random.sample raises ValueError for a negative size or one larger
        # than the population, so clamp the request to what is available.
        wanted = docs_per_query - len_map[qnum]
        sample_size = max(0, min(wanted, len(candidates)))

        for id_pair in random.sample(candidates, sample_size):
            non_rel_docs[id_pair] = (bm25_dict[id_pair], tf_dict[id_pair], idf_dict[id_pair], 0)
    return non_rel_docs

# Get documents for test queries

In [4]:
# Score files, one per retrieval model, that are passed to test_docs.
TEST_OKAPI_BM25 = './test_results/test_okapi-bm25.txt'
TEST_OKAPI_TF = './test_results/test_okapi-tf.txt'
TEST_TF_IDF = './test_results/test_tf-idf.txt'

def test_docs(TEST_OKAPI_BM25, TEST_OKAPI_TF, TEST_TF_IDF, relevance=None):
    """Build labelled feature tuples for the five held-out test queries.

    Every document listed in the BM25 file of a test query gets a
    ``(bm25, tf, tfidf, label)`` tuple.  A score that is missing from one of
    the files defaults to 0; ``label`` is 1 when the document id occurs in
    ``relevance`` and 0 otherwise.

    Args:
        TEST_OKAPI_BM25: Path to the Okapi BM25 score file.
        TEST_OKAPI_TF: Path to the Okapi TF score file.
        TEST_TF_IDF: Path to the TF-IDF score file.
        relevance: Iterable of relevant document ids.  When omitted, the
            module-level ``relevance`` variable is used, which is what this
            function always read before the parameter existed.

    Returns:
        A dict mapping ``"<queryno>-<docno>"`` to its feature tuple.

    Raises:
        NameError: If ``relevance`` is omitted and no module-level
            ``relevance`` exists.

    Side effects:
        Prints the number of BM25 documents found for each test query.
    """
    if relevance is None:
        try:
            relevance = globals()['relevance']
        except KeyError:
            raise NameError("name 'relevance' is not defined") from None
    # Build the lookup once; testing membership in a list is O(n) per document.
    relevant_ids = set(relevance)

    queryno_list = [94, 95, 97, 98, 99]
    docs = {}
    for qnum in queryno_list:
        bm25_list, bm25_dict = file_info(TEST_OKAPI_BM25, qnum)
        _, tf_dict = file_info(TEST_OKAPI_TF, qnum)
        _, idf_dict = file_info(TEST_TF_IDF, qnum)
        print(len(bm25_list))
        for id_pair in bm25_list:
            # dict.get replaces the former bare ``except:`` blocks, which
            # would also have hidden unrelated errors.
            bm25_score = bm25_dict.get(id_pair, 0)
            tf_score = tf_dict.get(id_pair, 0)
            idf_score = idf_dict.get(id_pair, 0)
            # Strip the "<queryno>-" prefix instead of assuming that every
            # document id is exactly 13 characters long.
            docno = id_pair.split('-', 1)[1]
            label = 1 if docno in relevant_ids else 0
            docs[id_pair] = (bm25_score, tf_score, idf_score, label)
    return docs

## Get non-relevant documents according to the number of relevant documents

In [5]:
# Build every data set that feeds the feature matrix.
# `relevance` has to exist at module level before test_docs is called,
# because test_docs reads it as a global.
qrel_dict, relevance = retrieve_qrel(QREL_FILE)
len_map, rel_docs = relevant_docs(OKAPI_BM25, OKAPI_TF, TF_IDF, relevance)
non_rel_docs = non_relevant_docs(len_map, NON_OKAPI_BM25, NON_OKAPI_TF, NON_TF_IDF)
# NOTE(review): this rebinds the name `test_docs` from the function to its
# result, so the function cannot be called again until its defining cell is re-run.
test_docs = test_docs(TEST_OKAPI_BM25, TEST_OKAPI_TF, TEST_TF_IDF)

25
1000
1000
1000
1000
1000


# Generate feature matrix

In [6]:
import pandas as pd

# Merge the training documents.  A dict display is used instead of
# dict(**a, **b), which raises TypeError when both dicts share a key; on a
# clash the rel_docs entry (which carries the qrels-based label) wins.
# NOTE(review): a clash leaves fewer rows than expected for that query.
feature_map = {**non_rel_docs, **rel_docs}

columns = ['Queryno-Docno', 'Okapi-BM25', 'Okapi-TF', 'TF-IDF', 'Relevance']

# Training rows first, test-query rows last, each group sorted by its
# "<queryno>-<docno>" key.
train_rows = [[key, item[0], item[1], item[2], item[3]] for key, item in sorted(feature_map.items())]
test_rows = [[key, item[0], item[1], item[2], item[3]] for key, item in sorted(test_docs.items())]

# Build the frame in one step: DataFrame.append copied the whole frame on
# every call and was removed in pandas 2.0.
f_matrix = pd.DataFrame(train_rows + test_rows, columns=columns)

## Training set

In [7]:
# Show the first 20000 rows (the rows built from feature_map come first in f_matrix).
f_matrix.head(20000)

Unnamed: 0,Queryno-Docno,Okapi-BM25,Okapi-TF,TF-IDF,Relevance
0,100-AP890101-0053,4.145897,0.678843,1.386598,0
1,100-AP890102-0144,5.755935,0.781337,1.812924,0
2,100-AP890103-0003,4.733380,0.876695,1.542451,0
3,100-AP890103-0146,4.253067,0.821857,1.477725,0
4,100-AP890104-0062,5.577723,0.793778,1.797725,0
...,...,...,...,...,...
19995,93-AP891230-0046,2.931008,0.827404,1.028523,0
19996,93-AP891230-0067,5.689349,0.738578,1.772829,0
19997,93-AP891230-0074,6.156061,0.735453,1.993646,0
19998,93-AP891230-0090,4.714761,0.395055,1.624637,0


In [8]:
# Show the last 5000 rows (the rows built from test_docs come last in f_matrix).
f_matrix.tail(5000)

Unnamed: 0,Queryno-Docno,Okapi-BM25,Okapi-TF,TF-IDF,Relevance
20000,94-AP890101-0002,5.991360,0.703359,2.153543,0
20001,94-AP890102-0093,4.920011,0.000000,1.845500,0
20002,94-AP890102-0107,6.512217,0.000000,2.040016,0
20003,94-AP890103-0030,6.040243,0.898343,2.230469,0
20004,94-AP890103-0087,8.137603,0.000000,2.647158,0
...,...,...,...,...,...
24995,99-AP891229-0100,6.726268,0.933025,0.000000,0
24996,99-AP891229-0167,13.944339,1.424712,5.133301,0
24997,99-AP891229-0182,6.763340,0.000000,0.000000,0
24998,99-AP891230-0053,11.277287,1.079564,3.916483,0


## Testing set

# Linear Regression

In [9]:
from sklearn import model_selection
from sklearn import linear_model
from operator import itemgetter

# Columns 1-3 of the matrix (Okapi-BM25, Okapi-TF, TF-IDF) are the features.
X = f_matrix.values[:,1:4]
# Column 4 (Relevance) is the regression target.
Y = f_matrix.values[:,4]
# Displayed to check the matrix dimensions.
f_matrix.shape

(25000, 5)

### helper methods

In [10]:
def createDict(qdIDTest, predictions):
    """Group predictions by query id.

    The i-th prediction is paired with the first element of the i-th row of
    ``qdIDTest``, a ``"<qid>-<docid>"`` string that is split at its first
    hyphen.

    Args:
        qdIDTest: Indexable rows whose element 0 is the ``"<qid>-<docid>"`` key.
        predictions: Iterable of predicted scores, aligned with ``qdIDTest``.

    Returns:
        A dict mapping each query id (str) to a list of
        ``(docid, prediction)`` tuples in input order.
    """
    grouped = {}
    for row_index, score in enumerate(predictions):
        parts = qdIDTest[row_index][0].split('-', 1)
        query_id = parts[0]
        doc_id = parts[1]
        grouped.setdefault(query_id, []).append((doc_id, score))
    return grouped

def sortDict(testDict):
    """Order every query's ``(docid, score)`` list by descending score.

    The dict is updated in place (each value is replaced by a new sorted
    list) and the same dict object is returned.

    Args:
        testDict: Maps a query id to a list of ``(docid, score)`` tuples.

    Returns:
        ``testDict`` itself, with every list sorted by score, highest first.
    """
    by_score = itemgetter(1)
    for query_id, scored_docs in testDict.items():
        # Replacing the value of an existing key is safe while iterating.
        testDict[query_id] = sorted(scored_docs, key=by_score, reverse=True)
    return testDict

## 1. Test the model

In [16]:
# 5-fold cross validation: fit on four folds, rank the held-out fold and write
# one result file per fold.
kfold = model_selection.KFold(n_splits=5, random_state=None, shuffle=False) # k-fold cross validation
qdID = f_matrix.values
counter = 0
for train, test in kfold.split(X, Y):
    counter += 1
    qdIDTest = qdID[test]
    regr = linear_model.LinearRegression()
    regr.fit(X[train], Y[train]) #train
    predictions = regr.predict(X[test]) #predict
    testDict = sortDict(createDict(qdIDTest, predictions))
    # "w" rather than "a": re-running the cell must not append a second copy
    # of the ranking to an existing file.  The with-block closes the file.
    with open('./test_result_{}.txt'.format(counter), "w") as out:
        for qnum, ranked_docs in testDict.items():
            # Local names avoid rebinding the builtins `str` and `tuple`,
            # which would break later calls such as str(queryno) in file_info.
            for rank, (docno, score) in enumerate(ranked_docs, start=1):
                out.write('{} Q0 {} {} {} Exp\n'.format(qnum, docno, rank, score))

## 2. Test the model on training data

In [15]:
# 5-fold cross validation evaluated on the training folds: fit on four folds,
# rank those same four folds and write one result file per fold.
kfold = model_selection.KFold(n_splits=5, random_state=None, shuffle=False) #k-fold cross validation
qdID = f_matrix.values
counter = 0
for train, test in kfold.split(X, Y):
    # The counter was never advanced before, so every fold was written into
    # test_train_result_0.txt; number the files 1..5 like the test-run cell.
    counter += 1
    qdIDTest = qdID[train]
    regr = linear_model.LinearRegression()
    regr.fit(X[train], Y[train]) #train
    predictions = regr.predict(X[train]) #predict
    testDict = sortDict(createDict(qdIDTest, predictions))
    # "w" rather than "a": re-running the cell must not append a second copy
    # of the ranking to an existing file.  The with-block closes the file.
    with open('./test_train_result_{}.txt'.format(counter), "w") as out:
        for qnum, ranked_docs in testDict.items():
            # Local names avoid rebinding the builtins `str` and `tuple`,
            # which would break later calls such as str(queryno) in file_info.
            for rank, (docno, score) in enumerate(ranked_docs, start=1):
                out.write('{} Q0 {} {} {} Exp\n'.format(qnum, docno, rank, score))

In [None]:
# test performance
# [0.4484, 0.2116, 0.2302, 0.3248, 0.2819]
# Avg of avg precisions: About 0.299

# training performance
# [0.2563, 0.3806, 0.3724, 0.3493, 0.3573]
# Avg of avg precisions: About 0.343