In [1]:
## from week 6 lab
def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)

    Relevance is positive real values.  Can use binary
    as the previous methods.

    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]

    Returns:
        Discounted cumulative gain
    """
    import numpy as np
    r = np.asfarray(r)[:k]
    if r.size: ## why is this r.size? when will this be false?
        return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
    return 0.

In [2]:
## from week 6 lab
def ndcg_at_k(r, k, method=0):
    """Score is normalized discounted cumulative gain (ndcg)

    Relevance is positive real values.  Can use binary
    as the previous methods.

    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k(r, 4, method=1)
    0.96519546960144276
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]

    Returns:
        Normalized discounted cumulative gain
    """
    import numpy as np

    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    print('For k is {}, DCG scorce is {}'.format(k,dcg_at_k(r, k, method)))
    print('For k is {}, IDCG scorce is {}'.format(k,dcg_max))
    return dcg_at_k(r, k, method) / dcg_max

In [3]:
def assigningBM25ScoreToRelevantAndRetrieved(_bm25ScoreDf, relevantDocsList):
    """[summary]
    This function helps to assign zero values to those non-relevant and retrieved documents
    It retents the score of those relevant and retrieved

    Args:
        _bm25ScoreDf ([dataframe]): [a dataframe where rows are modules and columns is the bm25 scores]
        relevantAndRetrievedDocs ([list]): [list of modules based on the golden standard(idea outcome based on survey)]
    """
    df = _bm25ScoreDf.copy(deep = False)
    irrelevantAndRetrievedDocsList = list(set(df.index) - set(relevantDocsList))
    
    for relevantAndRetrievedDoc in irrelevantAndRetrievedDocsList:
        df.loc[relevantAndRetrievedDoc]['bm25Score'] = 0
    """[summary]
    output is a df with score that are retrieved and relevant(relevant depends on the gold standard)
    """
    return(df)

In [4]:
def NDCGWithVariousK(retrievedDocsDf,listOfRelevantDocs, exportResults = 0, queryNum = '', fileName = 'test'):
    """[summary]
    This function compute the NDGC at vaious K

    Args:
        retrievedDocsDf ([dataframe]): [dataframe of retrieved documents and it's bm25 score]
        listOfRelevantDocs ([list]): [list of relevant Documents based on gold standard]
        exportResults (int, optional): [to determine to export ndcg results]. Defaults to 0 and 1 to export ndgc score
        fileName (str, optional): [fileName to be exported ideally it should be the "ndcg_score_'model name']. Defaults to 'test'.
    """
    ## assign zero values to those non-relevant and retrieved documents, It retain the score of those relevant and retrieved
    BM25ScoreToRelevantAndRetrieved = assigningBM25ScoreToRelevantAndRetrieved(retrievedDocsDf,listOfRelevantDocs)
    ## obtain the score of the BM25 of the relevant and retrieved modules
    BM25ScoreToRelevantAndRetrievedScoreList = list(BM25ScoreToRelevantAndRetrieved.bm25Score)
    
    ## dict to save NDCGScore ie {k(ranking):NDCG Score}
    NDCGScoreDict = {}
    for i in range(1,len(BM25ScoreToRelevantAndRetrievedScoreList)):
        ndcg_at_kScore = ndcg_at_k(BM25ScoreToRelevantAndRetrievedScoreList,i)
        print('For k is {}, NDCG scorce is {}\n'.format(i,ndcg_at_kScore))
        NDCGScoreDict[i] = ndcg_at_kScore
    
    ## convert dict to df for easier sorting analysis of the scores and exporting it to csv
    import pandas as pd
    NDCGDf = pd.DataFrame.from_dict(NDCGScoreDict,orient='index',columns=['NDCGScore{}'.format(queryNum)])
    NDCGDf.reset_index(inplace = True)
    ## rename the column to k columns 
    NDCGDf.rename(columns={"index": "k"}, inplace = True)
    
    ## to export the ndcg scores to csv if exportResults == 1
    if exportResults == 1:
        fileName = 'ndcg_score_{}.csv'.format(fileName)
        NDCGDf.to_csv('../results/ndcg_score/{}'.format(fileName))
    return(NDCGDf)
    

# Toy Problem formulation

In [5]:
"Test Case : the retrievedDocScore"
## assume docs are not in bm25 scorce order
retrievedDocs = ['D','C', 'B','A'] 
retrievedDocsScore = [0.43, 0.26, 0.03, 0.37]
## I realised that the score should be in ascending order of bm25 score hence I made some changes to fit our use case
# retrievedDocsScore = [0.43,  0.37, 0.26, 0.03]

## creating a retrievedDocsDf for test cases
## this should be the same format of the bm25 output
retrievedDocsDict = {}
for index in range(len(retrievedDocs)):
    retrievedDocsDict[retrievedDocs[index]] = retrievedDocsScore[index]
import pandas as pd
retrievedDocsDf1 = pd.DataFrame.from_dict(retrievedDocsDict,orient='index',columns = ['bm25Score'])

print('BM25 output:')
retrievedDocsDf1

BM25 output:


Unnamed: 0,bm25Score
D,0.43
C,0.26
B,0.03
A,0.37


In [6]:
"Test Case : the retrievedDocScore"
## assume docs are not in bm25 scorce order
retrievedDocs = ['C','D', 'B','A'] 
retrievedDocsScore = [0.5, 0.3, 0.2, 0.1]
## I realised that the score should be in ascending order of bm25 score hence I made some changes to fit our use case
# retrievedDocsScore = [0.43,  0.37, 0.26, 0.03]

## creating a retrievedDocsDf for test cases
## this should be the same format of the bm25 output
retrievedDocsDict = {}
for index in range(len(retrievedDocs)):
    retrievedDocsDict[retrievedDocs[index]] = retrievedDocsScore[index]
import pandas as pd
retrievedDocsDf2 = pd.DataFrame.from_dict(retrievedDocsDict,orient='index',columns = ['bm25Score'])

print('BM25 output:')
retrievedDocsDf2

BM25 output:


Unnamed: 0,bm25Score
C,0.5
D,0.3
B,0.2
A,0.1


In [7]:
"Test Case : The Relevant Docs"
relevantDocs1 = ['B','D','E']
print('List of relevant Docs: {}'.format(relevantDocs1))
relevantDocs2 = ['A','C']
print('List of relevant Docs: {}'.format(relevantDocs2))

List of relevant Docs: ['B', 'D', 'E']
List of relevant Docs: ['A', 'C']


In [8]:
retrievedlist = [retrievedDocsDf1,retrievedDocsDf2]
relevantlist =[relevantDocs1,relevantDocs2]

In [9]:
if __name__ == "__main__":
    ## test case
    import pandas as pd
    ## this index is meant to keep track of the NDCG score of each query
    queryIndex = 0
    for retrieved in retrievedlist:
    ## to compute the NDCG of a single query
        NDCGWithVariousKdf = NDCGWithVariousK(retrieved,relevantlist[queryIndex],0,queryIndex)
    ## if this is 1st NDCG score been compute, make it's df to NDCG df else merge with the current overall NDGC df
        if queryIndex == 0:
            NDCGDf = NDCGWithVariousKdf
        else:
            NDCGDf = pd.merge(NDCGDf, NDCGWithVariousKdf, on=["k"])
        queryIndex += 1

For k is 1, DCG scorce is 0.43
For k is 1, IDCG scorce is 0.43
For k is 1, NDCG scorce is 1.0

For k is 2, DCG scorce is 0.43
For k is 2, IDCG scorce is 0.45999999999999996
For k is 2, NDCG scorce is 0.9347826086956522

For k is 3, DCG scorce is 0.4489278926071437
For k is 3, IDCG scorce is 0.45999999999999996
For k is 3, NDCG scorce is 0.9759302013198777

For k is 1, DCG scorce is 0.5
For k is 1, IDCG scorce is 0.5
For k is 1, NDCG scorce is 1.0

For k is 2, DCG scorce is 0.5
For k is 2, IDCG scorce is 0.6
For k is 2, NDCG scorce is 0.8333333333333334

For k is 3, DCG scorce is 0.5
For k is 3, IDCG scorce is 0.6
For k is 3, NDCG scorce is 0.8333333333333334



In [10]:
import pandas as pd 
import numpy as np
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import remove_stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import operator
import time

# import logging
# logging.basicConfig(level=logging.INFO, format='%(message)s')
# logger = logging.getLogger()
# logger.addHandler(logging.FileHandler("logs/MAP.log", 'a'))
# print = logger.info


from utils.association_matrix import  get_top_k_associated_words, get_associated_words
from utils.query_processing import get_wordnet_pos, process_query, expand_query
from basic_bm25 import bm25_basic, get_result
from bm25_with_pseudo_relevance import bm25_pseudo_relevance_back
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is alr

In [11]:
## function to compute the NDCG for cosine simliarities
def get_NDCG_cosine(query_val,tf):
    import CosineSimilarity
    ## compute Cosine simliarities score
    cosineSimDf = CosineSimilarity.rankedModuleOfCosineSim(query_val,tf)
    cosineSimDf = cosineSimDf.T
    return(cosineSimDf)

In [12]:
def clean_elective_names(relevant_results):
    # clean up the relevant course names 

    #https://stackoverflow.com/questions/2582138/finding-and-replacing-elements-in-a-list
    try:
        relevant_results = relevant_results.split(',')
        relevant_results = [x.replace("'",'') for x in relevant_results]
        relevant_results = [x.replace("[",'') for x in relevant_results]
        relevant_results = [x.replace("]",'') for x in relevant_results]
        relevant_results = [x.replace(" ",'',1) for x in relevant_results if x != relevant_results[0]]
    except:
        pass
    replacements = {
        ' 50.035 Computer Vision': '50.035 Computer Vision'
        ,'50.043 Database Systems / Database and Big Data Systems (for class 2021)': '50.043 Database Systems'
        }

    relevant_results = [replacements.get(x, x) for x in relevant_results]
    
    if '40.302 Advanced Optim/ 40.305 Advanced Stochastic' in relevant_results:
        relevant_results.remove('40.302 Advanced Optim/ 40.305 Advanced Stochastic')
        relevant_results.append('40.302 Advanced Topics in Optimisation#')
        relevant_results.append('40.305 Advanced Topics in Stochastic Modelling#')
    return relevant_results

In [13]:
## for Cosine Similarity (query expansion, course information)
import pandas as pd
tf = pd.read_csv('../data/course_info_scores/course_info_tf.csv', index_col = 0)
query_val= pd.read_csv('../data/survey/vaildation_sample_query.csv',index_col = 0)
pipeline2 = get_NDCG_cosine(query_val,tf)


Current computing Query: network, term, model, technology, probability
Number of terms in corpus: 8

Current computing Query: term, different, skill, mongodb, long


  dist = 1.0 - uv / np.sqrt(uu * vv)


In [None]:
import pandas as pd
queryCount = 0
NDCGDf = 0
for query,row in pipeline2.iterrows():
    ## create the df for retrieved docs and it's score
        retrievedDocsDict = {}
        cleanedElectives = clean_elective_names(row['topModules'])
        for index in range(len(row['topModules'])):
            retrievedDocsDict[cleanedElectives[index]] = row['topModulesScore'][index]
        import pandas as pd
        retrievedDocsDf = pd.DataFrame.from_dict(retrievedDocsDict,orient='index',columns = ['bm25Score'])
    
    ## cleaned golden/vaildation set modules
        print(retrievedDocsDf[list(retrievedDocsDf)[0]])
        print(validModules)
        validModules = clean_elective_names(query_val['expectedElectivesInOrder'][queryCount])
        print(validModules)
    ## to compute the NDCG of a single query
        NDCGWithVariousKdf = NDCGWithVariousK(retrievedDocsDf,validModules,0,queryCount)
    ## if this is 1st NDCG score been compute, make it's df to NDCG df else merge with the current overall NDGC df
        if queryCount == 0:
            NDCGDf = NDCGWithVariousKdf
        else:
            NDCGDf = pd.merge(NDCGDf, NDCGWithVariousKdf, on=["k"])
        queryCount += 1

50.012 Networks                                                  0.33558
01.104 Networked Life                                            0.27537
40.319 Statistical and Machine Learning                          0.26261
01.117 Brain-Inspired Computing and its Applications (Term 8)    0.25574
40.305 Advanced Topics in Stochastic Modelling#                  0.25503
50.020 Network Security                                          0.22059
40.232 Water Resources Management                                0.21678
50.035 Computer Vision                                           0.20015
01.107 Urban Transportation                                      0.18570
50.039 Theory and Practice of Deep Learning                      0.17916
Name: bm25Score, dtype: float64
['40.242 Derivative Pricing and Risk Management', '50.043 Database Systems', '50.038 Computational Data Science', '50.037 Blockchain Technology', '50.035 Computer Vision', '40.319 Statistical and Machine Learning', '40.324 Fundamentals of

In [None]:
NDCGDf

Unnamed: 0,k,NDCGScore0,NDCGScore1,NDCGScore2,NDCGScore3,NDCGScore4,NDCGScore5,NDCGScore6,NDCGScore7,NDCGScore8,...,NDCGScore20,NDCGScore21,NDCGScore22,NDCGScore23,NDCGScore24,NDCGScore25,NDCGScore26,NDCGScore27,NDCGScore28,NDCGScore29
0,1,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0
1,2,0.0,0.0,0.673675,0.0,0.0,0.624825,0.554345,1.0,0.0,...,0.626879,0.667943,1.0,1.0,0.73,0.773922,0.834189,0.584266,0.751248,0.629159
2,3,0.0,0.0,0.772366,0.0,0.0,0.624825,0.699825,0.839407,0.0,...,0.707246,0.556132,0.823666,1.0,0.653465,0.707101,0.834189,0.490789,0.842669,0.522107
3,4,0.0,0.0,0.772366,0.0,0.0,0.624825,0.828532,0.864141,0.0,...,0.84974,0.627582,0.726308,0.961608,0.73759,0.707101,0.834189,0.665399,0.842669,0.460243
4,5,0.193298,0.0,0.772366,0.261236,0.0,0.624825,0.828532,0.864141,0.0,...,0.84974,0.730857,0.663122,0.938689,0.714181,0.79606,0.834189,0.77461,0.842669,0.460243
5,6,0.193298,0.0,0.772366,0.261236,0.197449,0.624825,0.828532,0.946206,0.0,...,0.84974,0.805274,0.663122,0.938689,0.714181,0.79606,0.834189,0.77461,0.886906,0.565188
6,7,0.193298,0.0,0.772366,0.261236,0.197449,0.624825,0.828532,0.946206,0.192098,...,0.84974,0.805274,0.743272,0.938689,0.768776,0.79606,0.834189,0.77461,0.886906,0.565188
7,8,0.310712,0.0,0.836755,0.392379,0.36065,0.624825,0.828532,0.946206,0.192098,...,0.84974,0.805274,0.815217,0.963674,0.768776,0.841676,0.834189,0.77461,0.886906,0.565188
8,9,0.310712,0.315465,0.836755,0.392379,0.36065,0.624825,0.828532,0.946206,0.337437,...,0.84974,0.805274,0.815217,0.963674,0.797748,0.841676,0.834189,0.77461,0.886906,0.640183
