In [17]:
def dcg_at_k(r, k, method=0):
    """Score is discounted cumulative gain (dcg)

    Relevance is positive real values.  Can use binary
    as the previous methods.

    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> dcg_at_k(r, 1)
    3.0
    >>> dcg_at_k(r, 1, method=1)
    3.0
    >>> dcg_at_k(r, 2)
    5.0
    >>> dcg_at_k(r, 2, method=1)
    4.2618595071429155
    >>> dcg_at_k(r, 10)
    9.6051177391888114
    >>> dcg_at_k(r, 11)
    9.6051177391888114

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: If 0 then weights are [1.0, 1.0, 0.6309, 0.5, 0.4307, ...]
                If 1 then weights are [1.0, 0.6309, 0.5, 0.4307, ...]

    Returns:
        Discounted cumulative gain (0. when no scores fall within the top k)

    Raises:
        ValueError: If method is neither 0 nor 1.
    """
    import numpy as np

    # Reject unknown weighting schemes up front instead of silently
    # falling back to method 0.
    if method not in (0, 1):
        raise ValueError('method must be 0 or 1.')

    # np.asfarray was removed in NumPy 2.0; asarray with a float dtype
    # performs the same conversion on every NumPy version.
    r = np.asarray(r, dtype=float)[:k]
    if not r.size:
        return 0.

    if method == 0:
        # Ranks 1 and 2 both carry weight 1.0; rank i >= 2 is discounted by log2(i).
        return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
    # method == 1: rank i is discounted by log2(i + 1), so only rank 1 has weight 1.0.
    return np.sum(r / np.log2(np.arange(2, r.size + 2)))

In [18]:
## from week 6 lab
def ndcg_at_k(r, k, method=0):
    """Normalized discounted cumulative gain (ndcg) of a ranked list.

    The DCG of ``r`` over its first ``k`` entries is divided by the DCG
    of the same scores sorted from highest to lowest (the ideal
    ordering). Both values are computed by ``dcg_at_k``.

    Example from
    http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
    >>> r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
    >>> ndcg_at_k(r, 1)
    1.0
    >>> r = [2, 1, 2, 0]
    >>> ndcg_at_k(r, 4)
    0.9203032077642922
    >>> ndcg_at_k([0], 1)
    0.0
    >>> ndcg_at_k([1], 2)
    1.0

    Args:
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        method: Weighting scheme selector, forwarded unchanged to
            ``dcg_at_k`` for both the actual and the ideal ordering

    Returns:
        Normalized discounted cumulative gain, or 0. when the ideal
        DCG is zero
    """
    import numpy as np

    # DCG of the best possible ordering of the same scores.
    ideal_dcg = dcg_at_k(sorted(r, reverse=True), k, method)
    # A zero ideal DCG would make the ratio undefined, so report 0. instead.
    return dcg_at_k(r, k, method) / ideal_dcg if ideal_dcg else 0.

In [19]:
def assigningBM25ScoreToRelevantAndRetrieved(_bm25ScoreDf, relevantAndRetrievedDocs):
    """Return a copy of the score frame with non-relevant documents zeroed.

    Every row whose index label is not listed in
    ``relevantAndRetrievedDocs`` gets its ``bm25Score`` set to 0; rows
    whose label is listed keep their score. Row order is unchanged and
    the input frame is not modified.

    Args:
        _bm25ScoreDf: DataFrame indexed by document label with a
            ``bm25Score`` column.
        relevantAndRetrievedDocs: Iterable of document labels whose
            scores should be kept.

    Returns:
        A new DataFrame with the same index and columns as the input.
    """
    # A real (deep) copy, so the write below can never leak into the caller's frame.
    df = _bm25ScoreDf.copy()
    isIrrelevant = ~df.index.isin(list(relevantAndRetrievedDocs))
    # One .loc[row_mask, column] assignment instead of chained indexing
    # (df.loc[label]['bm25Score'] = 0), which writes to an intermediate
    # object and is a silent no-op under pandas Copy-on-Write.
    df.loc[isIrrelevant, 'bm25Score'] = 0
    return df

In [20]:
## Base setup
## Assume the retrieved docs are listed in rank order.
import pandas as pd

retrievedDocs = ['D', 'C', 'B', 'A']
retrievedDocsScore = [0.43, 0.26, 0.03, 0.37]

relevantAndRetrievedDocs = ['B', 'D']

# Pair each retrieved doc with the score at the same position.
retrievedDocsDict = dict(zip(retrievedDocs, retrievedDocsScore))

# One row per doc (the doc label is the index) and a single bm25Score column.
retrievedDocsDf = pd.DataFrame.from_dict(
    retrievedDocsDict,
    orient='index',
    columns=['bm25Score'],
)


In [21]:
# Zero the scores of the retrieved docs that are not in the relevant set.
BM25ScoreToRelevantAndRetrieved = assigningBM25ScoreToRelevantAndRetrieved(
    _bm25ScoreDf=retrievedDocsDf,
    relevantAndRetrievedDocs=relevantAndRetrievedDocs,
)
BM25ScoreToRelevantAndRetrieved

Unnamed: 0,bm25Score
D,0.43
C,0.0
B,0.03
A,0.0


In [22]:
# Flatten the bm25Score column into a plain list, keeping the frame's row order.
BM25ScoreToRelevantAndRetrievedScoreList = BM25ScoreToRelevantAndRetrieved['bm25Score'].tolist()
BM25ScoreToRelevantAndRetrievedScoreList

[0.43, 0.0, 0.03, 0.0]

In [33]:
"""[summary]
There is still a difference between the lab NDCG and the manually calculated NDCG.
"""
# NDCG over the top 4 positions of the zeroed score list.
ndcg_at_kScore = ndcg_at_k(r=BM25ScoreToRelevantAndRetrievedScoreList, k=4)
ndcg_at_kScore


0.9759302013198777

In [4]:
import pandas as pd
# Load both CSVs with their first column as the row index.
# NOTE(review): later cells test words against these indexes, so the first
# column presumably holds terms - confirm against the files.
scrapped = pd.read_csv(
    '../data/bm25/bm25_no_survey/merged_courses_tf.csv',
    index_col=0,
)
survey = pd.read_csv(
    '../data/survey_processing/surveyToBeTokenized.csv',
    index_col=0,
)

In [10]:
# Survey index entries that also appear in the scrapped index, in survey order.
listOfWordInScrappedNSurvey = [term for term in survey.index if term in scrapped.index]
wordCounter = len(listOfWordInScrappedNSurvey)
wordCounter

154

In [25]:
# Each sample query is written as comma-separated text and split into a list of terms.
SampleQuery1 = 'computational, analysis, solidity, mongodb, evaluation'.split(", ")
SampleQuery2 = 'analysis, ampl, cast, sklearn, ui'.split(", ")
SampleQuery3 = 'real, soup, social, computational, tensorflow'.split(", ")
SampleQuery4 = 'cleaning, business, real, soup, concept'.split(", ")
SampleQuery5 = 'wireframing, inventory, long, system, technology'.split(", ")

listOfSampleQuery = [
    SampleQuery1,
    SampleQuery2,
    SampleQuery3,
    SampleQuery4,
    SampleQuery5,
]

In [28]:
# For each sample query, report which of its terms appear in the scrapped index.
for sampleQuery in listOfSampleQuery:
    listOfWordInScrappedNSurvey = [term for term in sampleQuery if term in scrapped.index]
    wordCounter = len(listOfWordInScrappedNSurvey)
    print('Common Words between scrapped and SampleQuery{}: {}'.format(sampleQuery, wordCounter))
    print('Common terms : {}\n'.format(listOfWordInScrappedNSurvey))

Common Words between scrapped and SampleQuery['computational', 'analysis', 'solidity', 'mongodb', 'evaluation']: 3
Common terms : ['computational', 'analysis', 'evaluation']

Common Words between scrapped and SampleQuery['analysis', 'ampl', 'cast', 'sklearn', 'ui']: 3
Common terms : ['analysis', 'cast', 'sklearn']

Common Words between scrapped and SampleQuery['real', 'soup', 'social', 'computational', 'tensorflow']: 4
Common terms : ['real', 'social', 'computational', 'tensorflow']

Common Words between scrapped and SampleQuery['cleaning', 'business', 'real', 'soup', 'concept']: 3
Common terms : ['business', 'real', 'concept']

Common Words between scrapped and SampleQuery['wireframing', 'inventory', 'long', 'system', 'technology']: 4
Common terms : ['inventory', 'long', 'system', 'technology']

