In [11]:
import pandas as pd 
import ast
import numpy as np
import string
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import remove_stopwords
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import operator
import math
from tqdm import tqdm
import time
import itertools
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package punkt to /Users/jeroe/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/jeroe/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jeroe/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [12]:
tf = pd.read_csv('../../../data/bm25/bm25_no_survey/merged_courses_tf.csv', header=0, index_col=0)
tf_norm = pd.read_csv('../../../data/bm25/bm25_no_survey/merged_courses_tf_norm.csv', header=0, index_col=0)
idf = pd.read_csv('../../../data/bm25/bm25_no_survey/merged_courses_idf.csv', header=0, index_col=0)
df = pd.read_csv('../../../data/bm25/bm25_no_survey/merged_courses_df.csv', header=0, index_col=0)
glove_kv = '../../../pretrained_corpus/glove_6B_300d.kv'   # pretrained vectors for query expansion
added_data = pd.read_csv('../../../data/bm25/bm25_relevance_feedback/TrainingSampleQuery.csv', header = 0 , index_col = 0)
association_matrix = pd.read_csv('../../../data/association_matrix/association_matrix_trg.csv', header = 0, index_col = 0)
norm_association_matrix = pd.read_csv('../../../data/association_matrix/norm_association_matrix_trg.csv', header = 0, index_col = 0)

In [14]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

def process_query(query):
    query = query.translate(str.maketrans('', '', string.punctuation))
    query = str(np.char.lower(query))
    query = ''.join([i for i in query if not i.isdigit()])
    query = remove_stopwords(query)
    query = [lemmatizer.lemmatize(w, get_wordnet_pos(w))
             for w in nltk.word_tokenize(query)]
    query = ' '.join([str(elem) for elem in query])
    query = query.translate(str.maketrans('', '', string.punctuation))
    return query.split()

def expand_query(query,glove_kv,topn):
    model = KeyedVectors.load(glove_kv)
    expanded_query = []
    for word in query:
        expanded_query.append(word)
        if word in model:
            for close_word in model.most_similar(word, topn = topn):
                expanded_query.append(close_word[0])
    return expanded_query


In [15]:
def get_associated_words(query, matrix):
    '''
    for each word in original query, add on a word which is best associated with it via the corpus
    '''
    extended_query = query.copy()
    for word in query:
        if word in matrix.index:
            new_word = matrix[word].astype(float).idxmax()
            extended_query.append(new_word)
    return extended_query

def get_top_k_associated_words(relevant_courses, tf, matrix, k):
    '''
    get top k associated words and reformulate back into the query
    '''
    total_new_words = []
    for course in relevant_courses:
        words = tf.index[tf[course]>0].tolist()
        # filtered_matrix = matrix[matrix.index.isin(words)]
        associated_words = {}
        for word in words:
            associated_words[matrix[word].astype(float).idxmax()] = matrix[word].astype(float).max()
        new_words = sorted(associated_words, key=associated_words.get, reverse=True )[:k]
        total_new_words += new_words
    return total_new_words

def create_association_matrix(tf):
    '''
    takes in tf dataframe to get association matrix df
    '''
    unique_words = tf.index.values.tolist()
    ls = list(itertools.combinations(tf.index, 2))
    for idx in tf.index:
        ls.append((idx,idx))
    associations = {}
    for item in ls:
        k = item[0]+item[1]
        associations[k] = sum(tf.loc[item[0]]*tf.loc[item[1]])
    association_matrix = pd.DataFrame(0,index=unique_words,columns=unique_words)
    for i in tqdm(unique_words):
        for j in unique_words:
            # association_matrix.loc[i,j] = sum(tf.loc[i]*tf.loc[j])
            if i+j in associations:
                association_matrix.loc[i,j] = associations[i+j]
            elif j+i in associations:
                association_matrix.loc[i,j] = associations[j+i]
    return association_matrix,unique_words

def create_norm_association_matrix(association_matrix, unique_words):
    norm_association_matrix = pd.DataFrame(index=unique_words,columns=unique_words)
    for i in tqdm(unique_words):
        for j in unique_words:
            if i!=j:
                norm_association_matrix.loc[i,j] = association_matrix.loc[i,j]/(association_matrix.loc[i,i]+association_matrix.loc[j,j]-association_matrix.loc[i,j])
    return norm_association_matrix

In [16]:
def update_scores(tf, relevant_courses, associated_words):
    courses = tf.columns.tolist()
    for course in relevant_courses:
        for word in associated_words:
            if word in tf.index.tolist():
                tf[course][word] += 1
            else:
                idx = courses.index(course)
                temp_lst = [1 if i == idx else 0 for i in range(len(courses))]
                temp_df = pd.DataFrame([temp_lst], columns=courses, index=[word])
                tf = tf.append(temp_df)
    tf.sort_index()

    tf_norm = tf.apply(lambda x: x/x.max(), axis=0) # normalise each column by dividing the max frequency
    tf_norm = tf_norm.replace(np.nan, 0)

    df_arr = np.count_nonzero(tf, axis=1) # compute df: count the nunber of non zero columns in each role 
    df = pd.DataFrame(data={'df':df_arr}, index = tf.index.tolist())

    idf = df.copy()
    idf['df'] = idf['df'].apply(lambda freq: math.log10((len(tf.columns.tolist())) / (freq))) # calc idf
    idf = idf.rename({'df': 'idf'}, axis=1, inplace=False)
    idf = idf.replace(np.nan, 0)
    
    return tf, tf_norm, df, idf

In [17]:
# bm-25 processing 
def bm25_basic(query, doc, tf, tf_norm, idf, vocab, avg_doc_len, k1=1.5, b=0.75):
    ''''
    returns the score for a document given a query based on the basic bm25 algorithm
    '''
    score = 0.0
    for term in query:
        if term not in vocab:
            continue
        numerator = idf['idf'][term] * tf_norm[doc][term] * (k1 + 1)
        doc_len = tf[doc].to_numpy().sum()
        denominator = tf_norm[doc][term] + k1 * \
            (1 - b + b * doc_len / avg_doc_len)
        score += (numerator / denominator)
    return score

def bm25_reformulated(query, doc, relevant_courses, tf, df, vocab, tf_norm, avg_doc_len, k1=1.5, b=0.75, k3=1.5):
    score = 0.0
    vr = len(relevant_courses)  # total retrieved relevant docs
    for term in query:
        if term not in vocab:
            continue
        vr_t = 0  # total retrieved relevant docs where term t appears
        for course in relevant_courses:
            words = tf.index[tf[course]>0].tolist()
            if term in words:
                vr_t+=1
        vnr_t = vr-vr_t  # total retrieved relevant docs where term t does not appear
        df_t = float(df.loc[term]) # document frequency with given term 
        doc_len = tf[doc].to_numpy().sum()
        tf_d = float(tf_norm[doc][term]) + 0.0001 # term frequency in document (include 0.0001 smoothing if not code will break lol)
        tf_q = query.count(term)/len(query) # term frequency in reformulated query normalised
        N = len(tf.columns) # total number of documents
        part_a = ((abs(vr_t)+0.5)/(abs(vnr_t)+0.5))/((df_t-abs(vr_t)+0.5)/(N-df_t-abs(vr)+abs(vr_t)+0.5))
        part_b = ((k1+1)*tf_d)/((k1*((1-b)+b*(doc_len/avg_doc_len)))+tf_d)
        part_c = ((k3+1)*tf_q)/(k3+tf_q)

        score += math.log10(part_a*part_b*part_c)
    return score

def bm25_prediction(query, tf, tf_norm, df, idf, vocab, avg_doc_len, reformulated, relevant_courses=[]):
    '''
    ranks the documents based on the scores of all the documents
    reformulated: if true run bm25_reformulated algorithm, else run bm25_basic algorithm.
    '''
    courses = tf.columns.tolist()
    result = {}
    if reformulated is False:
        print(f'running basic bm25')
        for course in courses:
            result[course] = bm25_basic(query, course, tf, tf_norm, idf, vocab, avg_doc_len)
    elif reformulated is True:
        print(f'running bm25 for reformulated query')
        for course in courses:
            result[course] = bm25_reformulated(query, course, relevant_courses, tf, df, vocab, tf_norm, avg_doc_len)      
    sorted_result = dict(
        sorted(result.items(), key=operator.itemgetter(1), reverse=True))
    ls = []
    for k, v in sorted_result.items():
        ls.append(k)
        # print(f"{k}: {v}")
    return result, ls



In [18]:
# cleaning of survey results used for training

for idx_i, courses in enumerate(added_data['expectedElectivesInOrder']):
    added_data['expectedElectivesInOrder'][idx_i] = ast.literal_eval(courses)
    for idx_j,course in enumerate(added_data['expectedElectivesInOrder'][idx_i]):
        if course == " 50.035 Computer Vision":
            added_data['expectedElectivesInOrder'][idx_i][idx_j] = '50.035 Computer Vision'
        elif course == "50.043 Database Systems / Database and Big Data Systems (for class 2021)":
            added_data['expectedElectivesInOrder'][idx_i][idx_j] = '50.043 Database Systems'
        elif course == "40.302 Advanced Optim/ 40.305 Advanced Stochastic":
            added_data['expectedElectivesInOrder'][idx_i].remove("40.302 Advanced Optim/ 40.305 Advanced Stochastic")
            added_data['expectedElectivesInOrder'][idx_i].insert(idx_j,'40.302 Advanced Topics in Optimisation#')
            # added_data['expectedElectivesInOrder'][idx_i].insert(idx_j+1, '40.305 Advanced Topics in Stochastic Modelling#')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [19]:
added_data.head(3)

Unnamed: 0,querySample,expectedElectivesInOrder,expectedElectivesInOrderSumWordCount,queryType
0,"computational, analysis, solidity, mongodb, ev...","[50.038 Computational Data Science, 50.033 Fou...","[2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, ...",1
1,"analysis, ampl, cast, sklearn, ui",[40.321 Airport Systems Modelling and Simulati...,"[6.0, 3.0, 2.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, ...",1
2,"real, soup, social, computational, tensorflow","[50.035 Computer Vision, 50.038 Computational ...","[4.0, 3.0, 2.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1


In [20]:
# entire pipeline for training phase
top_retrieved = 10
start_time = time.time()

for idx,row in added_data.iterrows(): # accessing all sample queries from training set
    print(f'Training iteration: {idx+1}')
    vocab = tf.index.tolist()  # unique words
    total_length = tf.to_numpy().sum()
    avg_doc_len = total_length / len(tf.columns) # average document length across all courses
    query = row['querySample']  # take in query from training sample
    print(f'original query from training sample: {query}')
    query = process_query(query) # lemitize query 
    query = get_associated_words(query, norm_association_matrix) # included associated terms
    print(f'query expansion with correlation matrix: {query}')
    query = expand_query(query,glove_kv,topn=3)  # expand query by including words from pretrianed w2v corpus
    print(f'query expansion with pretrained corpus: {query}')
    result, ls = bm25_prediction(query=query, df=df, tf=tf, tf_norm=tf_norm, idf=idf, vocab=vocab, avg_doc_len=avg_doc_len, reformulated=False)  # intial bm25
    predicted = ls[:top_retrieved] # retrieve top 10 courses from predictions
    print(f'initial predicted courses: ')
    for course in predicted: 
        print(course)
    print('')
    gold_standard = added_data['expectedElectivesInOrder'][idx]
    relevant_courses = []
    for i in range(1,top_retrieved+1):
        if gold_standard[i-1] == predicted[i-1]:
            relevant_courses.append(predicted[i-1])
            print(f'Relevant & Retrieved: Rank {i} {predicted[i-1]}') # Documents which are relevant are retrieved, comparing with gold standard training sample
    associated_words = get_top_k_associated_words(relevant_courses, tf, norm_association_matrix, k=3)  # get top 3 associated words from each relevant and retrieved course 
    query+=associated_words  # add associated words to orginal query to form the eformulated query
    print('')
    print(f'reformulated query:{query}')
    result, ls = bm25_prediction(query=query, df=df, tf=tf, tf_norm=tf_norm, idf=idf, vocab=vocab, avg_doc_len=avg_doc_len, relevant_courses=relevant_courses, reformulated=True) # bm25 reformulated
    predicted_reformulated = ls[:top_retrieved] # retrieve top 10 courses from predictions
    print(f'prediction after query reformulation')
    for course in predicted_reformulated:
        print(course)
    
    tf, tf_norm, df, idf = update_scores(tf=tf, relevant_courses=relevant_courses, associated_words=associated_words)  # update tf, tf_norm, idf, df
    association_matrix,unique_words = create_association_matrix(tf=tf)
    norm_association_matrix = create_norm_association_matrix(association_matrix = association_matrix,unique_words = unique_words) # update correlation matrix
    print('updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...')
    print(f'time elapsed: {(time.time()-start_time)//60}min {(time.time()-start_time)%60}s')
    print('')



print(f'time elapsed: {(time.time()-start_time)//60}min {(time.time()-start_time)%60}s') 
# saving trained scores 
tf.to_csv('../../../data/bm25/bm25_relevance_feedback/merged_courses_tf_trained.csv')
tf_norm.to_csv('../../../data/bm25/bm25_relevance_feedback/merged_courses_tf_norm_trained.csv')
df.to_csv('../../../data/bm25/bm25_relevance_feedback/merged_courses_df_trained.csv')
idf.to_csv('../../../data/bm25/bm25_relevance_feedback/merged_courses_idf_trained.csv')
association_matrix.to_csv('../../../data/bm25/bm25_relevance_feedback/association_matrix_trained.csv')
norm_association_matrix.to_csv('../../../data/bm25/bm25_relevance_feedback/association_matrix_norm_trained.csv')


Training iteration: 1
original query from training sample: computational, analysis, solidity, mongodb, evaluation
query expansion with correlation matrix: ['computational', 'analysis', 'solidity', 'mongodb', 'evaluation', 'fabrication', 'framework', 'set']
query expansion with pretrained corpus: ['computational', 'computation', 'mathematical', 'algorithms', 'analysis', 'analyses', 'study', 'data', 'solidity', 'soundness', 'robustness', 'predictability', 'mongodb', 'nosql', 'scriptable', 'rhizaria', 'evaluation', 'assessment', 'evaluations', 'evaluate', 'fabrication', 'wafer', 'fabricated', 'fabrications', 'framework', 'implementation', 'frameworks', 'principles', 'set', 'setting', 'sets', 'up']
running basic bm25
initial predicted courses: 
50.048 Computational Fabrication
40.323 Equity Valuation
40.230 Sustainable Engineering
50.038 Computational Data Science
01.117 Brain-Inspired Computing and its Applications (Term 8)
50.006 User Interface Design and Implementation
40.324 Fundamenta

100%|██████████| 1396/1396 [02:34<00:00,  9.02it/s]
100%|██████████| 1396/1396 [03:40<00:00,  6.34it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 11.0min 24.224261045455933s

Training iteration: 2
original query from training sample: analysis, ampl, cast, sklearn, ui
query expansion with correlation matrix: ['analysis', 'ampl', 'cast', 'sklearn', 'ui', 'framework', 'airtop', 'automatic']
query expansion with pretrained corpus: ['analysis', 'analyses', 'study', 'data', 'ampl', 'mangxamba', 'ufdots', 'uninitialized', 'cast', 'casting', 'casts', 'actors', 'sklearn', 'ui', 'fiachrach', 'uí', 'pak', 'framework', 'implementation', 'frameworks', 'principles', 'airtop', 'automatic', 'machine', 'automatically', 'automated']
running basic bm25
initial predicted courses: 
40.323 Equity Valuation
50.038 Computational Data Science
01.116 AI for Healthcare (Term 7)
40.321 Airport Systems Modelling and Simulation
50.007 Machine Learning
50.043 Database Systems
40.319 Statistical and Machine Learning
50.045 Information Retrieval
40.316 Game Theory
0

100%|██████████| 1396/1396 [02:27<00:00,  9.45it/s]
100%|██████████| 1396/1396 [03:33<00:00,  6.54it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 22.0min 19.784332752227783s

Training iteration: 3
original query from training sample: real, soup, social, computational, tensorflow
query expansion with correlation matrix: ['real', 'soup', 'social', 'computational', 'tensorflow', 'world', 'fairness', 'fabrication', 'localization']
query expansion with pretrained corpus: ['real', 'true', 'just', 'what', 'soup', 'soups', 'stew', 'noodle', 'social', 'welfare', 'education', 'political', 'computational', 'computation', 'mathematical', 'algorithms', 'tensorflow', 'world', 'ever', 'time', 'global', 'fairness', 'honesty', 'impartiality', 'objectivity', 'fabrication', 'wafer', 'fabricated', 'fabrications', 'localization', 'localisation', 'subcellular', 'internationalization']
running basic bm25
initial predicted courses: 
50.048 Computational Fabrication
01.104 Networked Life
50.017 Graphics and Visualisation
50.035 Computer Vision
40.319 Statist

100%|██████████| 1396/1396 [02:30<00:00,  9.26it/s]
100%|██████████| 1396/1396 [03:38<00:00,  6.40it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 33.0min 17.47965383529663s

Training iteration: 4
original query from training sample: cleaning, business, real, soup, concept
query expansion with correlation matrix: ['cleaning', 'business', 'real', 'soup', 'concept', 'value', 'world', 'course']
query expansion with pretrained corpus: ['cleaning', 'washing', 'cleaned', 'cleaners', 'business', 'businesses', 'industry', 'companies', 'real', 'true', 'just', 'what', 'soup', 'soups', 'stew', 'noodle', 'concept', 'concepts', 'idea', 'notion', 'value', 'values', 'valued', 'price', 'world', 'ever', 'time', 'global', 'course', 'courses', 'golf', 'way']
running basic bm25
initial predicted courses: 
40.305 Advanced Topics in Stochastic Modelling#
40.318 Supply Chain Digitalisation and Design
50.017 Graphics and Visualisation
40.323 Equity Valuation
01.104 Networked Life
40.324 Fundamentals of Investing
40.316 Game Theory
40.240 Investment Science
4

100%|██████████| 1396/1396 [02:28<00:00,  9.38it/s]
100%|██████████| 1396/1396 [03:29<00:00,  6.68it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 44.0min 5.946018934249878s

Training iteration: 5
original query from training sample: wireframing, inventory, long, system, technology
query expansion with correlation matrix: ['wireframing', 'inventory', 'long', 'technology', 'production', 'localization', 'cloud']
query expansion with pretrained corpus: ['wireframing', 'inventory', 'inventories', 'unsold', 'stockpiles', 'long', 'short', 'longer', 'years', 'technology', 'technologies', 'tech', 'technological', 'production', 'producing', 'output', 'producers', 'localization', 'localisation', 'subcellular', 'internationalization', 'cloud', 'clouds', 'ash', 'shadow']
running basic bm25
initial predicted courses: 
50.046 Cloud Computing and Internet of Things
40.318 Supply Chain Digitalisation and Design
40.260 Supply Chain Management
50.017 Graphics and Visualisation
50.035 Computer Vision
Service Design Studio
40.232 Water Resources Manageme

100%|██████████| 1396/1396 [02:27<00:00,  9.47it/s]
100%|██████████| 1396/1396 [03:31<00:00,  6.59it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 54.0min 50.29411196708679s

Training iteration: 6
original query from training sample: solidity, prototyping, accessibility, network, service
query expansion with correlation matrix: ['solidity', 'prototyping', 'accessibility', 'network', 'service', 'audio', 'neural', 'technology']
query expansion with pretrained corpus: ['solidity', 'soundness', 'robustness', 'predictability', 'prototyping', 'workflow', 'computer-aided', 'simulation', 'accessibility', 'affordability', 'availability', 'usability', 'network', 'networks', 'cable', 'channel', 'service', 'services', '.', 'news', 'audio', 'video', 'stereo', 'dvd', 'neural', 'neuronal', 'neurons', 'cortical', 'technology', 'technologies', 'tech', 'technological']
running basic bm25
initial predicted courses: 
01.117 Brain-Inspired Computing and its Applications (Term 8)
50.035 Computer Vision
50.006 User Interface Design and Implementation
50.042

100%|██████████| 1396/1396 [02:30<00:00,  9.29it/s]
100%|██████████| 1396/1396 [03:33<00:00,  6.55it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 65.0min 43.255290031433105s

Training iteration: 7
original query from training sample: software, personalisation, math, science, different
query expansion with correlation matrix: ['software', 'personalisation', 'math', 'science', 'different', 'analyze', 'branching', 'computation', 'application']
query expansion with pretrained corpus: ['software', 'computer', 'microsoft', 'hardware', 'personalisation', 'cychropsis', 'self-righteousness', 'systemd', 'math', 'mathematics', 'graders', 'maths', 'science', 'sciences', 'physics', 'scientific', 'different', 'various', 'these', 'types', 'analyze', 'evaluate', 'analyse', 'examine', 'branching', 'branched', 'stems', 'unbranched', 'computation', 'computations', 'computational', 'algorithms', 'application', 'applications', 'applied', 'apply']
running basic bm25
initial predicted courses: 
40.305 Advanced Topics in Stochastic Modelling#
50.048 Computa

100%|██████████| 1396/1396 [02:29<00:00,  9.35it/s]
100%|██████████| 1396/1396 [03:32<00:00,  6.58it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 76.0min 34.395751953125s

Training iteration: 8
original query from training sample: aspect, investment, beautiful, chain, hadoop
query expansion with correlation matrix: ['aspect', 'investment', 'beautiful', 'chain', 'hadoop', 'idea', 'combine', 'supply', 'aware']
query expansion with pretrained corpus: ['aspect', 'aspects', 'perspective', 'facet', 'investment', 'investments', 'investing', 'fund', 'beautiful', 'lovely', 'gorgeous', 'wonderful', 'chain', 'chains', 'stores', 'store', 'hadoop', 'mapreduce', 'open-source', 'openoffice', 'idea', 'notion', 'concept', 'something', 'combine', 'add', 'mixture', 'mix', 'supply', 'supplies', 'shortage', 'supplied', 'aware', 'concerned', 'obviously', 'unaware']
running basic bm25
initial predicted courses: 
40.260 Supply Chain Management
40.318 Supply Chain Digitalisation and Design
40.323 Equity Valuation
40.324 Fundamentals of Investing
50.033 Found

100%|██████████| 1396/1396 [02:26<00:00,  9.52it/s]
100%|██████████| 1396/1396 [03:28<00:00,  6.71it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 87.0min 14.269365072250366s

Training iteration: 9
original query from training sample: infrastructure, statistic, relevance, operation, mining
query expansion with correlation matrix: ['infrastructure', 'statistic', 'relevance', 'operation', 'mining', 'public', 'challenge']
query expansion with pretrained corpus: ['infrastructure', 'infrastructures', 'projects', 'upgrading', 'statistic', 'statistics', 'bps', 'statistical', 'relevance', 'usefulness', 'significance', 'validity', 'operation', 'operations', 'raid', 'forces', 'mining', 'mines', 'coal', 'mine', 'public', 'private', 'government', 'education', 'challenge', 'challenges', 'challenging', 'faced']
running basic bm25
initial predicted courses: 
01.102 Energy Systems and Management
50.037 Blockchain Technology
40.230 Sustainable Engineering
50.042 Foundations of Cybersecurity
01.107 Urban Transportation
40.260 Supply Chain Management
40

100%|██████████| 1396/1396 [02:38<00:00,  8.78it/s]
100%|██████████| 1396/1396 [03:33<00:00,  6.54it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 98.0min 12.452136993408203s

Training iteration: 10
original query from training sample: test, cast, mongodb, vector, social
query expansion with correlation matrix: ['test', 'cast', 'mongodb', 'vector', 'social', 'aspect', 'airtop', 'discriminative', 'fairness']
query expansion with pretrained corpus: ['test', 'tests', 'testing', 'tested', 'cast', 'casting', 'casts', 'actors', 'mongodb', 'nosql', 'scriptable', 'rhizaria', 'vector', 'vectors', 'formula_1', 'formula_2', 'social', 'welfare', 'education', 'political', 'aspect', 'aspects', 'perspective', 'facet', 'airtop', 'discriminative', 'discriminatory', 'high-handed', 'cobweb', 'fairness', 'honesty', 'impartiality', 'objectivity']
running basic bm25
initial predicted courses: 
01.104 Networked Life
40.321 Airport Systems Modelling and Simulation
50.033 Foundations of Game Design and Development
40.230 Sustainable Engineering
40.232 Water R

100%|██████████| 1396/1396 [02:27<00:00,  9.46it/s]
100%|██████████| 1396/1396 [03:29<00:00,  6.65it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 108.0min 59.47857689857483s

Training iteration: 11
original query from training sample: boolean, autocad, database, answer, nash
query expansion with correlation matrix: ['boolean', 'autocad', 'database', 'answer', 'nash', 'big', 'question']
query expansion with pretrained corpus: ['boolean', 'propositional', 'commutative', 'algebra', 'autocad', 'autodesk', 'cad', 'revit', 'database', 'databases', 'data', 'searchable', 'answer', 'answers', 'questions', 'question', 'nash', 'finley', 'nowitzki', 'kidd', 'big', 'bigger', 'huge', 'biggest', 'question', 'questions', 'answer', 'whether']
running basic bm25
initial predicted courses: 
50.043 Database Systems
01.104 Networked Life
50.045 Information Retrieval
50.038 Computational Data Science
50.017 Graphics and Visualisation
01.117 Brain-Inspired Computing and its Applications (Term 8)
40.323 Equity Valuation
50.039 Theory and Practice of Deep Le

100%|██████████| 1396/1396 [02:28<00:00,  9.41it/s]
100%|██████████| 1396/1396 [03:31<00:00,  6.61it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 119.0min 49.81314492225647s

Training iteration: 12
original query from training sample: swot, probability, software, certain, info
query expansion with correlation matrix: ['swot', 'probability', 'software', 'certain', 'info', 'branching', 'analyze', 'chunk']
query expansion with pretrained corpus: ['swot', 'stft', 'onomasticon', 'stochastics', 'probability', 'probabilities', 'likelihood', 'parameter', 'software', 'computer', 'microsoft', 'hardware', 'certain', 'particular', 'such', 'these', 'info', 'information', '(800)', 'tidbits', 'branching', 'branched', 'stems', 'unbranched', 'analyze', 'evaluate', 'analyse', 'examine', 'chunk', 'chunks', 'sizable', 'slice']
running basic bm25
initial predicted courses: 
40.305 Advanced Topics in Stochastic Modelling#
50.017 Graphics and Visualisation
50.036 Foundations of Distributed Autonomous Systems
50.042 Foundations of Cybersecurity
50.044 Syste

100%|██████████| 1396/1396 [02:28<00:00,  9.41it/s]
100%|██████████| 1396/1396 [03:31<00:00,  6.60it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 130.0min 39.5250780582428s

Training iteration: 13
original query from training sample: mongodb, unity, pytorch, system, jepsen
query expansion with correlation matrix: ['mongodb', 'unity', 'pytorch', 'jepsen', 'order']
query expansion with pretrained corpus: ['mongodb', 'nosql', 'scriptable', 'rhizaria', 'unity', 'reconciliation', 'solidarity', 'peace', 'pytorch', 'jepsen', 'rae', 'kheda', 'carly', 'order', 'orders', 'ordered', 'to']
running basic bm25
initial predicted courses: 
50.038 Computational Data Science
50.021 Artificial Intelligence
50.006 User Interface Design and Implementation
50.007 Machine Learning
50.012 Networks
50.017 Graphics and Visualisation
50.020 Network Security
50.033 Foundations of Game Design and Development
50.035 Computer Vision
50.036 Foundations of Distributed Autonomous Systems

Relevant & Retrieved: Rank 8 50.033 Foundations of Game Design and Development


100%|██████████| 1396/1396 [57:17<00:00,  2.46s/it]
100%|██████████| 1396/1396 [4:04:03<00:00, 10.49s/it]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 497.0min 31.037185192108154s

Training iteration: 14
original query from training sample: ui, nash, investment, prototyping, finance
query expansion with correlation matrix: ['ui', 'nash', 'investment', 'prototyping', 'finance', 'combine', 'box', 'return']
query expansion with pretrained corpus: ['ui', 'fiachrach', 'uí', 'pak', 'nash', 'finley', 'nowitzki', 'kidd', 'investment', 'investments', 'investing', 'fund', 'prototyping', 'workflow', 'computer-aided', 'simulation', 'finance', 'minister', 'financial', 'financing', 'combine', 'add', 'mixture', 'mix', 'box', 'boxes', 'p.o.', 'dvd', 'return', 'returning', 'returned', 'returns']
running basic bm25
initial predicted courses: 
40.324 Fundamentals of Investing
40.323 Equity Valuation
40.240 Investment Science
40.317 Financial Systems Design
40.305 Advanced Topics in Stochastic Modelling#
50.033 Foundations of Game Design and Development
50.0

100%|██████████| 1396/1396 [56:03<00:00,  2.41s/it]
100%|██████████| 1396/1396 [03:36<00:00,  6.44it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 575.0min 12.373286962509155s

Training iteration: 15
original query from training sample: boolean, aviation, supply, urban, aws
query expansion with correlation matrix: ['boolean', 'aviation', 'supply', 'urban', 'aws', 'chain', 'basin', 'differ']
query expansion with pretrained corpus: ['boolean', 'propositional', 'commutative', 'algebra', 'aviation', 'aircraft', 'airline', 'aerospace', 'supply', 'supplies', 'shortage', 'supplied', 'urban', 'rural', 'sprawl', 'areas', 'aws', 'khazraj', 'rds', 'nahas', 'chain', 'chains', 'stores', 'store', 'basin', 'basins', 'river', 'watershed', 'differ', 'differing', 'differed', 'vary']
running basic bm25
initial predicted courses: 
40.232 Water Resources Management
40.260 Supply Chain Management
40.318 Supply Chain Digitalisation and Design
01.107 Urban Transportation
01.102 Energy Systems and Management
50.043 Database Systems
40.317 Financial Systems De

100%|██████████| 1396/1396 [02:36<00:00,  8.92it/s]
100%|██████████| 1396/1396 [03:37<00:00,  6.42it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 586.0min 36.85682201385498s

Training iteration: 16
original query from training sample: gurobi, ai, unity, digitalisation, decision
query expansion with correlation matrix: ['gurobi', 'ai', 'unity', 'digitalisation', 'decision', 'healthcare', 'align', 'linear']
query expansion with pretrained corpus: ['gurobi', 'ai', 'sugiyama', 'n’t', 'gonna', 'unity', 'reconciliation', 'solidarity', 'peace', 'digitalisation', 'decision', 'decisions', 'decided', 'move', 'healthcare', 'care', 'health', 'provider', 'align', 'colspan', '|', '9e9e9', 'linear', 'nonlinear', 'non-linear', 'equations']
running basic bm25
initial predicted courses: 
01.116 AI for Healthcare (Term 7)
40.324 Fundamentals of Investing
40.318 Supply Chain Digitalisation and Design
01.117 Brain-Inspired Computing and its Applications (Term 8)
40.319 Statistical and Machine Learning
50.039 Theory and Practice of Deep Learning
40.230 Su

100%|██████████| 1396/1396 [02:31<00:00,  9.23it/s]
100%|██████████| 1396/1396 [03:33<00:00,  6.53it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 597.0min 39.96846604347229s

Training iteration: 17
original query from training sample: c, finance, retrieval, mining, opponent
query expansion with correlation matrix: ['c', 'finance', 'retrieval', 'mining', 'opponent', 'return', 'video']
query expansion with pretrained corpus: ['c', 'b', 'f', 'g', 'finance', 'minister', 'financial', 'financing', 'retrieval', 'archiving', 'retrieving', 'semantic', 'mining', 'mines', 'coal', 'mine', 'opponent', 'opponents', 'foe', 'defeating', 'return', 'returning', 'returned', 'returns', 'video', 'videos', 'audio', 'dvd']
running basic bm25
initial predicted courses: 
40.324 Fundamentals of Investing
40.240 Investment Science
40.305 Advanced Topics in Stochastic Modelling#
40.317 Financial Systems Design
50.045 Information Retrieval
50.033 Foundations of Game Design and Development
50.017 Graphics and Visualisation
40.242 Derivative Pricing and Risk Manag

100%|██████████| 1396/1396 [02:27<00:00,  9.45it/s]
100%|██████████| 1396/1396 [03:30<00:00,  6.65it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 608.0min 27.324569940567017s

Training iteration: 18
original query from training sample: soup, business, balance, average, xd
query expansion with correlation matrix: ['soup', 'business', 'balance', 'average', 'xd', 'value', 'bsm']
query expansion with pretrained corpus: ['soup', 'soups', 'stew', 'noodle', 'business', 'businesses', 'industry', 'companies', 'balance', 'balanced', 'balances', 'balancing', 'average', 'averages', 'per', 'compared', 'xd', 'jetix', 'xt', 'radeon', 'value', 'values', 'valued', 'price', 'bsm', 'itsm', 'stm', 'white-necked']
running basic bm25
initial predicted courses: 
40.323 Equity Valuation
40.318 Supply Chain Digitalisation and Design
40.242 Derivative Pricing and Risk Management
40.324 Fundamentals of Investing
40.305 Advanced Topics in Stochastic Modelling#
40.240 Investment Science
40.317 Financial Systems Design
01.104 Networked Life
Service Design Studio


100%|██████████| 1396/1396 [02:27<00:00,  9.46it/s]
100%|██████████| 1396/1396 [03:29<00:00,  6.66it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 619.0min 13.700055122375488s

Training iteration: 19
original query from training sample: decision, derivative, artificial, relevance, space
query expansion with correlation matrix: ['decision', 'derivative', 'artificial', 'relevance', 'space', 'linear', 'option', 'intelligence', 'search']
query expansion with pretrained corpus: ['decision', 'decisions', 'decided', 'move', 'derivative', 'derivatives', 'covariant', 'inverse', 'artificial', 'synthetic', 'insemination', 'man-made', 'relevance', 'usefulness', 'significance', 'validity', 'space', 'nasa', 'spacecraft', 'spaces', 'linear', 'nonlinear', 'non-linear', 'equations', 'option', 'options', 'choice', 'choose', 'intelligence', 'cia', 'information', 'security', 'search', 'searching', 'searches', 'find']
running basic bm25
initial predicted courses: 
40.242 Derivative Pricing and Risk Management
50.042 Foundations of Cybersecurity
50.021 Art

100%|██████████| 1396/1396 [02:29<00:00,  9.35it/s]
100%|██████████| 1396/1396 [03:34<00:00,  6.51it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 630.0min 7.59909200668335s

Training iteration: 20
original query from training sample: business, amazon, figma, strategy, solidity
query expansion with correlation matrix: ['business', 'amazon', 'figma', 'strategy', 'solidity', 'value', 'option']
query expansion with pretrained corpus: ['business', 'businesses', 'industry', 'companies', 'amazon', 'amazon.com', 'rainforest', 'amazonian', 'figma', 'strategy', 'strategies', 'policy', 'plan', 'solidity', 'soundness', 'robustness', 'predictability', 'value', 'values', 'valued', 'price', 'option', 'options', 'choice', 'choose']
running basic bm25
initial predicted courses: 
40.242 Derivative Pricing and Risk Management
40.318 Supply Chain Digitalisation and Design
40.323 Equity Valuation
40.324 Fundamentals of Investing
40.317 Financial Systems Design
40.305 Advanced Topics in Stochastic Modelling#
40.240 Investment Science
01.107 Urban Transpor

100%|██████████| 1396/1396 [02:37<00:00,  8.84it/s]
100%|██████████| 1396/1396 [03:39<00:00,  6.35it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 641.0min 28.5466730594635s

Training iteration: 21
original query from training sample: risk, overview, certain, learn, interview
query expansion with correlation matrix: ['risk', 'overview', 'certain', 'learn', 'interview', 'arbitrage', 'audio', 'chunk', 'algorithm']
query expansion with pretrained corpus: ['risk', 'risks', 'danger', 'likelihood', 'overview', 'synopsis', 'in-depth', 'chronology', 'certain', 'particular', 'such', 'these', 'learn', 'learned', 'teach', 'understand', 'interview', 'told', 'interviews', 'saying', 'arbitrage', 'trades', 'unwinding', 'hedging', 'audio', 'video', 'stereo', 'dvd', 'chunk', 'chunks', 'sizable', 'slice', 'algorithm', 'algorithms', 'iterative', 'approximation']
running basic bm25
initial predicted courses: 
40.324 Fundamentals of Investing
50.033 Foundations of Game Design and Development
40.242 Derivative Pricing and Risk Management
50.017 Graphics an

100%|██████████| 1396/1396 [02:31<00:00,  9.24it/s]
100%|██████████| 1396/1396 [03:32<00:00,  6.55it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 652.0min 34.307454109191895s

Training iteration: 22
original query from training sample: ec, hierarchy, video, decision, gain
query expansion with correlation matrix: ['ec', 'hierarchy', 'video', 'decision', 'gain', 'retrieval', 'linear', 'video']
query expansion with pretrained corpus: ['ec', 'bimst', 'barroso', 'eu', 'hierarchy', 'hierarchical', 'hierarchies', 'clerical', 'video', 'videos', 'audio', 'dvd', 'decision', 'decisions', 'decided', 'move', 'gain', 'gaining', 'gained', 'gains', 'retrieval', 'archiving', 'retrieving', 'semantic', 'linear', 'nonlinear', 'non-linear', 'equations', 'video', 'videos', 'audio', 'dvd']
running basic bm25
initial predicted courses: 
50.045 Information Retrieval
50.033 Foundations of Game Design and Development
50.017 Graphics and Visualisation
40.324 Fundamentals of Investing
50.035 Computer Vision
40.319 Statistical and Machine Learning
50.040 Natural 

100%|██████████| 1396/1396 [02:28<00:00,  9.38it/s]
100%|██████████| 1396/1396 [03:32<00:00,  6.56it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 663.0min 25.777184009552002s

Training iteration: 23
original query from training sample: nash, algebra, user, hidden, logistics
query expansion with correlation matrix: ['nash', 'algebra', 'user', 'hidden', 'logistics', 'interface', 'regression', 'alignment']
query expansion with pretrained corpus: ['nash', 'finley', 'nowitzki', 'kidd', 'algebra', 'algebras', 'commutative', 'algebraic', 'user', 'users', 'interface', 'allows', 'hidden', 'concealed', 'hiding', 'hid', 'logistics', 'logistic', 'logistical', 'transport', 'interface', 'interfaces', 'graphical', 'functionality', 'regression', 'multivariate', 'nonlinear', 'regressions', 'alignment', 'alignments', 'routing', 'terminus']
running basic bm25
initial predicted courses: 
50.006 User Interface Design and Implementation
40.317 Financial Systems Design
40.302 Advanced Topics in Optimisation#
50.007 Machine Learning
40.318 Supply Chain Digi

100%|██████████| 1396/1396 [02:31<00:00,  9.19it/s]
100%|██████████| 1396/1396 [03:37<00:00,  6.42it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 674.0min 27.61254620552063s

Training iteration: 24
original query from training sample: heuristic, demand, building, autocad, airport
query expansion with correlation matrix: ['heuristic', 'demand', 'building', 'autocad', 'airport', 'guideline', 'capacity', 'extend', 'management']
query expansion with pretrained corpus: ['heuristic', 'heuristics', 'algorithms', 'probabilistic', 'demand', 'demands', 'prices', 'increased', 'building', 'buildings', 'built', 'construction', 'autocad', 'autodesk', 'cad', 'revit', 'airport', 'airports', 'flights', 'heathrow', 'guideline', 'guidelines', 'specifies', 'stipulated', 'capacity', 'capacities', 'capability', 'increase', 'extend', 'extending', 'extended', 'extends', 'management', 'managers', 'managing', 'asset']
running basic bm25
initial predicted courses: 
40.321 Airport Systems Modelling and Simulation
40.320 Airport Systems Planning and Design
40.26

100%|██████████| 1396/1396 [02:34<00:00,  9.06it/s]
100%|██████████| 1396/1396 [03:39<00:00,  6.37it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 685.0min 32.67095685005188s

Training iteration: 25
original query from training sample: evaluate, ui, istd, accessibility, price
query expansion with correlation matrix: ['evaluate', 'ui', 'istd', 'accessibility', 'price', 'deep', 'financial']
query expansion with pretrained corpus: ['evaluate', 'assess', 'evaluating', 'examine', 'ui', 'fiachrach', 'uí', 'pak', 'istd', 'idta', 'tfa', 'lcci', 'accessibility', 'affordability', 'availability', 'usability', 'price', 'prices', 'cost', 'market', 'deep', 'deeper', 'deepest', 'shallow', 'financial', 'economic', 'banking', 'crisis']
running basic bm25
initial predicted courses: 
40.240 Investment Science
40.317 Financial Systems Design
50.042 Foundations of Cybersecurity
40.230 Sustainable Engineering
01.102 Energy Systems and Management
40.324 Fundamentals of Investing
50.006 User Interface Design and Implementation
40.316 Game Theory
40.242 Deriv

100%|██████████| 1396/1396 [02:35<00:00,  9.00it/s]
100%|██████████| 1396/1396 [03:39<00:00,  6.35it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 696.0min 39.70691394805908s

Training iteration: 26
original query from training sample: skill, different, calculation, cleaning, derivative
query expansion with correlation matrix: ['skill', 'different', 'calculation', 'cleaning', 'derivative', 'long', 'application', 'option']
query expansion with pretrained corpus: ['skill', 'skills', 'abilities', 'ability', 'different', 'various', 'these', 'types', 'calculation', 'calculations', 'calculating', 'calculate', 'cleaning', 'washing', 'cleaned', 'cleaners', 'derivative', 'derivatives', 'covariant', 'inverse', 'long', 'short', 'longer', 'years', 'application', 'applications', 'applied', 'apply', 'option', 'options', 'choice', 'choose']
running basic bm25
initial predicted courses: 
40.242 Derivative Pricing and Risk Management
40.317 Financial Systems Design
40.232 Water Resources Management
40.240 Investment Science
50.017 Graphics and Visuali

100%|██████████| 1396/1396 [03:08<00:00,  7.41it/s]
100%|██████████| 1396/1396 [04:10<00:00,  5.57it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 708.0min 54.93669891357422s

Training iteration: 27
original query from training sample: excel, hadoop, approach, learn, jepsen
query expansion with correlation matrix: ['excel', 'hadoop', 'approach', 'learn', 'jepsen', 'aware', 'function', 'algorithm']
query expansion with pretrained corpus: ['excel', 'spreadsheet', 'spreadsheets', 'powerpoint', 'hadoop', 'mapreduce', 'open-source', 'openoffice', 'approach', 'approaches', 'strategy', 'way', 'learn', 'learned', 'teach', 'understand', 'jepsen', 'rae', 'kheda', 'carly', 'aware', 'concerned', 'obviously', 'unaware', 'function', 'functions', 'i.e.', 'functional', 'algorithm', 'algorithms', 'iterative', 'approximation']
running basic bm25
initial predicted courses: 
40.318 Supply Chain Digitalisation and Design
40.242 Derivative Pricing and Risk Management
50.017 Graphics and Visualisation
50.007 Machine Learning
50.038 Computational Data Scienc

100%|██████████| 1396/1396 [02:35<00:00,  8.96it/s]
100%|██████████| 1396/1396 [03:39<00:00,  6.36it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 720.0min 32.87126684188843s

Training iteration: 28
original query from training sample: financial, system, element, supply, long
query expansion with correlation matrix: ['financial', 'element', 'supply', 'long', 'price', 'account', 'chain', 'localization']
query expansion with pretrained corpus: ['financial', 'economic', 'banking', 'crisis', 'element', 'elements', 'component', 'aspect', 'supply', 'supplies', 'shortage', 'supplied', 'long', 'short', 'longer', 'years', 'price', 'prices', 'cost', 'market', 'account', 'accounts', 'personal', 'savings', 'chain', 'chains', 'stores', 'store', 'localization', 'localisation', 'subcellular', 'internationalization']
running basic bm25
initial predicted courses: 
40.260 Supply Chain Management
40.318 Supply Chain Digitalisation and Design
40.317 Financial Systems Design
40.240 Investment Science
01.102 Energy Systems and Management
40.324 Fundamental

100%|██████████| 1396/1396 [02:35<00:00,  8.97it/s]
100%|██████████| 1396/1396 [03:35<00:00,  6.48it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 731.0min 39.64076900482178s

Training iteration: 29
original query from training sample: ampl, python, optimize, aws, learn
query expansion with correlation matrix: ['ampl', 'python', 'optimize', 'aws', 'learn', 'abstract', 'arp', 'differ', 'algorithm']
query expansion with pretrained corpus: ['ampl', 'mangxamba', 'ufdots', 'uninitialized', 'python', 'monty', 'perl', 'cleese', 'optimize', 'optimizing', 'optimise', 'readjust', 'aws', 'khazraj', 'rds', 'nahas', 'learn', 'learned', 'teach', 'understand', 'abstract', 'expressionist', 'expressionism', 'figurative', 'arp', 'schnitger', '2600', 'synthesiser', 'differ', 'differing', 'differed', 'vary', 'algorithm', 'algorithms', 'iterative', 'approximation']
running basic bm25
initial predicted courses: 
40.317 Financial Systems Design
50.007 Machine Learning
50.017 Graphics and Visualisation
40.319 Statistical and Machine Learning
50.043 Database 

100%|██████████| 1396/1396 [02:34<00:00,  9.01it/s]
100%|██████████| 1396/1396 [03:37<00:00,  6.42it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 742.0min 45.90479397773743s

Training iteration: 30
original query from training sample: security, solidity, c, statistic, calculation
query expansion with correlation matrix: ['security', 'solidity', 'c', 'statistic', 'calculation', 'common']
query expansion with pretrained corpus: ['security', 'forces', 'military', 'intelligence', 'solidity', 'soundness', 'robustness', 'predictability', 'c', 'b', 'f', 'g', 'statistic', 'statistics', 'bps', 'statistical', 'calculation', 'calculations', 'calculating', 'calculate', 'common', 'commonly', 'example', 'similar']
running basic bm25
initial predicted courses: 
50.020 Network Security
01.117 Brain-Inspired Computing and its Applications (Term 8)
50.044 System Security
50.042 Foundations of Cybersecurity
40.305 Advanced Topics in Stochastic Modelling#
40.232 Water Resources Management
40.319 Statistical and Machine Learning
50.021 Artificial Intelli

100%|██████████| 1396/1396 [02:37<00:00,  8.88it/s]
100%|██████████| 1396/1396 [03:40<00:00,  6.33it/s]


updated tf, tf_norm, df, idf, asociation matrix, association matrix norm scores...
time elapsed: 754.0min 18.709813117980957s

time elapsed: 754.0min 18.714138984680176s


In [41]:
# main code, once pipeline is up, will transfer all to training.py
start_time = time.time()
vocab = tf.index.tolist()
total_length = tf.to_numpy().sum()
avg_doc_len = total_length / len(tf.columns)
top_retrieved = 10
sample = 0

# take for example first iteration 
query = added_data['querySample'][sample]
# lemitization
query = process_query(query)
# included associated terms
query = get_associated_words(query, norm_association_matrix)
# include words from pretrained w2v corpus
query = expand_query(query,glove_kv,topn=3)
print('initial query after passing through association matrix and expanded with pretrained corpus')
print(query)
result, ls = bm25_prediction(query=query, df=df, tf=tf, tf_norm=tf_norm, idf=idf, vocab=vocab, \
                             avg_doc_len=avg_doc_len, reformulated=False)
# only want top 10 predicted courses
predicted = ls[:top_retrieved]
print(f'initial prediction: ')
for course in predicted:
    print(course)
print('')

gold_standard = added_data['expectedElectivesInOrder'][sample]
relevant_courses = []

# print out documents which are relevant are retrieved, comparing with gold standard
for i in range(1,top_retrieved+1):
    if gold_standard[i-1] == predicted[i-1]:
        relevant_courses.append(predicted[i-1])
        print(f'Relevant & Retrieved: Rank {i} {predicted[i-1]}')

# get top 3 associated words from each relevant and retrieved course 
associated_words = get_top_k_associated_words(relevant_courses, tf, norm_association_matrix, k=3)
# add associated words to orginal query to form reformulated query
query+=associated_words
print('')
print('reformulated query after passing through association matrix')
print(query)

# insert reformulated query into bm25_reformulated 
result, ls = bm25_prediction(query=query, df=df, tf=tf, tf_norm=tf_norm, idf=idf, vocab=vocab, \
                             avg_doc_len=avg_doc_len, relevant_courses=relevant_courses, reformulated=True)
predicted_reformulated = ls[:top_retrieved]
print(f'prediction after query reformulation')
for course in predicted_reformulated:
    print(course)

# update tf, tf_norm, idf, df
tf, tf_norm, df, idf = update_scores(tf=tf, relevant_courses=relevant_courses, associated_words=associated_words)

# update correlation matrix
association_matrix,unique_words = create_association_matrix(tf=tf)
norm_association_matrix = create_norm_association_matrix(association_matrix = association_matrix,unique_words = unique_words)



print(f'time elapsed: {(time.time()-start_time)//60}min {(time.time()-start_time)%60}s')
tf.to_csv('../../../data/bm25/bm25_relevance_feedback/merged_courses_tf_trained.csv')
tf_norm.to_csv('../../../data/bm25/bm25_relevance_feedback/merged_courses_tf_norm_trained.csv')
df.to_csv('../../../data/bm25/bm25_relevance_feedback/merged_courses_df_trained.csv')
idf.to_csv('../../../data/bm25/bm25_relevance_feedback/merged_courses_idf_trained.csv')
association_matrix.to_csv('../../../data/bm25/bm25_relevance_feedback/association_matrix_trained.csv')
norm_association_matrix.to_csv('../../../data/bm25/bm25_relevance_feedback/association_matrix_norm_trained.csv')


initial query after passing through association matrix and expanded with pretrained corpus
['golang', 'cod', 'haddock', 'halibut', 'nantucket', 'feedback', 'input', 'responses', 'interaction', 'aviation', 'aircraft', 'airline', 'aerospace', 'statistic', 'statistics', 'bps', 'statistical', 'load', 'loads', 'loading', 'loaded', 'fairness', 'honesty', 'impartiality', 'objectivity']
running basic bm25
initial prediction: 
01.104 Networked Life
40.317 Financial Systems Design
50.006 User Interface Design and Implementation
40.232 Water Resources Management
50.039 Theory and Practice of Deep Learning
40.320 Airport Systems Planning and Design
01.107 Urban Transportation
01.117 Brain-Inspired Computing and its Applications (Term 8)
40.321 Airport Systems Modelling and Simulation
40.319 Statistical and Machine Learning

Relevant & Retrieved: Rank 6 40.320 Airport Systems Planning and Design

reformulated query after passing through association matrix
['golang', 'cod', 'haddock', 'halibut', 'na

  0%|          | 0/1396 [00:00<?, ?it/s]

prediction after query reformulation
40.320 Airport Systems Planning and Design
50.047 Mobile Robotics
01.104 Networked Life
50.039 Theory and Practice of Deep Learning
01.107 Urban Transportation
40.232 Water Resources Management
40.317 Financial Systems Design
01.117 Brain-Inspired Computing and its Applications (Term 8)
40.305 Advanced Topics in Stochastic Modelling#
50.006 User Interface Design and Implementation


100%|██████████| 1396/1396 [15:21<00:00,  1.51it/s]
100%|██████████| 1396/1396 [04:14<00:00,  5.49it/s]


time elapsed: 19.0min 39.429699659347534s


In [82]:
norm_asc_matrix = pd.read_csv('../../../data/bm25/bm25_relevance_feedback/association_matrix_norm_trained.csv',header = 0 , index_col = 0)
norm_asc_matrix.head()

Unnamed: 0,ability,able,abstract,abstraction,accommodate,accompanies,account,accounting,achieve,acquaint,...,window,wireless,word,wordvec,work,world,write,xl,year,zigbee
ability,,0.006944,0.0,0.0,0.0,0.333333,0.0,0.0,0.2,0.0,...,0.0,0.0,0.0,0.0,0.047619,0.0,0.1,0.0,0.0,0.0
able,0.006944,,0.0,0.0,0.0,0.0,0.014286,0.028369,0.0,0.0,...,0.0,0.0,0.014184,0.014286,0.06,0.154321,0.0,0.0,0.014286,0.0
abstract,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abstraction,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
accommodate,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [90]:
norm_asc_matrix['cod'].astype(float).idxmax()

'load'