# Libraries

In [1]:
import pandas as pd
import numpy as np
import regex as re
import heapq
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from importlib import reload  # To reload the imported modules

# Importing our custom modules for creating the index
from modules import engine_v1, engine_v2
# Reload both modules so that the code currently on disk is the one in use
reload(engine_v1)
reload(engine_v2)

# Setting the NLTK environment to work with English language
# (quiet=True silences the download log; 'punkt' is presumably the resource needed by word_tokenize)
nltk.download("stopwords", quiet=True)
nltk.download("punkt", quiet=True)
# English stop words, later used to filter the tokens of the descriptions
stops = set(stopwords.words('english'))

# Stemmer applied to the tokens during the preprocessing
porterStemmer = PorterStemmer()


# To display all columns and not only a sample
pd.set_option('display.max_columns', None)

# Folder that contains the course_<i>.tsv files
dataset_folder = "data/TSVs/"

# Number of course files to load (course_1.tsv ... course_<NUM_COURSES>.tsv)
NUM_COURSES = 100

---
# [2] Search Engine

## [2.0] Preprocessing the text

We use the nltk library to preprocess the dataset before using it. 

First of all we take a look at the dataset to see what it looks like. In particular we print all the unique values for the columns `isItFullTime`, `startDate`, `duration`, `administration` because they are expected to be categorical variables.

Maybe check the consistency between `isItFullTime` and `duration`.

The column `description` will be preprocessed with the following operations:
  1. Removing punctuation
  2. Tokenization
  3. Removing stopwords
  4. Stemming
  5. Lowering the case of all the words (in the code below this is actually the first operation applied)

In [2]:
# First of all we import all the files and we create a single big dataframe to understand the data
col_names = ['courseName','universityName','facultyName', 'isItFullTime','description','startDate','fees','modality','duration','city','country','administration','url']

# The per-course dataframes are collected in a list and concatenated only once at the end:
# calling pd.concat inside the loop would copy the whole accumulated dataframe at every iteration
course_frames = []
for i in range(1, NUM_COURSES + 1):
    # Read the i-th course_i.tsv file (tab separated, without a header row)
    df_course = pd.read_csv(f"{dataset_folder}course_{i}.tsv", sep='\t', names=col_names, header=None)
    # Use the course number i as index (named 'index') for the rows of this file
    df_course = df_course.assign(index=i).set_index('index')
    course_frames.append(df_course)

# Full dataframe (pd.concat refuses an empty list, so we fall back to an empty dataframe)
df = pd.concat(course_frames) if course_frames else pd.DataFrame()

# Untouched copy, so that we can always use it when given the output of a query
df_original = df.copy()

# Print the dataframe
df

Unnamed: 0_level_0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,country,administration,url
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,Full time,3D visualisation and animation play a role in ...,September,Please see the university website for further ...,MSc,1 year full-time,Glasgow,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
2,"Accounting, Accountability & Financial Managem...",King’s College London,King’s Business School,Full time,"Our Accounting, Accountability & Financial Man...",September,Please see the university website for further ...,MSc,1 year FT,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
3,Accounting and Finance (MSc),University of Bath,School of Management,Full time,Develop in-depth knowledge of accounting and f...,September,Please see the university website for further ...,MSc,1 year full-time,Bath,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
4,Accounting and Finance - MSc,University of Leeds,Leeds University Business School,Full time,Businesses and governments rely on sound finan...,September,"UK: £18,000 (Total)International: £34,750 (Total)",MSc,1 year full time,Leeds,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
5,"Accounting, Financial Management and Digital B...",University of Reading,Henley Business School,Full time,Embark on a professional accounting career wit...,September,Please see the university website for further ...,MSc,1 year full time,Reading,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,Biomedical Science MSc,Coventry University,Faculty of Health and Life Sciences,Full time,"As key contributors to modern healthcare, incl...",September,UK Fees: 2022/23 fees TBC*;2021/22 fees - 1040...,MSc,"1 year full time, 2 years part time, 20 months...",Coventry,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
97,Biomedical Sciences,Maastricht University,Faculty of Health Medicine and Life Sciences,Full time,From gene to health. Explore the clinical and ...,September,Academic year 2024/2025The statutory fee for t...,MSc,2 Years,Maastricht,Netherlands,On Campus,https://www.findamasters.com/masters-degrees/c...
98,Biomedical Sciences - Cancer Biology MSc,University of Westminster,Biological and Biomedical Sciences,Full time,Our Biomedical Sciences (Cancer Biology) MSc h...,September,Please see the university website for further ...,MSc,Full-time: 1 year; Part-time: 2 years,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
99,Biomedical Sciences - Clinical Biochemistry MSc,University of Westminster,Biological and Biomedical Sciences,Full time,Our Biomedical Sciences (Clinical Biochemistry...,September,Please see the university website for further ...,MSc,Full-time: 1 year; Part-time: 2-5 years,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...


In [3]:
# From here on we work on the preprocessing of the 'description' column

# Every NaN value of the dataframe is replaced (in place) by an empty string
df.fillna('', inplace=True)

# The lower-cased descriptions are stored in the new column 'prep_description'
df['prep_description'] = [text.lower() for text in df['description']]
# REMOVE PUNCTUATION using regex
# In particular we substitute all the punctuation with an empty string, without removing the dashes inside the words
# The regex has three filters:
# - the first eliminates everything that is not a letter, a number, a whitespace or a dash
# - the second eliminates every dash that is not preceded by a letter
# - the third eliminates every dash that is not followed by a letter
# Together, the second and the third keep only the dashes that are between two letters.
# Some examples: eye- --> eye, -eye --> eye, eye-catching --> eye-catching, 2-3 --> 23
# NOTE(review): if engine_v1.preprocess applies its own copy of this regex to the query
# (not visible from here), keep the two patterns in sync.

def remove_punctuation(text):
    """Remove the punctuation from `text`, keeping only the dashes between two letters.

    Args:
        text: the value to clean. Anything that is not a string is returned unchanged,
            because re.sub raises a TypeError on non-string values.

    Returns:
        The cleaned string, or `text` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    # Negative lookarounds are used (instead of positive ones) because they also succeed
    # when there is no neighbouring character at all, i.e. at the start/end of the text.
    # All the lookarounds inspect the original text, since the substitution is a single pass.
    return re.sub(r"[^a-zA-Z0-9\s\-]|(?<![a-zA-Z])-|-(?![a-zA-Z])", "", text)

# Punctuation removal on the (already lower-cased) descriptions
df['prep_description'] = df['prep_description'].apply(remove_punctuation)

# TOKENIZATION: each description becomes a list of tokens through NLTK's word_tokenize()
df['prep_description'] = df['prep_description'].apply(word_tokenize)

# REMOVING STOPWORDS: only the tokens that are not in the NLTK stopwords list survive
df['prep_description'] = df['prep_description'].apply(
    lambda tokens: [token for token in tokens if token not in stops]
)

# STEMMING: every remaining token is replaced by its stem, computed by NLTK's PorterStemmer
porterStemmer = PorterStemmer()
df['prep_description'] = df['prep_description'].apply(
    lambda tokens: list(map(porterStemmer.stem, tokens))
)

 ## [2.1] Conjunctive query

### [2.1.1] Create your index!

Most of the functions that we implemented in this section are in the module `engine_v1`. Here we report a brief description; read the comments in the `engine_v1.py` module for further details.

> The module `engine_v1` has the following methods: 
> * `create_vocabulary(df: pd.DataFrame) -> dict`   
>
>   (Creates the vocabulary. **This function must be called to initialize the Search Engine**. It saves two files: `vocabulary.json` and `vocabulary_inverted.json`. The first associates a word with its integer index, while the second does the opposite)
>
> * `get_vocabulary() -> dict`
>
>   (Retrieves the vocabulary (word -> term_id) from the saved file `vocabulary.json`)
>
> * `get_vocabulary_inverted() -> dict`
>
>   (Retrieves the inverted vocabulary (term_id -> word) from the saved file `vocabulary_inverted.json`)
>
> * `get_term_id(word: str) -> int:`
>
>   (From a word get the corresponding term_id)
>
> * `get_word_from_id(term_id: int) -> str:`
>
>   (From a term_id get the corresponding word)
>
> * `create_inverted_index() -> dict:`
>
>   (Creates the inverted index and saves it in the file `inverted_index.json`)
>
> * `get_inverted_index() -> dict:`
>
>   (Retrieves the inverted index from the file `inverted_index.json`)
>
> * `preprocess(text: str) -> list:`
>
>   (Preprocesses a string, in our case it's used to preprocess the query)
>
> * `search(query: str) -> pd.DataFrame:`
>
>   (Given a query, it outputs the documents that contain all the words in the preprocessed query)
> 

The first time that we call the Search Engine (v1) we pass the entire dataframe to it. The dataframe must have a column named `prep_description`, which is the preprocessed version of the column `description`.

So we create the vocabulary, a dictionary mapping each word (in the description field) to an integer `term_id` (this will initialize the Search Engine).

In [4]:
# Create vocabulary
# According to the notes above, this call also initializes the Search Engine (v1);
# df must already contain the 'prep_description' column
vocabulary = engine_v1.create_vocabulary(df)

Now we focus on creating the inverted index. 

We create a new dictionary where each key is a `term_id` and the values are the indexes of the courses that contain that word.

This process is done using the function `create_inverted_index()` in our module `engine_v1`.

In [5]:
# Create inverted index
# It takes no argument: presumably it relies on the data stored by create_vocabulary() — TODO confirm in engine_v1
inverted_index = engine_v1.create_inverted_index()

### [2.1.2] Execute the query

In [6]:
## To type the query by hand, uncomment the next two lines
# print("Input the query: ")
# query = input()

# Example query
query = "advanced knowledge"

# Conjunctive search performed by the first version of the engine
df_result = engine_v1.search(query)

# Only the first 10 results (at most) are displayed, restricted to the most informative columns
df_result[['courseName','universityName','description','url']].head(10)

Unnamed: 0_level_0,courseName,universityName,description,url
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,Accounting and Finance - MSc,University of Leeds,Businesses and governments rely on sound finan...,https://www.findamasters.com/masters-degrees/c...
6,Addictions MSc,King’s College London,Join us for an online session for prospective ...,https://www.findamasters.com/masters-degrees/c...
12,Analytical Toxicology MSc,King’s College London,The Analytical Toxicology MSc is a unique stud...,https://www.findamasters.com/masters-degrees/c...
39,Biomaterials and Tissue Engineering MSc,University College London,Register your interest in graduate study at UC...,https://www.findamasters.com/masters-degrees/c...
40,Biomedical and Analytical Science MSc,University of Huddersfield,The Biomedical and Analytical Science MSc cour...,https://www.findamasters.com/masters-degrees/c...
62,Biomedical Engineering with Data Analytics MSc,"City, University of London",Key informationThis course provides a comprehe...,https://www.findamasters.com/masters-degrees/c...
84,Biomedical Science (Medical Immunology) MSc,Middlesex University,This master's programme will allow you to deve...,https://www.findamasters.com/masters-degrees/c...


## [2.2] Conjunctive query & Ranking score

### [2.2.1] Inverted index

Here we use the functions of our module `engine_v2`. In this module we 'recycle' some functions from the previous version `engine_v1`.
Here we report a brief description; read the comments in the `engine_v2.py` module for further details.

> The module `engine_v2` has the following methods: 
>
> * `create_inverted_index(df: pd.DataFrame) -> dict`
>
>   (**This is the first function that must be run in order to initialize the Search Engine v2**. It saves the dataframe and computes the inverted tf-idf index for each word in the vocabulary. Returns a dictionary where the key is the `term_id` and the value is a list of tuples. The dictionary is saved in the file `inverted_index_tf_idf.json`)
>
> * `compute_if_idf() -> pd.DataFrame`
>
>   (Computes the tf-idf score for each word in each document (i.e. course) where tf-idf = tf * idf = term frequency * inverse document frequency. We use TfidfVectorizer from sklearn to create the tf-idf matrix. This matrix will be saved in the file `courses_matrix_tf_idf.csv` to access it later. This function also computes the norm of each document and stores them in the file `norms.csv`. Returns the corresponding tf-idf dataframe, from which the index saved in `inverted_index_tf_idf.json` is built)
>
> * `get_tf_idf() -> pd.DataFrame`
>
>   (This function reads the `courses_matrix_tf_idf.csv` and returns the tf-idf dataframe)
>
> * `get_norms() -> pd.DataFrame`
>
>   (Retrieves the precomputed l2 norms of the documents from the file `norms.csv`)
>
> * `search(query: str, k: int) -> list`
>
>   (Given a query, this function outputs the `k` documents most similar to the query according to the cosine similarity score. It is applied to the `description` column of the dataset (among the documents that contain all the words in the preprocessed query))
> 

In [7]:
# Initialize the Search Engine v2 by building the tf-idf inverted index from the preprocessed dataframe
# (according to the notes above, the index is also saved in the file `inverted_index_tf_idf.json`)
inverted_index_tf_idf = engine_v2.create_inverted_index(df)



### [2.2.2] Execute the query

In [8]:
# ## Uncomment the following two lines for manually inputting the query
# # print("Input the query: ")
# # query = input()

# An example of query
query = "advanced knowledge"

heap_result = engine_v2.search(query, k = 10)

# Every element of the heap is a pair (key, row): row holds the course index, the values of
# the columns of df and the similarity score, in this order
result_columns = ['index'] + df.columns.tolist() + ['Similarity']

# 'Popping' all the elements of the heap (this empties heap_result).
# The key is not used here: the similarity to display is already stored inside the row.
# NOTE(review): the key is presumably the negated similarity, so that Python's min heap
# pops the most similar course first — confirm in engine_v2.search
result_rows = []
while heap_result:
    _, elem = heapq.heappop(heap_result)
    result_rows.append(elem)

# The DataFrame is built once from all the rows. Passing the column names explicitly
# means that a query without results gives an empty table instead of a KeyError below.
df_results = pd.DataFrame(data=result_rows, columns=result_columns).set_index('index')

df_results[['courseName','universityName','description','url','Similarity']]


Unnamed: 0_level_0,courseName,universityName,description,url,Similarity
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
7,Advanced Chemical Engineering - MSc,University of Leeds,The Advanced Chemical Engineering MSc at Leeds...,https://www.findamasters.com/masters-degrees/c...,0.220551
4,Accounting and Finance - MSc,University of Leeds,Businesses and governments rely on sound finan...,https://www.findamasters.com/masters-degrees/c...,0.21442
84,Biomedical Science (Medical Immunology) MSc,Middlesex University,This master's programme will allow you to deve...,https://www.findamasters.com/masters-degrees/c...,0.195067
12,Analytical Toxicology MSc,King’s College London,The Analytical Toxicology MSc is a unique stud...,https://www.findamasters.com/masters-degrees/c...,0.154413
39,Biomaterials and Tissue Engineering MSc,University College London,Register your interest in graduate study at UC...,https://www.findamasters.com/masters-degrees/c...,0.140614
62,Biomedical Engineering with Data Analytics MSc,"City, University of London",Key informationThis course provides a comprehe...,https://www.findamasters.com/masters-degrees/c...,0.139455
23,Global Health MSc,King’s College London,﻿The Global Health MSc is a pioneering set of ...,https://www.findamasters.com/masters-degrees/c...,0.113282
40,Biomedical and Analytical Science MSc,University of Huddersfield,The Biomedical and Analytical Science MSc cour...,https://www.findamasters.com/masters-degrees/c...,0.112441
48,Biomedical Engineering (M.Sc. / P.Grad.Dip.),Trinity College Dublin,The MSc in Biomedical Engineering provides an ...,https://www.findamasters.com/masters-degrees/c...,0.110858
44,Biomedical Engineering,Vrije Universiteit Brussel,"About the programmeIn this programme, you’ll a...",https://www.findamasters.com/masters-degrees/c...,0.101448
