In [2]:
import numpy as np
import pandas as pd

# Data Preprocessing

In [23]:
# Prepare the internal FAQ dataset by stacking the individual FAQ files.
# The undergraduate file is used as-is (its rows already carry a 'Type'
# value); every other source gets its 'Type' label assigned here.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df_faq = pd.read_excel('faq-data/df_undergrad_faq.xlsx')

df_temp = pd.read_csv('faq-data/df_6901_faq.csv', index_col=0)
df_temp['Type'] = 'BSc 6901'
df_faq = pd.concat([df_faq, df_temp])

df_temp = pd.read_excel('faq-data/df_basc_faq.xlsx')
df_temp['Type'] = 'BASc'
df_faq = pd.concat([df_faq, df_temp])

df_temp = pd.read_csv('faq-data/df_aao_faq.csv', index_col=0)
df_temp['Type'] = 'AAO'
df_faq = pd.concat([df_faq, df_temp])

# reset_index returns a new frame, so the result must be assigned back;
# otherwise df_faq keeps the duplicated per-file row labels.
df_faq = df_faq.reset_index(drop=True)
df_faq

Unnamed: 0,Question,Answer,Type
0,How do I apply to HKU through JUPAS scheme?,We welcome your application to HKU through the...,HKDSE
1,What are the common mistakes as a JUPAS applic...,Students should not forget that in addition to...,HKDSE
2,How can I apply to HKU as a HKDSE repeater?,All students who apply to HKU on the basis of ...,HKDSE
3,What are the minimum university entrance requi...,To have your application considered for admiss...,HKDSE
4,How is the admission score calculated?,"Starting from the academic year 2020/2021, HKU...",HKDSE
...,...,...,...
170,I still have other questions regarding the set...,You might try to look at the FAQ compiled by t...,AAO
171,Are there scholarships that accept application...,Please visit the website of the Scholarships O...,AAO
172,When and how do I apply for leave of absence?,You need to apply for leave of absence if you ...,AAO
173,What is plagiarism and what happens if I am fo...,"To put it simply, plagiarism is defined as the...",AAO


In [28]:
# Save the preprocessed FAQ dataset (the combined df_faq frame) to a single CSV file.
df_faq.to_csv('faq-data/df_faq.csv')

# Helper Functions

In [31]:
import nltk
# Download the NLTK resources needed by the Sentence helper below:
# 'punkt' (presumably the tokenizer data behind word_tokenize) and the
# 'stopwords' corpus read via stopwords.words('english').
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' - confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

from nltk import word_tokenize
from nltk.corpus import stopwords
# NOTE(review): `string` is not referenced in the cells visible here.
import string

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
class Sentence:
    """Wrap a raw text string so it can be tokenized and turned into one embedding vector."""

    def __init__(self, sentence):
        # raw, unprocessed text
        self.sentence = sentence

    def preprocess(self, with_stopwords=False):
        """Tokenize the sentence into a list of lower-case alphabetic words.

        Parameters
        ----------
        with_stopwords : bool, default False
            When falsy, English stopwords (NLTK list) are removed from the
            result; when truthy they are kept.

        Returns
        -------
        list of str
            The remaining tokens, in their original order.
        """
        # convert text to lower case, then tokenize it into a list of words
        words = word_tokenize(self.sentence.lower())

        # keep purely alphabetic tokens only; this removes punctuation and
        # also any token containing digits or other non-letter characters
        words = [word for word in words if word.isalpha()]

        if not with_stopwords:
            # remove stopwords
            stop_words = set(stopwords.words('english'))
            words = [word for word in words if word not in stop_words]

        return words

    def get_vector(self, model, with_stopwords=False):
        """Embed the sentence as the element-wise sum of its word vectors.

        Parameters
        ----------
        model : mapping-like
            Object indexed by word (``model[word]``) that returns the word's
            embedding vector and raises ``KeyError`` for unknown words.
        with_stopwords : bool, default False
            Forwarded to :meth:`preprocess`.

        Returns
        -------
        numpy.ndarray or numpy scalar
            Sum of the embeddings of all tokens known to ``model``. Tokens
            missing from the model are skipped instead of failing the whole
            sentence. If no token has an embedding the result is the scalar
            ``0.0`` (sum over an empty array).
        """
        vectors = []
        for word in self.preprocess(with_stopwords=with_stopwords):
            try:
                vectors.append(model[word])
            except KeyError:
                # out-of-vocabulary word: ignore it rather than discarding
                # every other word of the sentence
                continue
        return np.sum(np.array(vectors), axis=0)

In [11]:
from scipy import spatial

def _safe_sentence_vector(text, model, with_stopwords):
    """Return a usable 1-D embedding for ``text``, or None when none can be built."""
    try:
        vector = np.asarray(Sentence(text).get_vector(model, with_stopwords))
    except (AttributeError, KeyError, TypeError, ValueError):
        # e.g. a non-string cell, a word unknown to the model, or word
        # vectors that cannot be combined
        return None
    # A scalar result (no embeddable tokens) or an all-zero vector has no
    # direction, so cosine similarity is undefined for it.
    if vector.ndim != 1 or not np.any(vector):
        return None
    return vector


def get_similarities(query, df, model, with_stopwords=False):
    """Rank every FAQ question in ``df`` by cosine similarity to ``query``.

    Parameters
    ----------
    query : str
        Free-text query to compare against the questions.
    df : pandas.DataFrame
        Frame with a 'Question' column. It is not modified.
    model : mapping-like
        Word-embedding model passed on to ``Sentence.get_vector``.
    with_stopwords : bool, default False
        Whether stopwords are kept when embedding the texts.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an added 'Similarity' column, sorted by that
        column in descending order. Rows whose question (or the query
        itself) cannot be embedded get a similarity of 0.
    """
    # The query embedding does not depend on the row, so compute it once.
    query_vector = _safe_sentence_vector(query, model, with_stopwords)

    similarities = []
    for question in df['Question']:
        question_vector = _safe_sentence_vector(question, model, with_stopwords)
        if query_vector is None or question_vector is None:
            similarities.append(0)
            continue
        similarities.append(1 - spatial.distance.cosine(query_vector, question_vector))

    # assign() returns a new frame, leaving the caller's DataFrame untouched.
    ranked = df.assign(Similarity=similarities)
    return ranked.sort_values(by='Similarity', ascending=False)

---
# Models

In [25]:
# Free-text query to rank the FAQ questions against; change it to try any other question.
# It is passed to get_similarities() in the model cells below.
QUERY = 'I want to apply for a scholarship.'

## Model 1: GloVe

In [10]:
import gensim.downloader as api

# Define the GloVe model: load the 'glove-wiki-gigaword-300' vectors through
# gensim's downloader API. The model is later indexed by word to look up embeddings.
model_glove = api.load('glove-wiki-gigaword-300')

**WITHOUT stopwords**

In [29]:
# Get cosine similarity scores for every entry in our FAQ database,
# WITHOUT stopwords (they are removed before embedding), and write the
# ranked result to CSV.
get_similarities(QUERY, df_faq, model_glove, with_stopwords=False).to_csv('model-sample-results/glove-without-stopwords.csv')

**WITH stopwords**

In [30]:
# Get cosine similarity scores for every entry in our FAQ database,
# WITH stopwords (they are kept when embedding), and write the ranked
# result to CSV.
get_similarities(QUERY, df_faq, model_glove, with_stopwords=True).to_csv('model-sample-results/glove-with-stopwords.csv')