In [1]:
import numpy as np
import pandas as pd

# Data Preprocessing

In [2]:
# prepare internal FAQ dataset
df_faq = pd.read_excel('faq-data/df_undergrad_faq.xlsx')

df_temp = pd.read_csv('faq-data/df_6901_faq.csv', index_col=0)
df_temp['Type'] = 'BSc 6901'
df_faq = df_faq.append(df_temp)

df_temp = pd.read_excel('faq-data/df_basc_faq.xlsx')
df_temp['Type'] = 'BASc'
df_faq = df_faq.append(df_temp)

df_temp = pd.read_csv('faq-data/df_aao_faq.csv', index_col=0)
df_temp['Type'] = 'AAO'
df_faq = df_faq.append(df_temp)

df_faq = df_faq.reset_index(drop=True)

# include the type of the question into both 'Question' and 'Answer'
df_faq['Question'] = '('+ df_faq['Type'] + ') ' + df_faq['Question']
df_faq['Answer'] = '('+ df_faq['Type'] + ') ' + df_faq['Answer']

In [3]:
# save preproecessed faq dataset
df_faq.to_csv('faq-data/df_faq.csv')

In [4]:
df_faq.sample(10)

Unnamed: 0,Question,Answer,Type
15,(HKDSE (Non-local)) Where should I submit my a...,(HKDSE (Non-local)) All application to the Uni...,HKDSE (Non-local)
87,(AAO) I am interested in the programmes of HKU...,"(AAO) For matters related to admission, please...",AAO
54,(BSc 6901) What are the selection criteria for...,"(BSc 6901) Science elective subjects*, Mathema...",BSc 6901
95,(AAO) How and when should I select courses?,(AAO) You should select courses via SIS (Stude...,AAO
119,(AAO) Different majors/minors may have overlap...,(AAO) Double-counting of courses for fulfillin...,AAO
126,(AAO) Where can I check the approval status of...,(AAO) Under their SIS menu -> Enrollment -> En...,AAO
23,(International) What are the requirements to g...,(International) The University will consider y...,International
30,(BSc 6901) What are the special features in th...,(BSc 6901) There are several special features ...,BSc 6901
25,(Bachelor's Degree) Do I need Chinese to study...,(Bachelor's Degree) Chinese is not a must for ...,Bachelor's Degree
64,(BSc 6901) How many places are available?,(BSc 6901) 85 places for 2022.,BSc 6901


# Helper Functions

In [5]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk import word_tokenize
from nltk.corpus import stopwords
import string

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
class Sentence:
    def __init__(self, sentence):
        self.sentence = sentence

    # METHOD: preprocess the sentence
    # return: list of tokenized words
    def preprocess(self, with_stopwords=False):
        text = self.sentence

        # convert text to lower case
        text = text.lower()

        # tokenize text into list of words
        words = word_tokenize(text)

        # remove punctuation
        words = [word for word in words if word.isalpha()]

        if with_stopwords == False:
            # remove stopwords
            stop_words = set(stopwords.words('english'))
            words = [word for word in words if not word in stop_words]

        return words
        
    # METHOD: get word embeddings based on specific model
    # return: matrix word embeddings
    def get_vector(self, model, with_stopwords=False):
        return np.sum(np.array([model[i] for i in self.preprocess(with_stopwords=with_stopwords)]), axis=0)

In [7]:
from scipy import spatial

# HELPER FUNCTION: get similarities of a specific query against all questions in our FAQ database
# return: dataframe with new column 'Similarity'
def get_similarities(query, df, model, with_stopwords=False):
    similarities = []
    query = Sentence(query)
    for q in df['Question']:
        s = Sentence(q)
        try:
            similarity = 1 - spatial.distance.cosine(
                query.get_vector(model, with_stopwords), 
                s.get_vector(model, with_stopwords)
            )
            similarities.append(similarity)
        except:
            similarities.append(0)
            continue
    df['Similarity'] = similarities
    
    df = df.sort_values(by='Similarity', ascending=False)

    return df

---
# Models

In [24]:
# define any query
QUERY = 'I wanna know what the application deadline is.'

## Model 1: Word2Vec

In [9]:
import gensim.downloader as api

# define word2vec model
model_w2v = api.load('word2vec-google-news-300')

**WITHOUT stopwords**

In [10]:
# get cosine similarity scores for every entry in our FAQ database
# WITHOUT stopwords
get_similarities(QUERY, df_faq, model_w2v, with_stopwords=False).to_csv('model-sample-results/w2v-without-stopwords.csv')

In [25]:
# try another query
get_similarities(
    'I am an international student. I wanna know how much the programme costs.', 
    df_faq, model_w2v, with_stopwords=False)

Unnamed: 0,Question,Answer,Type,Similarity
0,(HKDSE) How do I apply to HKU through JUPAS sc...,(HKDSE) We welcome your application to HKU thr...,HKDSE,0
120,(AAO) How do I know if I have fulfilled all th...,(AAO) It is your responsibility to check the c...,AAO,0
112,(AAO) Is it possible to double major or double...,"(AAO) Yes, as long as you have enough credits ...",AAO,0
113,(AAO) How many majors/ minors I can take to th...,(AAO) There are no rigid regulations in this r...,AAO,0
114,(AAO) Can I change my major/ minor later? How ...,(AAO) It depends on the setup which may vary f...,AAO,0
...,...,...,...,...
60,(BSc 6901) Other universities are also adoptin...,(BSc 6901) HKU Science is the first university...,BSc 6901,0
61,(BSc 6901) What is Young Scientist Scheme (YSS)?,(BSc 6901) Enrolment in our flagship Summer Re...,BSc 6901,0
62,(BSc 6901) Who is entitled to join YSS?,(BSc 6901) have a second chance to enrol in YS...,BSc 6901,0
63,(BSc 6901) Who is eligible to apply for the sc...,(BSc 6901) Candidates who hold a recognised fu...,BSc 6901,0


**WITH stopwords**

In [11]:
# get cosine similarity scores for every entry in our FAQ database
# WITH stopwords
get_similarities(QUERY, df_faq, model_w2v, with_stopwords=True).to_csv('model-sample-results/w2v-with-stopwords.csv')

In [13]:
# save model
model_w2v.save('models/w2v.model')

## Model 2: GloVe

In [14]:
import gensim.downloader as api

# define GloVe model
model_glove = api.load('glove-wiki-gigaword-300')

**WITHOUT stopwords**

In [19]:
# get cosine similarity scores for every entry in our FAQ database
# WITHOUT stopwords
get_similarities(QUERY, df_faq, model_glove, with_stopwords=False).to_csv('model-sample-results/glove-without-stopwords.csv')

In [26]:
# try another query
get_similarities(
    'I am an international student. I wanna know how much the programme costs.', 
    df_faq, model_glove, with_stopwords=False)

Unnamed: 0,Question,Answer,Type,Similarity
28,(Hong Kong Sub-degrees (for senior year entry)...,(Hong Kong Sub-degrees (for senior year entry)...,Hong Kong Sub-degrees (for senior year entry),0.743657
23,(International) What are the requirements to g...,(International) The University will consider y...,International,0.741171
19,(International) Do I need Chinese to study at ...,(International) Chinese is not a must for the ...,International,0.729584
24,(International) How much is the tuition fee?,(International) You may refer to our Fees and ...,International,0.707611
79,(BASc) I understand that this degree opens me ...,"(BASc) What ""qualifies you"" for the job you wa...",BASc,0.707343
...,...,...,...,...
59,(BSc 6901) Will HKU Science accept combined HK...,"(BSc 6901) For 6901 BSc Programme, the best re...",BSc 6901,0.000000
54,(BSc 6901) What are the selection criteria for...,"(BSc 6901) Science elective subjects*, Mathema...",BSc 6901,0.000000
0,(HKDSE) How do I apply to HKU through JUPAS sc...,(HKDSE) We welcome your application to HKU thr...,HKDSE,0.000000
131,(AAO) How should I declare major/minor in SIS?,(AAO) Students can follow the steps stated in ...,AAO,-0.042905


**WITH stopwords**

In [20]:
# get cosine similarity scores for every entry in our FAQ database
# WITH stopwords
get_similarities(QUERY, df_faq, model_glove, with_stopwords=True).to_csv('model-sample-results/glove-with-stopwords.csv')

In [21]:
# save model
model_glove.save('models/glove.model')

## Model 3: FastText

In [27]:
import gensim.downloader as api

# define FastText model
model_fasttext = api.load('fasttext-wiki-news-subwords-300')



In [29]:
# get cosine similarity scores for every entry in our FAQ database
# WITHOUT stopwords
get_similarities(QUERY, df_faq, model_fasttext, with_stopwords=False).to_csv('model-sample-results/fasttext-without-stopwords.csv')

In [32]:
# try another query
get_similarities(
    'I am a HKDSE student. What is the tuition fee?', df_faq, model_fasttext, with_stopwords=False)

Unnamed: 0,Question,Answer,Type,Similarity
0,(HKDSE) How do I apply to HKU through JUPAS sc...,(HKDSE) We welcome your application to HKU thr...,HKDSE,0
120,(AAO) How do I know if I have fulfilled all th...,(AAO) It is your responsibility to check the c...,AAO,0
112,(AAO) Is it possible to double major or double...,"(AAO) Yes, as long as you have enough credits ...",AAO,0
113,(AAO) How many majors/ minors I can take to th...,(AAO) There are no rigid regulations in this r...,AAO,0
114,(AAO) Can I change my major/ minor later? How ...,(AAO) It depends on the setup which may vary f...,AAO,0
...,...,...,...,...
60,(BSc 6901) Other universities are also adoptin...,(BSc 6901) HKU Science is the first university...,BSc 6901,0
61,(BSc 6901) What is Young Scientist Scheme (YSS)?,(BSc 6901) Enrolment in our flagship Summer Re...,BSc 6901,0
62,(BSc 6901) Who is entitled to join YSS?,(BSc 6901) have a second chance to enrol in YS...,BSc 6901,0
63,(BSc 6901) Who is eligible to apply for the sc...,(BSc 6901) Candidates who hold a recognised fu...,BSc 6901,0


## Model 4: ELMo