In [1]:
import numpy as np
import pandas as pd

# Data Preprocessing

In [2]:
# prepare internal FAQ dataset
df_faq = pd.read_excel('faq-data/df_undergrad_faq.xlsx')

df_temp = pd.read_csv('faq-data/df_6901_faq.csv', index_col=0)
df_temp['Type'] = 'BSc 6901'
df_faq = df_faq.append(df_temp)

df_temp = pd.read_excel('faq-data/df_basc_faq.xlsx')
df_temp['Type'] = 'BASc'
df_faq = df_faq.append(df_temp)

df_temp = pd.read_csv('faq-data/df_aao_faq.csv', index_col=0)
df_temp['Type'] = 'AAO'
df_faq = df_faq.append(df_temp)

df_faq = df_faq.reset_index(drop=True)

# include the type of the question into both 'Question' and 'Answer'
df_faq['Question'] = '('+ df_faq['Type'] + ') ' + df_faq['Question']
df_faq['Answer'] = '('+ df_faq['Type'] + ') ' + df_faq['Answer']

In [3]:
# save preproecessed faq dataset
df_faq.to_csv('faq-data/df_faq.csv')

In [4]:
df_faq.sample(10)

Unnamed: 0,Question,Answer,Type
45,(BSc 6901) If I do the Major-Minor instead of ...,(BSc 6901) For those who intend to develop a c...,BSc 6901
83,(AAO) Can I change my FAA?,"(AAO) Under normal circumstances, FAA cannot b...",AAO
63,(BSc 6901) Who is eligible to apply for the sc...,(BSc 6901) Candidates who hold a recognised fu...,BSc 6901
76,(BASc) Can I participate in the exchange progr...,(BASc) Though it is not a compulsory graduatio...,BASc
81,(AAO) How can I find my Faculty Academic Advis...,(AAO) You can check who your FAA is via the we...,AAO
38,"(BSc 6901) If that situation arises, would tha...",(BSc 6901) Absolutely not. The Faculty will ad...,BSc 6901
57,"(BSc 6901) If I get 5 or above in DSE English,...","(BSc 6901) Yes, the first English course will ...",BSc 6901
51,(BSc 6901) Do I need to put the Programme in B...,(BSc 6901) Over 98% of students admitted in th...,BSc 6901
87,(AAO) I am interested in the programmes of HKU...,"(AAO) For matters related to admission, please...",AAO
167,(AAO) What fees do I have to pay?,(AAO) You can find detailed information on “Un...,AAO


# Helper Functions

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk import word_tokenize
from nltk.corpus import stopwords
import string

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
class Sentence:
    def __init__(self, sentence):
        self.sentence = sentence

    # METHOD: preprocess the sentence
    # return: list of tokenized words
    def preprocess(self, with_stopwords=False):
        text = self.sentence

        # convert text to lower case
        text = text.lower()

        # tokenize text into list of words
        words = word_tokenize(text)

        # remove punctuation
        words = [word for word in words if word.isalpha()]

        if with_stopwords == False:
            # remove stopwords
            stop_words = set(stopwords.words('english'))
            words = [word for word in words if not word in stop_words]

        return words
        
    # METHOD: get word embeddings based on specific model
    # return: matrix word embeddings
    def get_vector(self, model, with_stopwords=False):
        return np.sum(np.array([model[i] for i in self.preprocess(with_stopwords=with_stopwords)]), axis=0)

In [4]:
from scipy import spatial

# HELPER FUNCTION: get similarities of a specific query against all questions in our FAQ database
# return: dataframe with new column 'Similarity'
def get_similarities(query, df, model, with_stopwords=False):
    similarities = []
    query = Sentence(query)
    for q in df['Question']:
        s = Sentence(q)
        try:
            similarity = 1 - spatial.distance.cosine(
                query.get_vector(model, with_stopwords), 
                s.get_vector(model, with_stopwords)
            )
            similarities.append(similarity)
        except:
            similarities.append(0)
            continue
    df['Similarity'] = similarities
    
    df = df.sort_values(by='Similarity', ascending=False)

    return df

---
# Models

In [9]:
# define any query
QUERY = 'I wanna know what the application deadline is.'

## Model 1: Word2Vec

In [9]:
import gensim.downloader as api

# define word2vec model
model_w2v = api.load('word2vec-google-news-300')

**WITHOUT stopwords**

In [10]:
# get cosine similarity scores for every entry in our FAQ database
# WITHOUT stopwords
get_similarities(QUERY, df_faq, model_w2v, with_stopwords=False).to_csv('model-sample-results/w2v-without-stopwords.csv')

In [30]:
# try another query
get_similarities(
    'I am an international student. I wanna know how much the programme costs.', 
    df_faq, model_w2v, with_stopwords=False)

Unnamed: 0,Question,Answer,Type,Similarity
0,(HKDSE) How do I apply to HKU through JUPAS sc...,(HKDSE) We welcome your application to HKU thr...,HKDSE,0
120,(AAO) How do I know if I have fulfilled all th...,(AAO) It is your responsibility to check the c...,AAO,0
112,(AAO) Is it possible to double major or double...,"(AAO) Yes, as long as you have enough credits ...",AAO,0
113,(AAO) How many majors/ minors I can take to th...,(AAO) There are no rigid regulations in this r...,AAO,0
114,(AAO) Can I change my major/ minor later? How ...,(AAO) It depends on the setup which may vary f...,AAO,0
...,...,...,...,...
60,(BSc 6901) Other universities are also adoptin...,(BSc 6901) HKU Science is the first university...,BSc 6901,0
61,(BSc 6901) What is Young Scientist Scheme (YSS)?,(BSc 6901) Enrolment in our flagship Summer Re...,BSc 6901,0
62,(BSc 6901) Who is entitled to join YSS?,(BSc 6901) have a second chance to enrol in YS...,BSc 6901,0
63,(BSc 6901) Who is eligible to apply for the sc...,(BSc 6901) Candidates who hold a recognised fu...,BSc 6901,0


**WITH stopwords**

In [12]:
# get cosine similarity scores for every entry in our FAQ database
# WITH stopwords
get_similarities(QUERY, df_faq, model_w2v, with_stopwords=True).to_csv('model-sample-results/w2v-with-stopwords.csv')

In [13]:
# save model
model_w2v.save('models/w2v.model')

## Model 2: GloVe

In [14]:
import gensim.downloader as api

# define GloVe model
model_glove = api.load('glove-wiki-gigaword-300')

**WITHOUT stopwords**

In [15]:
# get cosine similarity scores for every entry in our FAQ database
# WITHOUT stopwords
get_similarities(QUERY, df_faq, model_glove, with_stopwords=False).to_csv('model-sample-results/glove-without-stopwords.csv')

In [46]:
# try another query
get_similarities(
    'I am an HKDSE student. I wanna know how much the programme costs.', 
    df_faq, model_glove, with_stopwords=False)

Unnamed: 0,Question,Answer,Type,Similarity
0,(HKDSE) How do I apply to HKU through JUPAS sc...,(HKDSE) We welcome your application to HKU thr...,HKDSE,0
120,(AAO) How do I know if I have fulfilled all th...,(AAO) It is your responsibility to check the c...,AAO,0
112,(AAO) Is it possible to double major or double...,"(AAO) Yes, as long as you have enough credits ...",AAO,0
113,(AAO) How many majors/ minors I can take to th...,(AAO) There are no rigid regulations in this r...,AAO,0
114,(AAO) Can I change my major/ minor later? How ...,(AAO) It depends on the setup which may vary f...,AAO,0
...,...,...,...,...
60,(BSc 6901) Other universities are also adoptin...,(BSc 6901) HKU Science is the first university...,BSc 6901,0
61,(BSc 6901) What is Young Scientist Scheme (YSS)?,(BSc 6901) Enrolment in our flagship Summer Re...,BSc 6901,0
62,(BSc 6901) Who is entitled to join YSS?,(BSc 6901) have a second chance to enrol in YS...,BSc 6901,0
63,(BSc 6901) Who is eligible to apply for the sc...,(BSc 6901) Candidates who hold a recognised fu...,BSc 6901,0


**WITH stopwords**

In [17]:
# get cosine similarity scores for every entry in our FAQ database
# WITH stopwords
get_similarities(QUERY, df_faq, model_glove, with_stopwords=True).to_csv('model-sample-results/glove-with-stopwords.csv')

In [18]:
# save model
model_glove.save('models/glove.model')

## Model 3: FastText

In [19]:
import gensim.downloader as api

# define FastText model
model_fasttext = api.load('fasttext-wiki-news-subwords-300')

In [20]:
# get cosine similarity scores for every entry in our FAQ database
# WITHOUT stopwords
get_similarities(QUERY, df_faq, model_fasttext, with_stopwords=False).to_csv('model-sample-results/fasttext-without-stopwords.csv')

In [52]:
# try another query
get_similarities(
    'I am a HKDSE student. I wanna know how much the programme costs.', df_faq, model_fasttext, with_stopwords=False)

Unnamed: 0,Question,Answer,Type,Similarity
0,(HKDSE) How do I apply to HKU through JUPAS sc...,(HKDSE) We welcome your application to HKU thr...,HKDSE,0
120,(AAO) How do I know if I have fulfilled all th...,(AAO) It is your responsibility to check the c...,AAO,0
112,(AAO) Is it possible to double major or double...,"(AAO) Yes, as long as you have enough credits ...",AAO,0
113,(AAO) How many majors/ minors I can take to th...,(AAO) There are no rigid regulations in this r...,AAO,0
114,(AAO) Can I change my major/ minor later? How ...,(AAO) It depends on the setup which may vary f...,AAO,0
...,...,...,...,...
60,(BSc 6901) Other universities are also adoptin...,(BSc 6901) HKU Science is the first university...,BSc 6901,0
61,(BSc 6901) What is Young Scientist Scheme (YSS)?,(BSc 6901) Enrolment in our flagship Summer Re...,BSc 6901,0
62,(BSc 6901) Who is entitled to join YSS?,(BSc 6901) have a second chance to enrol in YS...,BSc 6901,0
63,(BSc 6901) Who is eligible to apply for the sc...,(BSc 6901) Candidates who hold a recognised fu...,BSc 6901,0


## Model 4: Fine-tuned GloVe (Common Crawl)

In [10]:
df_faq = pd.read_csv('faq-data/df_faq.csv', index_col=0)

In [11]:
df_faq

Unnamed: 0,Question,Answer,Type
0,(HKDSE) How do I apply to HKU through JUPAS sc...,(HKDSE) We welcome your application to HKU thr...,HKDSE
1,(HKDSE) What are the common mistakes as a JUPA...,(HKDSE) Students should not forget that in add...,HKDSE
2,(HKDSE) How can I apply to HKU as a HKDSE repe...,(HKDSE) All students who apply to HKU on the b...,HKDSE
3,(HKDSE) What are the minimum university entran...,(HKDSE) To have your application considered fo...,HKDSE
4,(HKDSE) How is the admission score calculated?,(HKDSE) Starting from the academic year 2020/2...,HKDSE
...,...,...,...
170,(AAO) I still have other questions regarding t...,(AAO) You might try to look at the FAQ compile...,AAO
171,(AAO) Are there scholarships that accept appli...,(AAO) Please visit the website of the Scholars...,AAO
172,(AAO) When and how do I apply for leave of abs...,(AAO) You need to apply for leave of absence i...,AAO
173,(AAO) What is plagiarism and what happens if I...,"(AAO) To put it simply, plagiarism is defined ...",AAO


In [12]:
questions = df_faq['Question']
questions

0      (HKDSE) How do I apply to HKU through JUPAS sc...
1      (HKDSE) What are the common mistakes as a JUPA...
2      (HKDSE) How can I apply to HKU as a HKDSE repe...
3      (HKDSE) What are the minimum university entran...
4         (HKDSE) How is the admission score calculated?
                             ...                        
170    (AAO) I still have other questions regarding t...
171    (AAO) Are there scholarships that accept appli...
172    (AAO) When and how do I apply for leave of abs...
173    (AAO) What is plagiarism and what happens if I...
174    (AAO) I have a question that isn’t answered he...
Name: Question, Length: 175, dtype: object

In [5]:
from mittens import GloVe, Mittens
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
import csv

In [7]:
# define function to convert the glove file name to dictionary type
def glove2dict(glove_filename):
    with open(glove_filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        embed = {line[0]: np.array(list(map(float, line[1:])))
                for line in reader}
    return embed

In [15]:
# convert the txt file of the pretrained glove model to dictionary type
glove_path = 'glove-data/glove.840B.300d.txt'
pre_glove = glove2dict(glove_path)

In [17]:
len(pre_glove)

2196016

In [14]:
def preprocess(text):
    text = text.lower() # convert all letters to lowercase
    words = word_tokenize(text) # tokanize each sentence
    words = [word for word in words if word.isalpha()]

    # handle stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if not word in stop_words]

    return words

In [38]:
# preprocess the questions list
questions_tokenized = questions.apply(preprocess).tolist()

# concatenate all the tokens of the questions_tokenized dataset
questions_words = [j for i in questions_tokenized for j in i]

# identify any out-of-vocabulary words compared to the pretrained vocab
oov = [word for word in questions_words if word not in pre_glove.keys()]

In [44]:
corp_vocab = list(set(oov))
questions_doc = [' '.join(questions_words)]

In [45]:
corp_vocab

['sgpa', 'hkdse', 'jupas', 'ygpa', 'ggpa', 'admssions']

In [48]:
# get the cooccurence matrix
cv = CountVectorizer(ngram_range=(1,1), vocabulary=corp_vocab)
X = cv.fit_transform(questions_doc)
Xc = (X.T * X)
Xc.setdiag(0)
coocc_ar = Xc.toarray()

In [53]:
# fine tune the glove model using mittens
model_mittens = Mittens(n=300, max_iter=50)

In [54]:
# get the fine-tuned embeddings
finetuned_embeddings = model_mittens.fit(
    coocc_ar,
    vocab=corp_vocab,
    initial_embedding_dict=pre_glove
)

Iteration 50: error 0.0052

In [64]:
finetuned_embeddings

array([[ 0.21536306, -0.23348389, -0.00108256, ...,  0.08835838,
        -0.0281024 , -0.0755351 ],
       [ 0.14106864, -0.2549903 ,  0.02145601, ...,  0.19411995,
         0.16020646, -0.31885206],
       [ 0.16597015, -0.15611448, -0.01689495, ...,  0.25187659,
         0.24686613, -0.31104453],
       [-0.0233133 , -0.20189346, -0.05745978, ...,  0.05373154,
         0.28167596, -0.07932853],
       [ 0.01277053, -0.05098631,  0.02492939, ..., -0.09738831,
         0.26762349, -0.2695603 ],
       [ 0.00584429,  0.10794007,  0.00472186, ...,  0.13523033,
         0.05557022, -0.36607522]])

In [67]:
# get the new glove model
model_glove_oov = dict(zip(corp_vocab, finetuned_embeddings))

In [73]:
# save the oov glove model
import pickle
f = open('models/glove_cc_oov.pkl', 'wb')
pickle.dump(model_glove_oov, f)
f.close()

In [69]:
# concatenate pretrained glove and oov glove dictionaries together to form the final glove dictionary
model_glove_finetuned = {}
model_glove_finetuned.update(pre_glove)
model_glove_finetuned.update(model_glove_oov)

In [70]:
print(f'Length of the pretrained GloVe dictionary: {len(pre_glove)}')
print(f'Length of the final fine-tuned GloVe dictionary: {len(model_glove_finetuned)}')

Length of the pretrained GloVe dictionary: 2196016
Length of the final fine-tuned GloVe dictionary: 2196022


In [84]:
# try another query
get_similarities(
    'I am an HKDSE student. I wanna know how much the programme costs.', 
    df_faq, model_glove_finetuned, with_stopwords=False)

Unnamed: 0,Question,Answer,Type,Similarity
170,(AAO) I still have other questions regarding t...,(AAO) You might try to look at the FAQ compile...,AAO,0.776316
169,(AAO) If I need to defer my graduation for one...,(AAO) If students have paid full composition f...,AAO,0.772135
99,(AAO) How many courses can I take for the summ...,"(AAO) Under normal circumstances, you can take...",AAO,0.768008
162,(AAO) I want to exchange but my CGPA is not ve...,(AAO) IAO offers a range of study abroad progr...,AAO,0.767756
157,(AAO) If I go on exchange studies in the secon...,(AAO) Since enrollment record of all courses (...,AAO,0.762995
...,...,...,...,...
69,(BASc) What are the differences between our BA...,(BASc) While other BASc programmes require stu...,BASc,0.269092
62,(BSc 6901) Who is entitled to join YSS?,(BSc 6901) have a second chance to enrol in YS...,BSc 6901,0.250393
2,(HKDSE) How can I apply to HKU as a HKDSE repe...,(HKDSE) All students who apply to HKU on the b...,HKDSE,0.227826
150,"(AAO) What are the differences between SGPA, Y...",(AAO) GPA is the abbreviation of Grade Point A...,AAO,0.214337


## Model 5: Fine-tuned GloVe (Wiki + Gigaword)

In [8]:
# convert the txt file of the pretrained glove model to dictionary type
glove_path = 'glove-data/glove.6B.300d.txt'
pre_glove_6b = glove2dict(glove_path)

In [15]:
# preprocess the questions list
questions_tokenized = questions.apply(preprocess).tolist()

# concatenate all the tokens of the questions_tokenized dataset
questions_words = [j for i in questions_tokenized for j in i]

# identify any out-of-vocabulary words compared to the pretrained vocab
oov = [word for word in questions_words if word not in pre_glove_6b.keys()]

In [16]:
corp_vocab = list(set(oov))
questions_doc = [' '.join(questions_words)]

In [17]:
corp_vocab

['ggpa', 'hkdse', 'admssions', 'ygpa', 'yss', 'sgpa', 'fintech']

In [18]:
# get the cooccurence matrix
cv = CountVectorizer(ngram_range=(1,1), vocabulary=corp_vocab)
X = cv.fit_transform(questions_doc)
Xc = (X.T * X)
Xc.setdiag(0)
coocc_ar = Xc.toarray()

In [23]:
# fine tune the glove model using mittens
model_mittens = Mittens(n=300, max_iter=100)

In [24]:
# get the fine-tuned embeddings
finetuned_embeddings = model_mittens.fit(
    coocc_ar,
    vocab=corp_vocab,
    initial_embedding_dict=pre_glove_6b
)

Iteration 100: error 0.0001

In [25]:
finetuned_embeddings

array([[ 0.02631426, -0.15538317,  0.08178973, ..., -0.01127533,
        -0.03334514,  0.03103017],
       [ 0.1045783 , -0.05995505,  0.09140404, ...,  0.28087561,
        -0.10251079,  0.21209909],
       [ 0.15922981, -0.10345597, -0.11062329, ..., -0.06195216,
        -0.0042606 ,  0.02028969],
       ...,
       [-0.07172647, -0.15133784, -0.10242488, ...,  0.18892125,
         0.03875714,  0.00315756],
       [ 0.12751028,  0.05125323,  0.14642109, ...,  0.05362132,
        -0.07406009,  0.05368258],
       [-0.0211357 ,  0.02340683,  0.13315088, ...,  0.07894328,
         0.03253981,  0.03545454]])

In [26]:
# get the new glove model
model_glove_oov = dict(zip(corp_vocab, finetuned_embeddings))

In [102]:
# save the oov glove model
import pickle
f = open('models/glove_wiki_oov.pkl', 'wb')
pickle.dump(model_glove_oov, f)
f.close()

In [28]:
# concatenate pretrained glove and oov glove dictionaries together to form the final glove dictionary
model_glove_finetuned = {}
model_glove_finetuned.update(pre_glove_6b)
model_glove_finetuned.update(model_glove_oov)

In [31]:
print(f'Length of the pretrained GloVe dictionary: {len(pre_glove_6b)}')
print(f'Length of the final fine-tuned GloVe dictionary: {len(model_glove_finetuned)}')

Length of the pretrained GloVe dictionary: 400000
Length of the final fine-tuned GloVe dictionary: 400007


In [34]:
import pickle
f = open('models/glove_wiki_finetuned.pkl', 'wb')
pickle.dump(model_glove_finetuned, f)
f.close()

In [33]:
# try another query
get_similarities(
    'I wanna know what the application deadline is.', 
    df_faq, model_glove_finetuned, with_stopwords=False)

Unnamed: 0,Question,Answer,Type,Similarity
13,(HKDSE (Non-local)) What is the University’s a...,(HKDSE (Non-local)) You may wish to go through...,HKDSE (Non-local),0.608544
18,(International) What is the University’s appli...,(International) You may wish to go through the...,International,0.590474
21,(International) Where should I submit my appli...,(International) All application to the Univers...,International,0.588276
15,(HKDSE (Non-local)) Where should I submit my a...,(HKDSE (Non-local)) All application to the Uni...,HKDSE (Non-local),0.580242
140,(AAO) What should I do if I know I will not be...,(AAO) Special arrangement will be made only un...,AAO,0.558955
...,...,...,...,...
56,(BSc 6901) What are the admissions scores for ...,(BSc 6901) HKDSE: 22 - 36.5 (Best 5 subjects);...,BSc 6901,0.110630
2,(HKDSE) How can I apply to HKU as a HKDSE repe...,(HKDSE) All students who apply to HKU on the b...,HKDSE,0.109337
70,(BASc) What are majors and minors and what is ...,(BASc) When we say we “declare a major” in a c...,BASc,0.107719
69,(BASc) What are the differences between our BA...,(BASc) While other BASc programmes require stu...,BASc,0.100741


In [114]:
# # try load the pkl model
# import pickle
# with open('models/glove_wiki_finetuned.pkl', 'rb') as f:
#     test = pickle.load(f)