In [1]:
import numpy as np
import pandas as pd

In [10]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk import word_tokenize
from nltk import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Kackie\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [15]:
from sklearn.model_selection import train_test_split

# Recommender Intent/Type Classification Training

## Dataset

In [11]:
df = pd.read_csv('data/recommender_intent_dataset.csv')

In [12]:
df

Unnamed: 0,Student Query,Recommender Intent
0,I am really worried about my future. What kind...,career
1,How do I know what career should I take?,career
2,"I don't know what I'm good at, so I'd really l...",career
3,Career or job recommendation?,career
4,"Concerning my future, I am really anxious. Whi...",career
...,...,...
115,I hate courses that teach theory. Can you help...,course
116,Have some trouble choosing the best courses fo...,course
117,I don't really know which classes I should tak...,course
118,I am quite confused about what classes are bes...,course


In [13]:
# show the distribution of all 3 classes
df.groupby(['Recommender Intent']).size()

Recommender Intent
career            40
course            40
subject_domain    40
dtype: int64

In [14]:
# create X and y datasets
X = df['Student Query']
y = df['Recommender Intent']

## Models

### Model: TF-IDF + Classifier

### Model: Word2Vec (mean of word vectors) + classifier

### Model: Fine-tuned GloVe (Wiki + Gigaword) + Classifier

In [46]:
import csv, pickle
from mittens import GloVe, Mittens
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

In [77]:
from sklearn.metrics import classification_report, accuracy_score

In [16]:
def preprocess(text):
    text = text.lower()
    words = word_tokenize(text)
    words = [word for word in words if word.isalpha()]

    # stopwords
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if not word in stop_words]

    return words

#### GloVe model training

In [17]:
# split into training and testing datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=7600)

In [21]:
# define function to convert the glove file name to dictionary type
def glove2dict(glove_filename):
    with open(glove_filename, encoding='utf-8') as f:
        reader = csv.reader(f, delimiter=' ', quoting=csv.QUOTE_NONE)
        embed = {line[0]: np.array(list(map(float, line[1:])))
                for line in reader}
    return embed

In [30]:
# convert the txt file of the pretrained glove model to dictionary type
glove_path = '../faq/q-Q_similarity/glove-data/glove.6B.300d.txt'
pre_glove_6b = glove2dict(glove_path)
len(pre_glove_6b)

400000

In [25]:
# preprocess the queries list
X_tokenized = X.apply(preprocess).tolist()

# concatenate all the tokens of the X_tokenized dataset
X_words = [j for i in X_tokenized for j in i]

# identify any out-of-vocabulary words compared to the pretrained vocab
oov = [word for word in X_words if word not in pre_glove_6b.keys()]

In [26]:
oov

['fintech',
 'gradurate',
 'graduration',
 'manangement',
 'recommendataions',
 'universiy',
 'recommendataion']

In [27]:
corp_vocab = list(set(oov))
X_doc = [' '.join(X_words)]

In [28]:
corp_vocab

['recommendataions',
 'gradurate',
 'manangement',
 'graduration',
 'fintech',
 'recommendataion',
 'universiy']

In [31]:
# get the cooccurence matrix
cv = CountVectorizer(ngram_range=(1,1), vocabulary=corp_vocab)
X_cv_transformed = cv.fit_transform(X_doc)
Xc = (X_cv_transformed.T * X_cv_transformed)
Xc.setdiag(0)
coocc_ar = Xc.toarray()

In [39]:
# fine tune the glove model using mittens
model_mittens = Mittens(n=300, max_iter=100)

In [40]:
# get the fine-tuned embeddings
finetuned_embeddings = model_mittens.fit(
    coocc_ar,
    vocab=corp_vocab,
    initial_embedding_dict=pre_glove_6b
)

Iteration 100: error 0.0003

In [41]:
finetuned_embeddings.shape

(7, 300)

In [42]:
# concatenate the pretrained glove and oov glove dictionaries together to form the final glove dictionary
model_glove_oov = dict(zip(corp_vocab, finetuned_embeddings))
model_glove_finetuned = {}
model_glove_finetuned.update(pre_glove_6b)
model_glove_finetuned.update(model_glove_oov)

In [44]:
len(model_glove_finetuned)

400007

In [45]:
# save the model
f = open('models/recommender_intent_classfiication_glove_wiki_finetuned.pkl', 'wb')
pickle.dump(model_glove_finetuned, f)
f.close()

In [69]:
# prepare word embeddings for all X data
def get_vector(text, model):
    return np.mean(np.array([model[i] for i in preprocess(text)]), axis=0).tolist()

X_train_glove_transformed = X_train.apply(lambda x: get_vector(x, model_glove_finetuned)).tolist()
X_test_glove_transformed = X_test.apply(lambda x: get_vector(x, model_glove_finetuned)).tolist()

#### Naive Bayes (baseline)

In [65]:
from sklearn.naive_bayes import MultinomialNB

In [73]:
# we need to use the min max scaler to convert all negative values
from sklearn.preprocessing import MinMaxScaler
X_train_glove_transformed_scaled = MinMaxScaler().fit_transform(X_train_glove_transformed)
X_test_glove_transformed_scaled = MinMaxScaler().fit_transform(X_test_glove_transformed)

In [75]:
clf_glove_mnb = MultinomialNB()
clf_glove_mnb.fit(X_train_glove_transformed_scaled, y_train)

MultinomialNB()

In [78]:
accuracy_score(y_test, clf_glove_mnb.predict(X_test_glove_transformed_scaled))

0.7777777777777778

In [79]:
print(
    classification_report(y_test, clf_glove_mnb.predict(X_test_glove_transformed_scaled))
)

                precision    recall  f1-score   support

        career       0.67      0.89      0.76         9
        course       0.78      0.93      0.85        15
subject_domain       1.00      0.50      0.67        12

      accuracy                           0.78        36
     macro avg       0.81      0.77      0.76        36
  weighted avg       0.82      0.78      0.77        36



#### Linear SVM

In [82]:
from sklearn.linear_model import SGDClassifier

In [222]:
clf_glove_sgd = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=7600,
)
clf_glove_sgd.fit(X_train_glove_transformed, y_train)

SGDClassifier(alpha=0.001, random_state=7600)

In [223]:
accuracy_score(y_test, clf_glove_sgd.predict(X_test_glove_transformed))

0.8888888888888888

In [224]:
print(classification_report(y_test, clf_glove_sgd.predict(X_test_glove_transformed)))

                precision    recall  f1-score   support

        career       0.82      1.00      0.90         9
        course       0.93      0.87      0.90        15
subject_domain       0.91      0.83      0.87        12

      accuracy                           0.89        36
     macro avg       0.89      0.90      0.89        36
  weighted avg       0.89      0.89      0.89        36



In [225]:
clf_glove_sgd.predict(X_test_glove_transformed)

array(['career', 'subject_domain', 'course', 'course', 'career',
       'subject_domain', 'career', 'subject_domain', 'career', 'course',
       'career', 'course', 'course', 'course', 'subject_domain',
       'subject_domain', 'course', 'subject_domain', 'subject_domain',
       'career', 'career', 'course', 'career', 'course', 'course',
       'subject_domain', 'career', 'subject_domain', 'course', 'course',
       'course', 'subject_domain', 'subject_domain', 'career', 'course',
       'career'], dtype='<U14')

In [226]:
df_glove_sgd_predictions = pd.DataFrame(
    X_test
)
df_glove_sgd_predictions['Ground Truth Recommender Intent'] = y_test
df_glove_sgd_predictions['Predicted Recommender Intent'] = clf_glove_sgd.predict(X_test_glove_transformed)

In [227]:
# get the cases where the predictions are wrong
df_glove_sgd_predictions.loc[df_glove_sgd_predictions['Ground Truth Recommender Intent'] != df_glove_sgd_predictions['Predicted Recommender Intent']]

Unnamed: 0,Student Query,Ground Truth Recommender Intent,Predicted Recommender Intent
89,I want to learn more about computer programmin...,subject_domain,course
92,I don't have any dream. Which subject should I...,subject_domain,career
58,I want to study math.,course,subject_domain
41,I want to work in investment bank after gradua...,course,career


In [228]:
# save this model as it performs the best
from joblib import dump
dump(clf_glove_sgd, 'models/clf_glove_sgd_recommender_intent_classification.joblib')

['models/clf_glove_sgd_recommender_intent_classification.joblib']

In [229]:
import joblib
temp = joblib.load('models/clf_glove_sgd_recommender_intent_classification.joblib')

In [231]:
temp.predict(X_test_glove_transformed)

array(['career', 'subject_domain', 'course', 'course', 'career',
       'subject_domain', 'career', 'subject_domain', 'career', 'course',
       'career', 'course', 'course', 'course', 'subject_domain',
       'subject_domain', 'course', 'subject_domain', 'subject_domain',
       'career', 'career', 'course', 'career', 'course', 'course',
       'subject_domain', 'career', 'subject_domain', 'course', 'course',
       'course', 'subject_domain', 'subject_domain', 'career', 'course',
       'career'], dtype='<U14')

#### Logistic Regression

In [88]:
from sklearn.linear_model import LogisticRegression

In [92]:
from os import cpu_count
clf_glove_lr = LogisticRegression(
    max_iter=200,
    random_state=7600,
    n_jobs=cpu_count()
)
clf_glove_lr.fit(X_train_glove_transformed, y_train)

LogisticRegression(max_iter=200, n_jobs=16, random_state=7600)

In [93]:
accuracy_score(y_test, clf_glove_lr.predict(X_test_glove_transformed))

0.8611111111111112

In [95]:
print(classification_report(y_test, clf_glove_lr.predict(X_test_glove_transformed)))

                precision    recall  f1-score   support

        career       0.75      1.00      0.86         9
        course       0.92      0.80      0.86        15
subject_domain       0.91      0.83      0.87        12

      accuracy                           0.86        36
     macro avg       0.86      0.88      0.86        36
  weighted avg       0.88      0.86      0.86        36



#### CNN

In [96]:
import torch, scipy
from torch import nn

In [98]:
# set up gpu for training
if torch.cuda.is_available(): device = torch.device('cuda')
else: device = torch.device('cpu')
device

device(type='cuda')

In [133]:
# convert vectors to tensor
X_train_glove_tensor = torch.tensor(X_train_glove_transformed).float().to(device)
X_test_glove_tensor = torch.tensor(X_test_glove_transformed).float().to(device)

In [134]:
y_train_codes = pd.Categorical(y_train).codes
y_test_codes = pd.Categorical(y_test).codes

In [141]:
y_train_tensor = torch.tensor(y_train_codes).long().to(device)
y_test_tensor = torch.tensor(y_test_codes).long().to(device)

In [142]:
# define cnn model
clf_glove_cnn = nn.Sequential(
    nn.Linear(X_train_glove_tensor.shape[1], 64),
    nn.ReLU(),
    nn.Linear(64, y.nunique()),
    nn.LogSoftmax(dim=1)
)
clf_glove_cnn.to(device)

Sequential(
  (0): Linear(in_features=300, out_features=64, bias=True)
  (1): ReLU()
  (2): Linear(in_features=64, out_features=3, bias=True)
  (3): LogSoftmax(dim=1)
)

In [143]:
# define loss
criterion = nn.NLLLoss()
logps = clf_glove_cnn(X_train_glove_tensor)
loss = criterion(logps, y_train_tensor)

loss.backward()

# define optimizer
optimizer = torch.optim.Adam(clf_glove_cnn.parameters(), lr=2e-3)

In [144]:
EPOCHS = 50

In [145]:
for epoch in range(EPOCHS):
    optimizer.zero_grad()
    outputs = clf_glove_cnn.forward(X_train_glove_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

In [146]:
# evaluate our model
with torch.no_grad():
    clf_glove_cnn.eval()
    log_ps = clf_glove_cnn(X_test_glove_tensor)
    test_loss = criterion(log_ps, y_test_tensor)

    ps = torch.exp(log_ps)
    top_p, top_class = ps.topk(1, dim=1)
    equals = top_class == y_test_tensor.view(*top_class.shape)
    test_accuracy = torch.mean(equals.float())

In [147]:
test_accuracy

tensor(0.8889, device='cuda:0')