In [123]:
import tensorflow as tf
import tensorflow_ranking as tfr
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import OneHotEncoder
import re
import json
import pprint
import collections
import random
random_state = 42

# <span Style='font-family: Georgia, serif; color:orange'> **Read/Prepare Data**

In [157]:
qrels = pd.read_csv('training_qrels_annotated.csv').drop(columns='cip_name')
docset = pd.read_csv('final_curriculum_data\\final_docset.csv').drop(index=[210, 199, 47, 190]).reset_index(drop=True)
cip_titles = pd.read_csv('final_curriculum_data\\cip_names.csv')[['Title', 'CIP Code']]
cip_titles['CIP Code'] = [i[2:-1] if i[2] != '0' else i[3:-1] for i in cip_titles['CIP Code']]
cip_titles['CIP Code'] = [i[:-1] if i[-1] == '0' else i for i in cip_titles['CIP Code']]
docset = docset[docset['cip'].isin(cip_titles['CIP Code'])].reset_index(drop=True)
docset['cip_name'] = [cip_titles[cip_titles['CIP Code']==i].Title.iloc[0] for i in docset.cip]
docset['cip'] = docset['cip'].astype(str)
qrels['courses'] = [r['courses'] for i in qrels['cip_code'] for ind, r in docset.iterrows() if str(i) == str(r['cip'])]
qrels['descriptions'] = [r['descriptions'] for i in qrels['cip_code'] for ind, r in docset.iterrows() if str(i) == str(r['cip'])]

all_queries = pd.read_csv('query_terms.csv')['0'].unique().tolist()
all_courses = docset['courses'].astype(str).tolist()
all_descriptions = docset['descriptions'].astype(str).tolist()

In [143]:
len(all_descriptions)

192

# <span Style='font-family: Georgia, serif; color:orange'> **Define Helper Functions and Ranking Model**

<span Style='font-family: Georgia, serif; color:orange'> **Code Sourced from https://github.com/tensorflow/recommenders/blob/main/tensorflow_recommenders/examples/movielens.py and re-tooled to fit our data**

In [144]:
def _create_feature_dict():
    return {"courses": [], "descriptions": [], 'scores':[]}


def _sample_list(feature_lists, num_examples_per_list, random_state):
    if random_state is None:
        random_state = np.random.RandomState()

    sampled_indices = random_state.choice(
        range(len(feature_lists["descriptions"])),
        size=num_examples_per_list,
        replace=False,
    )
    sampled_descriptions = [
        feature_lists["descriptions"][idx] for idx in sampled_indices
    ]
    sampled_courses = [
        feature_lists["courses"][idx] for idx in sampled_indices
    ]
    sampled_scores = [
        feature_lists["scores"][idx] for idx in sampled_indices
    ]

    return (
        tf.stack(sampled_courses, 0),
        tf.stack(sampled_descriptions, 0),
        tf.stack(sampled_scores, 0)
    )

def sample_listwise(dataset, num_list_per_query, num_examples_per_list, seed):

    random_state = np.random.RandomState(seed)

    example_lists_by_user = collections.defaultdict(_create_feature_dict)

    movie_title_vocab = set()
    for example in dataset:
        query = example["query"].numpy()
        example_lists_by_user[query]["courses"].append(
            example["courses"])
        example_lists_by_user[query]["descriptions"].append(
            example["descriptions"])
        example_lists_by_user[query]["scores"].append(
            example["scores"])
        movie_title_vocab.add(example["descriptions"].numpy())

    tensor_slices = {"query": [], "courses": [], 'descriptions': [], "scores": []}

    for query, feature_lists in example_lists_by_user.items():
        for _ in range(num_list_per_query):

            # Drop the user if they don't have enough ratings.
            if len(feature_lists["scores"]) < num_examples_per_list:
                continue

            sampled_courses, sampled_descriptions, sampled_scores = _sample_list(
                feature_lists,
                num_examples_per_list,
                random_state=random_state,
            )
            tensor_slices["query"].append(query)
            tensor_slices["courses"].append(sampled_courses)
            tensor_slices["descriptions"].append(sampled_descriptions)
            tensor_slices["scores"].append(sampled_scores)

    return tf.data.Dataset.from_tensor_slices(tensor_slices)

<span Style='font-family: Georgia, serif; color:orange'> **Code Sourced from https://www.tensorflow.org/recommenders/examples/listwise_ranking and re-tooled to fit our data**

In [216]:
class RankingModel(tfrs.Model):

    def __init__(self, loss):
        super().__init__()

        embedding_dim = 32

        self.query_embeddings = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=all_queries),
            tf.keras.layers.Embedding(len(all_queries)+2, embedding_dim)
        ])

        self.course_embeddings = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=all_courses),
            tf.keras.layers.Embedding(len(all_courses)+2, embedding_dim)
        ])

        self.desc_embeddings = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=all_descriptions),
            tf.keras.layers.Embedding(len(all_descriptions)+2, embedding_dim)
        ])

        self.score_model = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1)
        ])

        self.task = tfrs.tasks.Ranking(
            loss=loss,
            metrics=[
                tfr.keras.metrics.NDCGMetric(topn=10, name='NDCG_top10'),
                tf.keras.metrics.RootMeanSquaredError()
            ]
        )

    def call(self, features):
        query_embeddings = self.query_embeddings(features['query'])
        course_embeddings = self.course_embeddings(features['courses'])
        desc_embeddings = self.desc_embeddings(features['descriptions'])
        list_length = features["descriptions"].shape[1]
        query_embedding_repeated = tf.repeat(
            tf.expand_dims(query_embeddings, 1), [list_length], axis=1)

        concatenated_embeddings = tf.concat([query_embedding_repeated, course_embeddings, desc_embeddings], 2)

        return self.score_model(concatenated_embeddings)

    def compute_loss(self, inputs, training = False):
        labels = inputs.pop('scores')
        scores = self(inputs)

        return self.task(
            labels=labels,
            predictions=tf.squeeze(scores, axis=-1)
        )

# <span Style='font-family: Georgia, serif; color:orange'> **Create Dataset**

In [146]:
dataset = tf.data.Dataset.from_tensor_slices({'query':qrels['query'], 'courses':qrels['courses'], 'descriptions':qrels['descriptions'], 'scores':qrels['assigned_score'].astype(float)})
dataset = dataset.shuffle(len(dataset),seed=random_state)
training_dataset = dataset.take(round(len(dataset)*.8))
test_dataset = dataset.skip(round(len(dataset)*.8))

train = sample_listwise(training_dataset, num_list_per_query=20, num_examples_per_list=5, seed=random_state)
test = sample_listwise(test_dataset, num_list_per_query=1, num_examples_per_list=5, seed=random_state)

In [147]:
cached_train = train.shuffle(10000).batch(32).cache()
cached_test = test.batch(32).cache()

# <span Style='font-family: Georgia, serif; color:orange'> **Initialize, Compile, and Train First Model**

In [187]:
epochs = 32

listwise = RankingModel(tfr.keras.losses.ListMLELoss())
listwise.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

In [188]:
listwise.fit(cached_train, epochs=epochs, verbose=0)

<keras.callbacks.History at 0x2199bfc3130>

In [189]:
listwise_model_result = listwise.evaluate(cached_test, return_dict=True, verbose=False)
print("NDCG of the ListMLE model: {:.4f}".format(listwise_model_result["NDCG_top10"]))

NDCG of the ListMLE model: 0.8671


# <span Style='font-family: Georgia, serif; color:orange'> **Train Final Model**

In [190]:
train = sample_listwise(training_dataset, num_list_per_query=30, num_examples_per_list=5, seed=random_state)
test = sample_listwise(test_dataset, num_list_per_query=1, num_examples_per_list=5, seed=random_state)
cached_train = train.shuffle(10000, seed=random_state).batch(32).cache()
cached_test = test.batch(32).cache()
epochs = 64
listwise = RankingModel(tfr.keras.losses.ListMLELoss())
listwise.compile(optimizer=tf.keras.optimizers.Adagrad(0.001))
listwise.fit(cached_train, epochs=epochs, verbose=1)

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64


<keras.callbacks.History at 0x219faef3040>

In [191]:
listwise_model_result = listwise.evaluate(cached_test, return_dict=True)
listwise_model_result['NDCG_top10']



0.8653592467308044

In [212]:
hinge_model = RankingModel(tfr.keras.losses.PairwiseHingeLoss())
hinge_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.01))
hinge_model.fit(cached_train, epochs=epochs, verbose=1)
hinge_model_result = hinge_model.evaluate(cached_test, return_dict=True)
print("NDCG of the pairwise hinge loss model: {:.4f}".format(hinge_model_result["NDCG_top10"]))

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64
NDCG of the pairwise hinge loss model: 0.8639


In [240]:
mse_model = RankingModel(tf.keras.losses.MeanSquaredError())
mse_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
mse_model.fit(cached_train, epochs=epochs, verbose=1)
mse_model_result = mse_model.evaluate(cached_test, return_dict=True)
print("NDCG of the MSE Model: {:.4f}".format(mse_model_result["NDCG_top10"]))

Epoch 1/64
Epoch 2/64
Epoch 3/64
Epoch 4/64
Epoch 5/64
Epoch 6/64
Epoch 7/64
Epoch 8/64
Epoch 9/64
Epoch 10/64
Epoch 11/64
Epoch 12/64
Epoch 13/64
Epoch 14/64
Epoch 15/64
Epoch 16/64
Epoch 17/64
Epoch 18/64
Epoch 19/64
Epoch 20/64
Epoch 21/64
Epoch 22/64
Epoch 23/64
Epoch 24/64
Epoch 25/64
Epoch 26/64
Epoch 27/64
Epoch 28/64
Epoch 29/64
Epoch 30/64
Epoch 31/64
Epoch 32/64
Epoch 33/64
Epoch 34/64
Epoch 35/64
Epoch 36/64
Epoch 37/64
Epoch 38/64
Epoch 39/64
Epoch 40/64
Epoch 41/64
Epoch 42/64
Epoch 43/64
Epoch 44/64
Epoch 45/64
Epoch 46/64
Epoch 47/64
Epoch 48/64
Epoch 49/64
Epoch 50/64
Epoch 51/64
Epoch 52/64
Epoch 53/64
Epoch 54/64
Epoch 55/64
Epoch 56/64
Epoch 57/64
Epoch 58/64
Epoch 59/64
Epoch 60/64
Epoch 61/64
Epoch 62/64
Epoch 63/64
Epoch 64/64
NDCG of the MSE Model: 0.8705


# <span Style='font-family: Georgia, serif; color:orange'> **Define Prediction Generation and Test Predictions**

In [233]:
def generate_predictions(query, model):
    prediction_dataset = tf.data.Dataset.from_tensor_slices({'query':[[query]],'courses':[[all_courses]], 'descriptions':[[all_descriptions]]})
    prediction_input = list(prediction_dataset.as_numpy_iterator())[0]

    query_embeddings = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=all_queries),
            tf.keras.layers.Embedding(len(all_queries)+2, 32)
        ])
    q_embed = query_embeddings(prediction_input['query'])

    course_embeddings = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=all_courses),
            tf.keras.layers.Embedding(len(all_courses)+2, 32)
        ])
    c_embed = course_embeddings(prediction_input['courses'])

    description_embeddings = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=all_descriptions),
            tf.keras.layers.Embedding(len(all_descriptions)+2, 32)
        ])
    d_embed = description_embeddings(prediction_input['descriptions'])

    query_embedding_repeated = tf.repeat(tf.expand_dims(q_embed, 1), 192, axis=1)

    concatenated_embeddings = tf.concat([query_embedding_repeated, c_embed, d_embed], 2)

    concatenated_embeddings

    preds = model.score_model(concatenated_embeddings)
    scores = pd.DataFrame({'score':tf.squeeze(preds,-1).numpy()[0]})
    top_scores = scores.sort_values('score', ascending=False).head(20)
    top_scores['query'] = [query]*20
    top_scores['cip'] = [docset.iloc[i]['cip'] for i in top_scores.index]
    top_scores['cip_name'] = [docset.iloc[i]['cip_name'] for i in top_scores.index]

    return top_scores

In [241]:
query = random.choice(all_queries)
generate_predictions(query, listwise)

Unnamed: 0,score,query,cip,cip_name
31,0.068278,Science Writing,14.08,Civil Engineering.
46,0.05879,Science Writing,50.06,Film/Video and Photographic Arts.
107,0.032189,Science Writing,27.05,Statistics.
27,0.022193,Science Writing,16.04,"Slavic, Baltic and Albanian Languages, Literat..."
149,0.020108,Science Writing,52.07,Entrepreneurial and Small Business Operations.
78,0.018535,Science Writing,1.09,Animal Sciences.
180,0.017036,Science Writing,52.15,Real Estate.
108,0.017001,Science Writing,14.42,"Mechatronics, Robotics, and Automation Enginee..."
137,0.012546,Science Writing,26.09,"Physiology, Pathology and Related Sciences."
138,0.010752,Science Writing,30.01,Biological and Physical Sciences.


In [239]:
query = random.choice(all_queries)
generate_predictions(query, hinge_model)

Unnamed: 0,score,query,cip,cip_name
110,0.278183,Kinesiology,11.09,Computer Systems Networking and Telecommunicat...
18,0.242642,Kinesiology,26.02,"Biochemistry, Biophysics and Molecular Biology."
37,0.234664,Kinesiology,26.12,Biotechnology.
17,0.195087,Kinesiology,27.01,Mathematics.
167,0.152327,Kinesiology,30.1,Biopsychology.
156,0.140576,Kinesiology,42.28,"Clinical, Counseling and Applied Psychology."
58,0.134695,Kinesiology,51.38,"Registered Nursing, Nursing Administration, Nu..."
78,0.129398,Kinesiology,1.09,Animal Sciences.
71,0.128367,Kinesiology,50.05,Drama/Theatre Arts and Stagecraft.
33,0.125817,Kinesiology,14.02,"Aerospace, Aeronautical, and Astronautical/Spa..."


In [242]:
query = random.choice(all_queries)
generate_predictions(query, mse_model)

Unnamed: 0,score,query,cip,cip_name
3,1.780989,Debate,50.07,Fine and Studio Arts.
21,1.778566,Debate,38.01,Philosophy.
128,1.763501,Debate,14.18,Materials Engineering.
23,1.759955,Debate,45.1,Political Science and Government.
188,1.754979,Debate,52.16,Taxation.
92,1.748719,Debate,19.06,Housing and Human Environments.
11,1.745737,Debate,45.06,Economics.
31,1.745663,Debate,14.08,Civil Engineering.
149,1.742177,Debate,52.07,Entrepreneurial and Small Business Operations.
47,1.741174,Debate,31.05,"Sports, Kinesiology, and Physical Education/Fi..."
