## Collaborative Filtering recommender system
### with NN embedding


In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim import corpora
from scipy.spatial.distance import cosine

In [2]:
#loading data
rating_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/ratings.csv"
rating_df = pd.read_csv(rating_url)
rating_df.head()

Unnamed: 0,user,item,rating
0,1889878,CC0101EN,3.0
1,1342067,CL0101EN,3.0
2,1990814,ML0120ENv3,3.0
3,380098,BD0211EN,3.0
4,779563,DS0101EN,3.0


In [3]:
#unique items & users for latent feature vectors construction
num_users = len(rating_df['user'].unique())
num_items = len(rating_df['item'].unique())

In [4]:
#defining recommender net sub class, inheriting from tensorflow.keras.Model
class RecommenderNet(keras.Model):
    
    def __init__(self, num_users, num_items, embedding_size=16, **kwargs):
        """Constructor
           :param int num_users: number of users
           :param int num_items: number of items
           :param int embedding_size: the size of embedding vector"""
        super(RecommenderNet, self).__init__(**kwargs)
        self.num_users = num_users
        self.num_items = num_items
        self.embedding_size = embedding_size
        
        #defining user_embedding vector
        self.user_embedding_layer = layers.Embedding(
            input_dim=num_users,
            output_dim=embedding_size,
            name='user_embedding_layer',
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6))
        # defining user bias layer
        self.user_bias = layers.Embedding(
            input_dim=num_users,
            output_dim=1,
            name="user_bias")
        
        # defining item_embedding vector
        self.item_embedding_layer = layers.Embedding(
            input_dim=num_items,
            output_dim=embedding_size,
            name='item_embedding_layer',
            embeddings_initializer="he_normal",
            embeddings_regularizer=keras.regularizers.l2(1e-6))
        # defining item bias layer
        self.item_bias = layers.Embedding(
            input_dim=num_items,
            output_dim=1,
            name="item_bias")
        
    def call(self, inputs):
        """method to be called during model fitting           
           :param inputs: user and item one-hot vectors"""
        # computing user and item embedding vectors
        user_vector = self.user_embedding_layer(inputs[:, 0])
        user_bias = self.user_bias(inputs[:, 0])
        item_vector = self.item_embedding_layer(inputs[:, 1])
        item_bias = self.item_bias(inputs[:, 1])
        dot_user_item = tf.tensordot(user_vector, item_vector, 2)
        # Add all the components (including bias)
        x = dot_user_item + user_bias + item_bias
        # Sigmoid output layer to output the probability
        return tf.nn.relu(x)

In [5]:
#convering original data to iniciees for one hot vectors for tensorflow processing
def process_dataset(raw_data):
    
    encoded_data = raw_data.copy()
    
    # Mapping user ids to indices
    user_list = encoded_data["user"].unique().tolist()
    user_id2idx_dict = {x: i for i, x in enumerate(user_list)}
    user_idx2id_dict = {i: x for i, x in enumerate(user_list)}
    
    # Mapping course ids to indices
    course_list = encoded_data["item"].unique().tolist()
    course_id2idx_dict = {x: i for i, x in enumerate(course_list)}
    course_idx2id_dict = {i: x for i, x in enumerate(course_list)}

    # Convert original user ids to idx
    encoded_data["user"] = encoded_data["user"].map(user_id2idx_dict)
    # Convert original course ids to idx
    encoded_data["item"] = encoded_data["item"].map(course_id2idx_dict)
    # Convert rating to int
    encoded_data["rating"] = encoded_data["rating"].values.astype("int")

    return encoded_data, user_idx2id_dict, course_idx2id_dict

In [6]:
encoded_data, user_idx2id_dict, course_idx2id_dict = process_dataset(rating_df)
encoded_data.head()

Unnamed: 0,user,item,rating
0,0,0,3
1,1,1,3
2,2,2,3
3,3,3,3
4,4,4,3


In [7]:
#generating custom split for train, validation and test sets
def generate_train_test_datasets(dataset, scale=True):

    min_rating = min(dataset["rating"])
    max_rating = max(dataset["rating"])

    dataset = dataset.sample(frac=1, random_state=42)
    x = dataset[["user", "item"]].values
    if scale:
        y = dataset["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values
    else:
        y = dataset["rating"].values

    # Assuming training on 80% of the data and validating on 10%, and testing 10%
    train_indices = int(0.8 * dataset.shape[0])
    test_indices = int(0.9 * dataset.shape[0])

    x_train, x_val, x_test, y_train, y_val, y_test = (
        x[:train_indices],
        x[train_indices:test_indices],
        x[test_indices:],
        y[:train_indices],
        y[train_indices:test_indices],
        y[test_indices:],
    )
    return x_train, x_val, x_test, y_train, y_val, y_test

In [8]:
x_train, x_val, x_test, y_train, y_val, y_test = generate_train_test_datasets(encoded_data)

In [9]:
#training & evaluating the collaborative filtering recommender net
embedding_size = 16
model = RecommenderNet(num_users, num_items, embedding_size)

model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=tf.keras.metrics.RootMeanSquaredError())
history = model.fit(x_train, y_train, batch_size=64, epochs=10, validation_data=(x_val, y_val), verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
#evaluating the model
model.evaluate(x_test, y_test)



[0.016012616455554962, 0.11959309875965118]

Not bad, but there is always room for improvement.
I'll try another approach:

## Content similarity based recommender system
### based on course description with nltk

In [11]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eneme\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eneme\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\eneme\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [12]:
#pulling course data frame for processing
course_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-ML321EN-SkillsNetwork/labs/datasets/course_processed.csv"
course_content_df = pd.read_csv(course_url)
course_content_df.head()

Unnamed: 0,COURSE_ID,TITLE,DESCRIPTION
0,ML0201EN,robots are coming build iot apps with watson ...,have fun with iot and learn along the way if ...
1,ML0122EN,accelerating deep learning with gpu,training complex deep learning models with lar...
2,GPXX0ZG0EN,consuming restful services using the reactive ...,learn how to use a reactive jax rs client to a...
3,RP0105EN,analyzing big data in r using apache spark,apache spark is a popular cluster computing fr...
4,GPXX0Z2PEN,containerizing packaging and running a sprin...,learn how to containerize package and run a ...


In [13]:
#merging title and discription text fields
course_content_df['course_texts'] = course_content_df[['TITLE', 'DESCRIPTION']].agg(' '.join, axis=1)
course_content_df = course_content_df.reset_index()
course_content_df['index'] = course_content_df.index

In [14]:
#method for tokenization of course content
def tokenize_course(course, keep_only_nouns=True):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(course)
    # Remove English stop words and numbers
    word_tokens = [w for w in word_tokens if (not w.lower() in stop_words) and (not w.isnumeric())]
    # Only keep nouns 
    if keep_only_nouns:
        filter_list = ['WDT', 'WP', 'WRB', 'FW', 'IN', 'JJR', 'JJS', 'MD', 'PDT', 'POS', 'PRP', 'RB', 'RBR', 'RBS',
                       'RP']
        tags = nltk.pos_tag(word_tokens)
        word_tokens = [word for word, pos in tags if pos not in filter_list]

    return word_tokens

In [15]:
#tokenize all courses
tokens = [tokenize_course(course, True) for course in course_content_df['course_texts']]
tokens[0]

['robots',
 'coming',
 'build',
 'iot',
 'apps',
 'watson',
 'swift',
 'red',
 'fun',
 'iot',
 'learn',
 'way',
 'swift',
 'developer',
 'want',
 'learn',
 'iot',
 'watson',
 'ai',
 'services',
 'cloud',
 'raspberry',
 'pi',
 'node',
 'red',
 'found',
 'place',
 'build',
 'iot',
 'apps',
 'read',
 'temperature',
 'data',
 'take',
 'pictures',
 'raspcam',
 'use',
 'ai',
 'recognize',
 'objects',
 'pictures',
 'program',
 'irobot',
 'create',
 'robot']

In [16]:
#generate bag of words dataframe
token_dct = gensim.corpora.Dictionary(tokens)
bow = [token_dct.doc2bow(course) for course in tokens]
bow_dicts = {"doc_index": [],
            "doc_id": [],
            "token": [],
            "bow": []}

for course_idx, course_bow in enumerate(bow):    
    for token_index, token_bow in course_bow:
        bow_dicts['doc_index'].append(course_idx)
        bow_dicts['doc_id'].append(course_content_df['COURSE_ID'].iloc[course_idx]) 
        bow_dicts['token'].append(token_dct[token_index])
        bow_dicts['bow'].append(token_bow)
bows_df = pd.DataFrame(bow_dicts)
bows_df = bows_df[['doc_id', 'token', 'bow']]

In [17]:
#method to pivot union token of two BoWs 
def pivot_two_bows(basedoc, comparedoc):
    base = basedoc.copy()
    base['type'] = 'base'
    compare = comparedoc.copy()
    compare['type'] = 'compare'
    # Append the two token sets vertically
    join = base.append(compare)
    # Pivot the two joined courses
    joinT = join.pivot(index=['doc_id', 'type'], columns='token').fillna(0).reset_index(level=[0, 1])
    # Assign columns
    joinT.columns = ['doc_id', 'type'] + [t[1] for t in joinT.columns][2:]
    return joinT

In [18]:
#finally find all courses similar to those I was particulary interested in
#generate two data frames with the course mentioned above and everythind besides that one
course_ml = course_content_df[course_content_df['COURSE_ID'] == 'ML0101ENv3']
df2 =bows_df[bows_df['doc_id'] != 'ML0101ENv3']
bow_ml = bows_df[bows_df['doc_id'] == 'ML0101ENv3']
similar_ids =[]
for course in course_content_df['COURSE_ID']:
    pivot = pivot_two_bows(bows_df[bows_df['doc_id'] == course], bow_ml)
    similarity = 1 - cosine(pivot.iloc[0, 2:], pivot.iloc[1, 2:])
    if similarity>0.5:
        similar_ids.append(course)
similar_df = course_content_df[course_content_df['COURSE_ID'].isin(similar_ids)]
similar_df

Unnamed: 0,index,COURSE_ID,TITLE,DESCRIPTION,course_texts
157,157,ML0109EN,machine learning dimensionality reduction,machine learning dimensionality reduction,machine learning dimensionality reduction ma...
158,158,ML0101ENv3,machine learning with python,machine learning can be an incredibly benefici...,machine learning with python machine learning ...
200,200,ML0151EN,machine learning with r,this machine learning with r course dives into...,machine learning with r this machine learning ...
259,259,excourse46,machine learning,machine learning is the science of getting com...,machine learning machine learning is the scien...
260,260,excourse47,machine learning for all,machine learning often called artificial inte...,machine learning for all machine learning oft...
273,273,excourse60,introduction to tensorflow for artificial inte...,if you are a software developer who wants to b...,introduction to tensorflow for artificial inte...
