# Online Course Recommender


---


## Practice Module: Intelligent Reasoning Systems (IRS)

## Feature Extraction



# 0. File Path & Library Setup

In [1]:
# Load All Necessary Packages

import os
# from google.colab import drive

import pandas as pd
import numpy as np
import re
import pickle
from collections import defaultdict
from collections import Counter
import sqlite3

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from rake_nltk import Rake

from sklearn.metrics.pairwise import cosine_similarity

seed = 18

print('Versions of key libraries')
print('-------------------------')
print('pandas:  ', pd.__version__)
print('numpy:   ', np.__version__)
print('sklearn: ', sklearn.__version__)

Versions of key libraries
-------------------------
pandas:   1.3.3
numpy:    1.19.5
sklearn:  1.0


In [2]:
# # Mounting to Google Drive
# drive.mount('/content/gdrive')

# # Change Working Directory
# os.chdir('/content/gdrive/My Drive/iss/irs_pm/')

print('Working Directory: ')
os.getcwd()

Working Directory: 


'D:\\Personal\\Education\\NUS-ISS Mtech IS\\Course Materials\\1. Intelligent Reasoning Systems (IRS)\\0. IRS-PM_Practice Module\\Source Code\\Data Preparation'

In [3]:
# Check & Query
filename = 'app_database.db'
table_name = 'course'
sqlite_conn = sqlite3.connect(filename)

# Query Table
rawdata = pd.read_sql('SELECT * FROM ' + table_name, sqlite_conn, index_col='courseID')

sqlite_conn.close()

In [4]:
print(rawdata.shape)
rawdata.head()

(87700, 17)


Unnamed: 0_level_0,title,url,categories,description_short,description_long,difficulty,duration,free_option,number_of_enroll,rating,paid_option,language,subtitle,platform,provider,image_url,popularity_index
courseID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,创业101: 你的客户是谁？,https://www.edx.org/course/101-2,Business & Management,如果注册《创业101:谁是你的客户？》的认证证书，并且通过课程考核，将会获得有效期为1年的价...,很多看起来似乎很伟大的想法和科技在碰到一个简单、不可避免的问题时都会突然卡壳。这个问题就是：...,0,1,1,0,0.0,$100,English,中文,0,Massachusetts Institute of Technology,https://prod-discovery.edx-cdn.org/media/cours...,0.0
2,创业102: 你能为客户做什么？,https://www.edx.org/course/102-2,Business & Management,如果注册《创业102:你能为客户做什么？》的认证证书，并且通过课程考核，将会获得有效期为1年...,不要在意你的客户能为你做什么——而是要关注你能为你的客户做些什么。\n在《创业101》中，我...,0,1,1,0,0.0,$100,English,中文,0,Massachusetts Institute of Technology,https://prod-discovery.edx-cdn.org/media/cours...,0.0
3,18th-Century Opera: Handel & Mozart,https://www.edx.org/course/18th-century-opera-...,"Art & Culture, History, Music",Study Baroque and Classical opera through Hand...,"In this breathtaking course, you'll get to kno...",0,1,1,22646,0.0,$139,English,English,0,Harvard University,https://prod-discovery.edx-cdn.org/media/cours...,0.005163
4,"19th-Century Opera: Meyerbeer, Wagner, & Verdi",https://www.edx.org/course/19th-century-opera-...,"Art & Culture, History, Music",Learn the music and cultural impact of three c...,Travel through central Europe in the 1800s to ...,0,1,1,11619,0.0,$139,English,English,0,Harvard University,https://prod-discovery.edx-cdn.org/media/cours...,0.002649
5,3D GIS,https://www.edx.org/course/3d-gis,"Data Analysis & Statistics, Energy & Earth Sci...",Take your maps into the third dimension: Learn...,Maps are graphic representations of reality an...,1,1,1,0,0.0,$149,English,English,0,University of Alaska Fairbanks,https://prod-discovery.edx-cdn.org/media/cours...,0.0


# 1. Feature Extraction for Text Based Data

## i) Extract Text Based Data

In [5]:
# Extract Text Based Columns - Name, Categories, Short Description and Long Description
rawdata_name = rawdata['title']
rawdata_cat = rawdata['categories']
rawdata_sdesc = rawdata['description_short']
rawdata_ldesc = rawdata['description_long']

print(rawdata_name.shape)
rawdata_name.head()

(87700,)


courseID
1                                    创业101: 你的客户是谁？
2                                  创业102: 你能为客户做什么？
3               18th-Century Opera: Handel & Mozart
4    19th-Century Opera: Meyerbeer, Wagner, & Verdi
5                                            3D GIS
Name: title, dtype: object

## ii) Text Preprocessing

In [6]:
# Text Preprocessing Functions

stopwordsdic = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

def text_preprocess(rawtext):
    text = re.sub('([^\x00-\x7F])+','',rawtext) # Remove all non ASCII characters
    text = text.lower() # lower casing all words
    text = text.strip() # Remove White Spaces
    text = re.sub('[^A-Za-z0-9]+', ' ', text) # Remove Punctuations
    text = word_tokenize(text) # Tokenize
    text = [word for word in text if word not in stopwordsdic] # Remove stopwords
    text = [lemmatizer.lemmatize(word) for word in text] # Lemmatize words
    bow  = ' '.join(text) # Create Bag-of-Words
    return bow


In [7]:
# Apply Text Preprocessing
data_name = rawdata_name.apply(lambda x: '' if pd.isna(x) else text_preprocess(x)).astype(str)
data_cat = rawdata_cat.apply(lambda x: '' if pd.isna(x) else text_preprocess(x)).astype(str)
data_sdesc = rawdata_sdesc.apply(lambda x: '' if pd.isna(x) else text_preprocess(x)).astype(str)
data_ldesc = rawdata_ldesc.apply(lambda x: '' if pd.isna(x) else text_preprocess(x)).astype(str)


## iii) Keyword Extractions for Short and Long Description (RAKE)

In [8]:
# Keyword Extraction Function using Rake

def extract_keyword(text, topn):
    rake = Rake(min_length=1, max_length=3)
    tokens = word_tokenize(text)
    if len(tokens) <= topn:
        keywords = tokens
    else:
        rake.extract_keywords_from_text(text)
        keywords = rake.get_ranked_phrases()[:topn]
    return keywords

In [9]:
# Apply keyword extraction to short and long description
topn = 10

data_sdesc_kw = data_sdesc.apply(lambda x: ' '.join(extract_keyword(x, topn)))
data_ldesc_kw = data_ldesc.apply(lambda x: ' '.join(extract_keyword(x, topn)))


## iv) Create Bag-of-words and Corresponding List of Tokens Per Course

In [39]:
data_name_npy = data_name.to_numpy()
data_cat_npy = data_cat.to_numpy()
data_sdesc_kw_npy = data_sdesc_kw.to_numpy()
data_ldesc_kw_npy = data_ldesc_kw.to_numpy()

In [42]:
# Combine all text to create bag-of-words for each course
data_bow = []

for i in range(len(data_name)):
    data_bow.append(' '.join((data_name_npy[i], data_cat_npy[i], data_sdesc_kw_npy[i], data_ldesc_kw_npy[i])).strip())

data_bow = np.array(data_bow)
data_bow.shape

(87700,)

## v) TfIdf Vectorization

In [45]:
# Fit_transform BoW to Tfidf Sparse Matrix

tfidf = TfidfVectorizer()
data_tfidf = tfidf.fit_transform(data_bow)

In [48]:
# Save Tfidf Vectorizer to file
folderpath = 'Feature Map/'
filename = 'tfidf_vectorizer'
filepath = folderpath + filename + '.pickle'
file = open(filepath, 'wb')
pickle.dump(tfidf, file, protocol=pickle.HIGHEST_PROTOCOL)
file.close()

# Save Tfidf Sparse Matrix to file

folderpath = 'Feature Map/'
filename = 'tfidf_data'
filepath = folderpath + filename + '.pickle'
file = open(filepath, 'wb')
pickle.dump(data_tfidf, file, protocol=pickle.HIGHEST_PROTOCOL)
file.close()

In [47]:
a = 'machine learning'
atfidf  = tfidf.transform([a])
sim = cosine_similarity(atfidf, data_tfidf).ravel()
idx = np.where(sim==sim.max())[0][0]
print(data_name[idx])
print(rawdata['url'][idx])


machine design part
https://www.coursera.org/learn/machine-design1


In [49]:
# Load for checking
folderpath = 'Feature Map/'
filename = 'tfidf_data'
filepath = folderpath + filename + '.pickle'
file = open(filepath, 'rb')
data_tfidf = pickle.load(file)
file.close()

folderpath = 'Feature Map/'
filename = 'tfidf_vectorizer'
filepath = folderpath + filename + '.pickle'
file = open(filepath, 'rb')
tfidf = pickle.load(file)
file.close()

# 2. Feature Extraction for Categorical Data

## i) Extract Categorical Data

In [50]:
rawdata_diff = rawdata['difficulty']
rawdata_dur = rawdata['duration']
rawdata_free = rawdata['free_option']

## ii ) One-Hot Encoding

In [51]:
data_diff = pd.get_dummies(rawdata_diff)
data_dur = pd.get_dummies(rawdata_dur)
data_free = pd.get_dummies(rawdata_free, drop_first=True)


## iii) Combine Data to form Catagorical Data Feature Map

In [52]:
data_cat = np.hstack((data_dur, data_diff, data_free))

In [53]:
data_cat.shape

(87700, 7)

## iv) Function to Encode Categorical Inputs

In [54]:
def encode_input(cat_input):
    cat_onehot = np.zeros(6)
    if cat_input[0] > 0: # 0 - No preference, 1 - Short, 2 - Medium, 3 - Long
        cat_onehot[cat_input[0] - 1] = 1
    if cat_input[1] > 0: # 0 - No preference, 1 - Introductory, 2 - Intermediate, 3 - Advanced
        cat_onehot[cat_input[1] + 2] = 1
    return cat_onehot

## iv) Save Categorical Feature Map

In [55]:
# Save Categorical Feature Map to file

folderpath = 'Feature Map/'
filename = 'categorical_data'
filepath = folderpath + filename + '.pickle'
file = open(filepath, 'wb')
pickle.dump(data_cat, file, protocol=pickle.HIGHEST_PROTOCOL)
file.close()

In [56]:
data_cat.shape

(87700, 7)

# 3. Recommendation Inference

## i) Similarity Calculation

In [57]:
# Compute Similarity on the condition that each column feature is not 0: (0 - no preference)

def cond_sim(input_vec, data_vec):
    input_diff = input_vec[:, :3]
    input_durr = input_vec[:, 3:6]
    input_free = input_vec[:, 6:]
    data_diff  = data_vec[:, :3]
    data_durr  = data_vec[:, 3:6]
    data_free  = data_vec[:, 6:]
    if (input_diff.sum() + input_durr.sum()) == 0:
        input_slice = input_free
        data_slice  = data_free
    elif input_diff.sum() == 0:
        input_slice = np.hstack((input_durr, input_free))
        data_slice  = np.hstack((data_durr, data_free))
    elif input_durr.sum() == 0:
        input_slice = np.hstack((input_diff, input_free))
        data_slice  = np.hstack((data_diff, data_free))
    else:
        input_slice = input_vec
        data_slice  = data_vec
    sim = cosine_similarity(input_slice, data_slice).ravel()
    return sim
    

## ii) Ranking Optimization for Single Group

In [58]:
def ranking(mask, text_sim, cat_sim, rating):

    target_idx = np.arange(text_sim.shape[0])[mask]
    target_text_sim = text_sim[mask]
    target_cat_sim = cat_sim[mask]
    target_rating = rating[mask]
    
    target_scores = sorted(np.unique(target_cat_sim), reverse=True)
    
    rec_idx = np.array([], dtype=int)
    rec_sim = np.array([])
    
    for score in target_scores:
        group_mask = (target_cat_sim == score)
        group_idx = target_idx[group_mask]
        group_text_sim = target_text_sim[group_mask]
        group_rating = target_rating[group_mask]
        group_sort_idx = np.argsort(group_rating)[::-1]
        rec_idx = np.append(rec_idx, group_idx[group_sort_idx])
        rec_sim = np.append(rec_sim, group_text_sim[group_sort_idx])
    
    return rec_sim, rec_idx

## iii) Recommendation Function

In [60]:
def cond_sim(input_vec, data_vec):
    input_durr = input_vec[:, :3]
    input_diff = input_vec[:, 3:]
    data_durr  = data_vec[:, :3]
    data_diff  = data_vec[:, 3:]
    if (input_durr.sum() + input_diff.sum()) == 0:
        sim = np.ones(data_vec.shape[0])
    elif input_durr.sum() == 0:
        sim = cosine_similarity(input_diff, data_diff).ravel()
    elif input_diff.sum() == 0:
        sim = cosine_similarity(input_durr, data_durr).ravel()
    else:
        sim = cosine_similarity(input_vec, data_vec).ravel()
    return sim

In [None]:
def recommend(inputs, thres, nmin):
    text_ipt = inputs[0]
    text_proc = text_preprocess(text_ipt)
    text_tfidf = tfidf.transform([text_proc])
    text_sim = cosine_similarity(text_tfidf, data_tfidf).ravel()

    cat_ipt  = inputs[1:3]
    cat_onehot = np.array([encode_input(cat_ipt)])
    cat_sim = cond_sim(cat_onehot, data_cat[:, :-1])
    
    ind = inputs[-1]
    
    thres_mask = (text_sim > thres)
    
    if ind == 1:
        free_mask = ((rawdata_free.to_numpy() == 1) * thres_mask) == 1
    else:
        free_mask = (np.ones(data_tfidf.shape[0]) * thres_mask) == 1
    
    paid_mask = ((np.ones(data_tfidf.shape[0]) * thres_mask) - free_mask) == 1
        
    print(thres_mask.sum())
    print(free_mask.sum())
    print(paid_mask.sum())
    
    rec_sim, rec_idx = ranking(free_mask, text_sim, cat_sim, rawdata_rating.to_numpy())
    
    if (free_mask.sum() < nmin) and (paid_mask.sum() > 0):
        paid_sim, paid_idx = ranking(paid_mask, text_sim, cat_sim, rawdata_rating.to_numpy())
        rec_sim = np.append(rec_sim, paid_sim)
        rec_idx = np.append(rec_idx, paid_idx)

    return rec_sim, rec_idx, ind

## iv) Testing

In [61]:
rawdata_rating = rawdata['popularity_index']

In [65]:
import time
start = time.time()

ainput = ['swim', 0, 0, 1]

a_sim, a_idx, a_ind = recommend(ainput, 0, 30)
print(a_ind)
print(a_sim.shape)
print(a_sim.sum())
# print((a_sim>(a_sim.max()-0.1)).sum())
# print(((a_sim>(a_sim.max()-0.1)).sum())/a_sim.shape[0])
print('START =================================================================================')
print(a_sim[:20])
print(a_idx[:20]+1)
print(np.array(rawdata['duration'])[a_idx][:20])
print(np.array(rawdata['difficulty'])[a_idx][:20])
print(np.array(rawdata['popularity_index'])[a_idx][:20])
print(np.array(rawdata['title'])[a_idx][:20])
print(np.array(rawdata['url'])[a_idx][:20])

print('END ===================================================================================')
print(a_sim[-20:])
print(a_idx[-20:]+1)
print(np.array(rawdata['duration'])[a_idx][-20:])
print(np.array(rawdata['difficulty'])[a_idx][-20:])
print(np.array(rawdata['title'])[a_idx][-20:])
print('TIME ==================================================================================')
print(time.time()-start)



19
2
17
1
(19,)
7.439062526975222
[0.38966117 0.4559317  0.49075566 0.35935534 0.34908393 0.35952692
 0.35703686 0.24300932 0.77038262 0.37366149 0.45512383 0.29844695
 0.24314588 0.334458   0.26099802 0.38119322 0.34805268 0.66323172
 0.30600724]
[87173 87535 84090 82793 82681 84445 84206 83637 83986 84577 61807 85624
 85326 28764 21726 85612 85664 85073 86474]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
[4.27606058e-03 3.48640739e-04 7.13197916e-04 4.53642438e-04
 3.21189801e-04 1.69406641e-04 1.39541382e-04 7.69657934e-05
 7.25359926e-05 3.81877539e-05 1.25076726e-05 7.55671888e-06
 6.32296886e-06 6.31765097e-06 5.74331906e-06 4.42980072e-06
 3.99905179e-06 3.11096449e-06 0.00000000e+00]
['Learn to swim the basics of freestyle swimming'
 'How to Prepare Your Baby for Swim Lessons 0 to 8 Months'
 'Teach your child to swim - step-by-step guide'
 'Treat lower back in swimming, swim 1k freestyle'
 'Total Immersion Swimming: Swim Better, Easier, Faster