In [1]:
#imports
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
import spacy

In [9]:
pd.set_option('display.max_colwidth', 100)

In [10]:
datapath = Path.cwd()/'data'
dataset = datapath/'dataset.csv'

In [11]:
df = pd.read_csv(dataset)

In [12]:
df

Unnamed: 0,title,short_description,long_description,provider,url,time,language,paths,prerequisites,medium,type
0,Machine Learning with PySpark,Learn how to make predictions with Apache Spark.,"Spark is a powerful, general purpose tool for working with Big Data. Spark transparently handles...",DataCamp,https://www.datacamp.com/courses/machine-learning-with-pyspark,4 hours,Python,"Machine Learning for Everyone,Big Data with PySpark,Machine Learning Scientist with Python","Introduction to PySpark,Statistical Thinking in Python (Part 1)",video,course
1,Feature Engineering for Machine Learning in Python,Create new features to improve the performance of your Machine Learning models.,Every day you read about the amazing breakthroughs in how the newest applications of machine lea...,DataCamp,https://www.datacamp.com/courses/feature-engineering-for-machine-learning-in-python,4 hours,Python,"Machine Learning for Everyone,Machine Learning Scientist with Python","pandas Foundations,Supervised Learning with scikit-learn",video,course
2,Machine Learning for Finance in Python,"Learn to model & predict stock data values using linear models, decision trees, random forests, ...","Time series data is all around us; some examples are the weather, human behavioral patterns as c...",DataCamp,https://www.datacamp.com/courses/machine-learning-for-finance-in-python,4 hours,Python,,Supervised Learning with scikit-learn,video,course
3,Supervised Learning with scikit-learn,Learn how to build and tune predictive models and evaluate how well they'll perform on unseen data.,Machine learning is the field that teaches machines and computers to learn from existing data to...,DataCamp,https://www.datacamp.com/courses/supervised-learning-with-scikit-learn,4 hours,Python,"Data Science for Everyone,Machine Learning for Everyone,Data Scientist with Python,Machine Lear...",Statistical Thinking in Python (Part 1),video,course
4,Introduction to Natural Language Processing in Python,"In this course, you'll learn natural language processing (NLP) basics, such as how to identify a...","In this course, you'll learn natural language processing (NLP) basics, such as how to identify a...",DataCamp,https://www.datacamp.com/courses/introduction-to-natural-language-processing-in-python,4 hours,Python,"Machine Learning for Everyone,Machine Learning Scientist with Python,Natural Language Processing...",Python Data Science Toolbox (Part 2),video,course
5,Machine Learning with Tree-Based Models in Python,"In this course, you'll learn how to use tree-based models and ensembles for regression and class...",Decision trees are supervised learning models used for problems involving classification and reg...,DataCamp,https://www.datacamp.com/courses/machine-learning-with-tree-based-models-in-python,5 hours,Python,"Data Science for Everyone,Machine Learning for Everyone,Data Scientist with Python,Machine Lear...",Supervised Learning with scikit-learn,video,course
6,Extreme Gradient Boosting with XGBoost,Learn the fundamentals of gradient boosting and build state-of-the-art machine learning models u...,Do you know the basics of supervised learning and want to use state-of-the-art models on real-wo...,DataCamp,https://www.datacamp.com/courses/extreme-gradient-boosting-with-xgboost,4 hours,Python,"Machine Learning for Everyone,Machine Learning Scientist with Python","Supervised Learning with scikit-learn,Case Study: School Budgeting with Machine Learning in Python",video,course
7,Machine Learning for Business,Understand the fundamentals of Machine Learning and how it's applied in the business world.,This course will introduce the key elements of machine learning to the business leaders. We will...,DataCamp,https://www.datacamp.com/courses/machine-learning-for-business,4 hours,Python,Data Skills for Business,,video,course
8,Linear Classifiers in Python,In this course you will learn the details of linear classifiers like logistic regression and SVM.,"In this course you'll learn all about using linear classifiers, specifically logistic regression...",DataCamp,https://www.datacamp.com/courses/linear-classifiers-in-python,4 hours,Python,"Machine Learning for Everyone,Machine Learning Fundamentals with Python,Machine Learning Scienti...",Supervised Learning with scikit-learn,video,course
9,Machine Learning for Time Series Data in Python,This course focuses on feature engineering and machine learning for time series data.,"Time series data is ubiquitous. Whether it be stock market fluctuations, sensor data recording c...",DataCamp,https://www.datacamp.com/courses/machine-learning-for-time-series-data-in-python,4 hours,Python,"Machine Learning for Everyone,Machine Learning Scientist with Python,Time Series with Python","Manipulating Time Series Data in Python,Visualizing Time Series Data in Python,Supervised Learni...",video,course


In [13]:
df.describe()

Unnamed: 0,title,short_description,long_description,provider,url,time,language,paths,prerequisites,medium,type
count,18,18,18,18,18,18,18,17,16,18,18
unique,18,18,18,1,18,2,1,14,13,1,1
top,Building Recommendation Engines in PySpark,Learn how to make predictions with Apache Spark.,"Time series data is ubiquitous. Whether it be stock market fluctuations, sensor data recording c...",DataCamp,https://www.datacamp.com/courses/machine-learning-for-business,4 hours,Python,Natural Language Processing in Python,Supervised Learning with scikit-learn,video,course
freq,1,1,1,18,1,16,18,2,3,18,18


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              18 non-null     object
 1   short_description  18 non-null     object
 2   long_description   18 non-null     object
 3   provider           18 non-null     object
 4   url                18 non-null     object
 5   time               18 non-null     object
 6   language           18 non-null     object
 7   paths              17 non-null     object
 8   prerequisites      16 non-null     object
 9   medium             18 non-null     object
 10  type               18 non-null     object
dtypes: object(11)
memory usage: 1.7+ KB


## Cleaning and Wrangling

## Sanitize columns

In [15]:
# Assign the filled column back to the frame. Calling fillna(inplace=True)
# on a selected column is chained in-place assignment: it is deprecated and
# leaves `df` untouched when pandas Copy-on-Write is active.
df['prerequisites'] = df['prerequisites'].fillna("")

In [16]:
# Assign the filled column back to the frame. Calling fillna(inplace=True)
# on a selected column is chained in-place assignment: it is deprecated and
# leaves `df` untouched when pandas Copy-on-Write is active.
df['paths'] = df['paths'].fillna("")

In [17]:
df[['paths', 'prerequisites']]

Unnamed: 0,paths,prerequisites
0,"Machine Learning for Everyone,Big Data with PySpark,Machine Learning Scientist with Python","Introduction to PySpark,Statistical Thinking in Python (Part 1)"
1,"Machine Learning for Everyone,Machine Learning Scientist with Python","pandas Foundations,Supervised Learning with scikit-learn"
2,,Supervised Learning with scikit-learn
3,"Data Science for Everyone,Machine Learning for Everyone,Data Scientist with Python,Machine Lear...",Statistical Thinking in Python (Part 1)
4,"Machine Learning for Everyone,Machine Learning Scientist with Python,Natural Language Processing...",Python Data Science Toolbox (Part 2)
5,"Data Science for Everyone,Machine Learning for Everyone,Data Scientist with Python,Machine Lear...",Supervised Learning with scikit-learn
6,"Machine Learning for Everyone,Machine Learning Scientist with Python","Supervised Learning with scikit-learn,Case Study: School Budgeting with Machine Learning in Python"
7,Data Skills for Business,
8,"Machine Learning for Everyone,Machine Learning Fundamentals with Python,Machine Learning Scienti...",Supervised Learning with scikit-learn
9,"Machine Learning for Everyone,Machine Learning Scientist with Python,Time Series with Python","Manipulating Time Series Data in Python,Visualizing Time Series Data in Python,Supervised Learni..."


## Get prerequisite count

For now, because we don't have contextual info, we'll use the prerequisite count of a course as a proxy for its complexity. Once we have that, we can do a simple `pd.cut` to assign a categorical label (Beginner, Intermediate, Advanced) to each course.

**NOTE**: The process below will assign a complexity label to each course strictly based on the number of prerequisites each item has. This may or may not correspond to the actual complexity of a course, but for the sake of prototyping, we'll leave it be and come back once other parts of the pipeline are fleshed out.

In [18]:
def get_preq_count(row):
    """Return the number of comma-separated prerequisites listed in ``row``.

    Args:
        row: A single cell of the ``prerequisites`` column: a string of
            course titles separated by commas, an empty string, or a
            missing value (``None`` / ``NaN``).

    Returns:
        int: The number of non-blank entries; ``0`` for an empty,
        whitespace-only or missing value.
    """
    # Missing cells arrive as None/NaN when the column has not been filled;
    # treat them like an empty list instead of failing on ``.split``.
    if pd.isna(row):
        return 0
    # Skip blank fragments so "A,,B" or a trailing comma is not over-counted.
    return sum(1 for part in row.split(",") if part.strip())

In [19]:
df['prereq_count'] = df['prerequisites'].apply(get_preq_count)

In [20]:
df['complexity'] = pd.cut(df['prereq_count'], 3, labels=['Beginner', 'Intermediate', 'Advanced'])

## Generate `descriptions` feature

In [21]:
# Both blurbs are concatenated in the next step, so *no* row may be missing
# either of them. (Checking `all(isnull) == False` would only fail when every
# single row is null.)
assert df['short_description'].notnull().all(), "short_description contains missing values"
assert df['long_description'].notnull().all(), "long_description contains missing values"

In [22]:
df['descriptions'] = df['short_description'] + " " + df['long_description']

In [23]:
df['descriptions'][0]

"Learn how to make predictions with Apache Spark. Spark is a powerful, general purpose tool for working with Big Data. Spark transparently handles the distribution of compute tasks across a cluster. This means that operations are fast, but it also allows you to focus on the analysis rather than worry about technical details. In this course you'll learn how to get data into Spark and then delve into the three fundamental Spark Machine Learning algorithms: Linear Regression, Logistic Regression/Classifiers, and creating pipelines. Along the way you'll analyse a large dataset of flight delays and spam text messages. With this background you'll be ready to harness the power of Spark and apply it on your own Machine Learning projects!"

In [24]:
def to_lower(row):
    """Return ``row`` lower-cased by calling its ``lower()`` method."""
    lowered = row.lower()
    return lowered

In [25]:
df['descriptions'] = df['descriptions'].apply(to_lower)

## Building a user profile

Let's build a simple user profile that tells us a user's background, proficiency with a certain programming language, etc.

In [26]:
class User:
    """A minimal learner profile used to filter course recommendations.

    Attributes:
        name: Display name of the user.
        background: Skill level, one of ``'Beginner'``, ``'Intermediate'``
            or ``'Advanced'``; ``None`` until ``setbackground`` succeeds.
        time_availability: Value passed as ``time`` to ``setavailability``
            (hours per ``duration``); ``None`` until set.
        duration: Period the availability refers to, e.g. ``'week'``;
            ``None`` until set.
        proficiency: Value stored by ``setprof``; ``None`` until set.
    """

    # Accepted values for ``background``.
    VALID_BACKGROUNDS = ('Beginner', 'Intermediate', 'Advanced')

    def __init__(self, name: str):
        self.name = name
        self.background = None
        self.time_availability = None
        self.duration = None
        # Initialised here so getprof() is safe to call before setprof().
        self.proficiency = None

    def setbackground(self, background: str):
        """Store the skill level.

        Prints a hint and returns ``-1`` (leaving the current value
        untouched) when ``background`` is not an accepted level; returns
        ``None`` on success.
        """
        if background not in self.VALID_BACKGROUNDS:
            print("Please enter one of: Beginner, Intermediate, Advanced")
            return -1
        self.background = background

    def getbackground(self):
        """Return the stored skill level, or ``None`` if it was never set."""
        return self.background

    def setavailability(self, time: int, duration: str):
        """Store how many hours (``time``) the user has per ``duration``."""
        self.time_availability = time
        self.duration = duration

    def getavailability(self):
        """Return the stored number of available hours (``None`` if unset)."""
        return self.time_availability

    def setprof(self, prof):
        """Store the user's proficiency."""
        self.proficiency = prof

    def getprof(self):
        """Return the stored proficiency, or ``None`` if it was never set."""
        # Reads the same attribute that setprof() writes.
        return self.proficiency

    def __repr__(self):
        _repr = "User {} has a {} skillset and is available for {} hours a {}".format(
                 self.name, self.background, self.time_availability, self.duration)
        return _repr

In [27]:
dummy1 = User('dummy')
dummy1.setbackground('Beginner')
dummy1.setavailability(2, 'week')
dummy1

User dummy has a Beginner skillset and is available for 2 hours a week

In [28]:
dummy2 = User('dummy2')
dummy2.setbackground('Intermediate')
dummy2.setavailability(4, 'week')
dummy2

User dummy2 has a Intermediate skillset and is available for 4 hours a week

# Recommendations based on Course Description

A simple recommendation system based on the short and long description blurbs

In [29]:
tfidf = TfidfVectorizer(stop_words='english')

In [30]:
matrix = tfidf.fit_transform(df['descriptions'])

In [31]:
cosine_sim = cosine_similarity(matrix, matrix)

In [32]:
title2id = pd.Series(df.index, index = df['title'])

In [33]:
temp_skill_level = {
    "Beginner": 0,
    "Intermediate": 1,
    "Advanced": 2
}

In [34]:
def filter_recommendations(idxs, user):
    """Keep only the candidate courses that fit the user's profile.

    A course is kept when the user's available hours are at least the
    course's required hours and the user's skill level is at least the
    course's ``complexity`` level (both ranked through ``temp_skill_level``).

    Args:
        idxs: Iterable of row positions into ``df``.
        user: A ``User`` whose availability and background have been set.

    Returns:
        list: The positions from ``idxs`` that pass both checks, in their
        original order.
    """
    ans = []
    for idx in idxs:
        data = df.iloc[idx]
        # 'time' is text such as "4 hours". Parse the whole leading number;
        # taking only the first character would read "10 hours" as 1 hour.
        time_required = float(str(data['time']).split()[0])
        proficiency = data['complexity']
        if user.getavailability() >= time_required and temp_skill_level[user.getbackground()] >= temp_skill_level[proficiency]:
            ans.append(idx)
    return ans

In [35]:
def recommend(title: str = None, measure: np.ndarray = None, npreds:int = 3, user:User = None):
    """Recommend courses similar to ``title`` that suit ``user``.

    Args:
        title: Title of a course present in ``title2id``.
        measure: Square pairwise similarity matrix whose rows/columns follow
            the row order of ``df`` (e.g. the output of ``cosine_similarity``).
        npreds: Number of most-similar candidates to consider before the
            user filter is applied, so fewer than ``npreds`` titles may be
            returned.
        user: Profile used to filter candidates; when ``None`` the
            candidates are returned unfiltered.

    Returns:
        pandas.Series: Titles of the recommended courses (possibly empty).

    Raises:
        KeyError: If ``title`` is not in ``title2id``.
    """
    idx = title2id[title]
    # Drop the query course by position instead of assuming it sorts first:
    # another course with an equal similarity score could otherwise push it
    # to second place and get it "recommended" back to the user.
    score = [(i, sim) for i, sim in enumerate(measure[idx]) if i != idx]
    score = sorted(score, key = lambda x: x[1], reverse = True)[:npreds]
    idxs = [i for i, _ in score]
    if user is not None:
        idxs = filter_recommendations(idxs, user)
    if not len(idxs):
        print("No suitable recommendations found for your profile. Would you be interested in a learning path to build your skillset?")
    return df['title'].iloc[idxs]

In [36]:
dummy1 = User('dummy')
dummy1.setbackground('Beginner')
dummy1.setavailability(2, 'week')
dummy1

User dummy has a Beginner skillset and is available for 2 hours a week

In [37]:
dummy2 = User('dummy2')
dummy2.setbackground('Intermediate')
dummy2.setavailability(4, 'week')
dummy2

User dummy2 has a Intermediate skillset and is available for 4 hours a week

In [38]:
recommend('Introduction to Natural Language Processing in Python', cosine_sim, user=dummy2)

17    Natural Language Generation in Python
3     Supervised Learning with scikit-learn
Name: title, dtype: object

In [39]:
recommend('Introduction to Natural Language Processing in Python', cosine_sim, user=dummy1)

No suitable recommendations found for your profile. Would you be interested in a learning path to build your skillset?


Series([], Name: title, dtype: object)

In [40]:
dummy3 = User('dummy3')
dummy3.setbackground('Advanced')
dummy3.setavailability(10, 'week')
dummy3

User dummy3 has a Advanced skillset and is available for 10 hours a week

In [41]:
dummy4 = User('dummy4')
dummy4.setbackground('Beginner')
dummy4.setavailability(4, 'week')
dummy4

User dummy4 has a Beginner skillset and is available for 4 hours a week

In [42]:
recommend('Machine Learning for Time Series Data in Python', cosine_sim, user=dummy3)

2                Machine Learning for Finance in Python
1    Feature Engineering for Machine Learning in Python
3                 Supervised Learning with scikit-learn
Name: title, dtype: object

In [43]:
recommend('Machine Learning for Time Series Data in Python', cosine_sim, user=dummy4)

2    Machine Learning for Finance in Python
3     Supervised Learning with scikit-learn
Name: title, dtype: object

In [44]:
recommend('Natural Language Generation in Python', cosine_sim, user=dummy2)

4     Introduction to Natural Language Processing in Python
12                    Image Processing with Keras in Python
Name: title, dtype: object

Clearly, as we can see above, using only the description blurbs does not give us enough contextual information to generate meaningful recommendations.

## Combining multiple features into one blurb

Let's combine multiple features (namely the `title`, `descriptions` and `paths` info) and explore the recommendations from that.

In [45]:
features = ['descriptions', 'paths']

In [46]:
for feature in features:
    df[feature] = df[feature].apply(to_lower)

In [47]:
def create_feature(x):
    title = x['title']
    descriptions = x['descriptions']
    paths = x['paths']
    return title + " " + descriptions + " " + paths

In [48]:
df['acc_features'] = df.apply(create_feature, axis=1)

In [49]:
count_vectorizer = CountVectorizer(stop_words='english')

In [50]:
count_matrix = count_vectorizer.fit_transform(df['acc_features'])

In [51]:
cosine_sim1 = cosine_similarity(count_matrix, count_matrix)

In [52]:
cosine_sim1.shape

(18, 18)

In [53]:
# drop=True keeps this cell idempotent: without it, every run inserts the
# old index into the frame as an extra column.
df = df.reset_index(drop=True)
# Lookup from course title to row position.
idxs = pd.Series(df.index, index = df['title'])

In [55]:
recommend('Machine Learning for Marketing in Python', cosine_sim1, npreds=3, user=dummy1)

No suitable recommendations found for your profile. Would you be interested in a learning path to build your skillset?


Series([], Name: title, dtype: object)

## Allowing search

Thus far, we've only been able to get recommendations based on an input that's already present in our corpus. What if we wanted to be more generic and search instead by keywords rather than an existing entity?

**NOTE**: Needs refinement, maybe custom word embeddings?

In [56]:
nlp = spacy.load('en_core_web_md')

In [57]:
def get_results(res:list):
    """Return ``[title, url]`` pairs for the rows of ``df`` at the positions in ``res``.

    The output preserves the order of ``res``.
    """
    rows = (df.iloc[position] for position in res)
    return [[row['title'], row['url']] for row in rows]

In [58]:
def search(query: str = None, npreds:int = 3):
    """Return the ``npreds`` courses whose combined features best match ``query``.

    The query and each entry of ``df['acc_features']`` are parsed with
    ``nlp`` and compared with ``Doc.similarity``; the highest-scoring row
    positions are turned into ``[title, url]`` pairs by ``get_results``.
    """
    query_doc = nlp(query)
    # Row position -> similarity between the query and that row's features.
    scores = {
        position: query_doc.similarity(nlp(text))
        for position, text in enumerate(df['acc_features'])
    }
    ranked = sorted(scores, key=lambda position: scores[position], reverse=True)
    return get_results(ranked[:npreds])

In [59]:
response = search("Deep Learning", npreds=3)
response

[['Image Processing with Keras in Python',
  'https://www.datacamp.com/courses/image-processing-with-keras-in-python'],
 ['Introduction to Deep Learning with PyTorch',
  'https://www.datacamp.com/courses/introduction-to-deep-learning-with-pytorch'],
 ['Natural Language Generation in Python',
  'https://www.datacamp.com/courses/natural-language-generation-in-python']]

In [60]:
response = search("Spark", npreds=3)
response

[['Machine Learning with PySpark',
  'https://www.datacamp.com/courses/machine-learning-with-pyspark'],
 ['Natural Language Generation in Python',
  'https://www.datacamp.com/courses/natural-language-generation-in-python'],
 ['AI Fundamentals', 'https://www.datacamp.com/courses/ai-fundamentals']]

In [61]:
response = search("Marketing", npreds=3)
response

[['Machine Learning for Marketing in Python',
  'https://www.datacamp.com/courses/machine-learning-for-marketing-in-python'],
 ['Machine Learning for Business',
  'https://www.datacamp.com/courses/machine-learning-for-business'],
 ['Building Recommendation Engines in PySpark',
  'https://www.datacamp.com/courses/recommendation-engines-in-pyspark']]

## Moving on from video courses to articles and tech blogs