In [None]:
#imports
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
import spacy
!python -m spacy download en_core_web_lg

In [3]:
pd.set_option('display.max_colwidth', 100)

In [4]:
datapath = Path.cwd()/'data'
dataset = datapath/'dataset.csv'

In [5]:
df = pd.read_csv(dataset)

In [6]:
df.describe()

Unnamed: 0,title,short_description,long_description,provider,url,time,language,paths,prerequisites,medium,type
count,18,18,18,18,18,18,18,17,16,18,18
unique,18,18,18,1,18,2,1,14,13,1,1
top,Machine Learning for Time Series Data in Python,Learn to create deep learning models with the PyTorch library.,"Time series data is ubiquitous. Whether it be stock market fluctuations, sensor data recording c...",DataCamp,https://www.datacamp.com/courses/machine-learning-for-business,4 hours,Python,Data Skills for Business,Supervised Learning with scikit-learn,video,course
freq,1,1,1,18,1,16,18,2,3,18,18


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              18 non-null     object
 1   short_description  18 non-null     object
 2   long_description   18 non-null     object
 3   provider           18 non-null     object
 4   url                18 non-null     object
 5   time               18 non-null     object
 6   language           18 non-null     object
 7   paths              17 non-null     object
 8   prerequisites      16 non-null     object
 9   medium             18 non-null     object
 10  type               18 non-null     object
dtypes: object(11)
memory usage: 1.7+ KB


## Cleaning and Wrangling

## Sanitize columns

In [8]:
df['prerequisites'].fillna("", inplace=True)

In [9]:
df['paths'].fillna("", inplace=True)

In [10]:
df[['paths', 'prerequisites']]

Unnamed: 0,paths,prerequisites
0,"Machine Learning for Everyone,Big Data with PySpark,Machine Learning Scientist with Python","Introduction to PySpark,Statistical Thinking in Python (Part 1)"
1,"Machine Learning for Everyone,Machine Learning Scientist with Python","pandas Foundations,Supervised Learning with scikit-learn"
2,,Supervised Learning with scikit-learn
3,"Data Science for Everyone,Machine Learning for Everyone,Data Scientist with Python,Machine Lear...",Statistical Thinking in Python (Part 1)
4,"Machine Learning for Everyone,Machine Learning Scientist with Python,Natural Language Processing...",Python Data Science Toolbox (Part 2)
5,"Data Science for Everyone,Machine Learning for Everyone,Data Scientist with Python,Machine Lear...",Supervised Learning with scikit-learn
6,"Machine Learning for Everyone,Machine Learning Scientist with Python","Supervised Learning with scikit-learn,Case Study: School Budgeting with Machine Learning in Python"
7,Data Skills for Business,
8,"Machine Learning for Everyone,Machine Learning Fundamentals with Python,Machine Learning Scienti...",Supervised Learning with scikit-learn
9,"Machine Learning for Everyone,Machine Learning Scientist with Python,Time Series with Python","Manipulating Time Series Data in Python,Visualizing Time Series Data in Python,Supervised Learni..."


## Get prerequisite count

For now, because we don't have contextual info, we'll use the prerequisite count of a course as a proxy of its complexity. Once we have that, we can do a simple `pd.cut` to assign a categorical label(beginner, intermediate, advanced) to each course.

**NOTE**: The process below will assign a complexity label to each course strictly based on the number of prerequisites each item has. This may or may not correspond to the actual complexity of a course, but for the sake of prototyping, we'll leave it be and come back once other parts of the pipeline are fleshed out.

In [11]:
def get_preq_count(row):
    if row == "":
        return 0
    return len(row.split(","))

In [12]:
df['prereq_count'] = df['prerequisites'].apply(get_preq_count)

In [13]:
df['complexity'] = pd.cut(df['prereq_count'], 3, labels=['Beginner', 'Intermediate', 'Advanced'])

In [72]:
df.tail()

Unnamed: 0,index,title,short_description,long_description,provider,url,time,language,paths,prerequisites,medium,type,prereq_count,complexity,descriptions,acc_features
13,13,Advanced NLP with spaCy,"Learn how to use spaCy to build advanced natural language understanding systems, using both rule...","If you're working with a lot of text, you'll eventually want to know more about it. For example,...",DataCamp,https://www.datacamp.com/courses/advanced-nlp-with-spacy,5 hours,Python,natural language processing in python,Introduction to Natural Language Processing in Python,video,course,1,Beginner,"learn how to use spacy to build advanced natural language understanding systems, using both rule...",Advanced NLP with spaCy learn how to use spacy to build advanced natural language understanding ...
14,14,Sentiment Analysis in Python,Are customers thrilled with your products or is your service lacking? Learn how to perform an en...,Have you left a review to express how you feel about a product or a service? And do you have a h...,DataCamp,https://www.datacamp.com/courses/sentiment-analysis-in-python,4 hours,Python,natural language processing in python,Python Data Science Toolbox (Part 2),video,course,1,Beginner,are customers thrilled with your products or is your service lacking? learn how to perform an en...,Sentiment Analysis in Python are customers thrilled with your products or is your service lackin...
15,15,Building Recommendation Engines in PySpark,Learn tools and techniques to leverage your own big data to facilitate positive experiences for ...,This course will show you how to build recommendation engines using Alternating Least Squares i...,DataCamp,https://www.datacamp.com/courses/recommendation-engines-in-pyspark,4 hours,Python,big data with pyspark,"Introduction to PySpark,Supervised Learning with scikit-learn",video,course,2,Intermediate,learn tools and techniques to leverage your own big data to facilitate positive experiences for ...,Building Recommendation Engines in PySpark learn tools and techniques to leverage your own big d...
16,16,Machine Learning for Marketing in Python,"From customer lifetime value, predicting churn to segmentation - learn and implement Machine Lea...","The rise of machine learning (almost sounds like ""rise of the machines""?) and applications of st...",DataCamp,https://www.datacamp.com/courses/machine-learning-for-marketing-in-python,4 hours,Python,marketing analytics with python,"Data Manipulation with pandas,Supervised Learning with scikit-learn",video,course,2,Intermediate,"from customer lifetime value, predicting churn to segmentation - learn and implement machine lea...","Machine Learning for Marketing in Python from customer lifetime value, predicting churn to segme..."
17,17,Natural Language Generation in Python,Learn to generate language in Python using Deep Learning in Python.,"Have you ever wondered how Gmail autocompletes your sentences, or, what powers the WhatsApp sugg...",DataCamp,https://www.datacamp.com/courses/natural-language-generation-in-python,4 hours,Python,deep learning for nlp in python,"Introduction to Natural Language Processing in Python,Advanced Deep Learning with Keras",video,course,2,Intermediate,learn to generate language in python using deep learning in python. have you ever wondered how g...,Natural Language Generation in Python learn to generate language in python using deep learning i...


## Generate `description` feature

In [15]:
assert all(df['short_description'].isnull()) == False
assert all(df['long_description'].isnull()) == False

In [16]:
df['descriptions'] = df['short_description'] + " " + df['long_description']

In [17]:
df['descriptions'][0]

"Learn how to make predictions with Apache Spark. Spark is a powerful, general purpose tool for working with Big Data. Spark transparently handles the distribution of compute tasks across a cluster. This means that operations are fast, but it also allows you to focus on the analysis rather than worry about technical details. In this course you'll learn how to get data into Spark and then delve into the three fundamental Spark Machine Learning algorithms: Linear Regression, Logistic Regression/Classifiers, and creating pipelines. Along the way you'll analyse a large dataset of flight delays and spam text messages. With this background you'll be ready to harness the power of Spark and apply it on your own Machine Learning projects!"

In [18]:
def to_lower(row):
    return row.lower()

In [19]:
df['descriptions'] = df['descriptions'].apply(to_lower)

# Recommendations based on Course Description

A simple recommendation system based on the short and long description blurbs

In [20]:
tfidf = TfidfVectorizer(stop_words='english')

In [21]:
matrix = tfidf.fit_transform(df['descriptions'])

In [22]:
cosine_sim = cosine_similarity(matrix, matrix)

In [23]:
title2id = pd.Series(df.index, index = df['title'])

In [24]:
def recommend(title: str = None, measure:str = None, npreds:int = 3, tolower:bool = False):
    if tolower:
        title = title.lower()
    idx = title2id[title]
    score = list(enumerate(measure[idx]))
    score = sorted(score, key = lambda x: x[1], reverse = True)
    score = score[1:npreds+1]
    idxs = [i[0] for i in score]
    return df['title'].iloc[idxs]

In [25]:
recommend('Machine Learning for Marketing in Python', cosine_sim)

10                               AI Fundamentals
7                  Machine Learning for Business
15    Building Recommendation Engines in PySpark
Name: title, dtype: object

Clearly, as we can see above, just using the description blurbs is not giving us enough contextual information to generate meeaningful recommendations.

## Combining multiple features into one blurb

Let's combine multiple features(namely the `descriptions, paths` info and explore the recommendations from that

In [26]:
features = ['descriptions', 'paths']

In [27]:
for feature in features:
    df[feature] = df[feature].apply(to_lower)

In [28]:
def create_feature(x):
    title = x['title']
    descriptions = x['descriptions']
    paths = x['paths']
    return title + " " + descriptions + " " + paths

In [29]:
df['acc_features'] = df.apply(create_feature, axis=1)

In [30]:
count_vectorizer = CountVectorizer(stop_words='english')

In [31]:
count_matrix = count_vectorizer.fit_transform(df['acc_features'])

In [32]:
cosine_sim1 = cosine_similarity(count_matrix, count_matrix)

In [33]:
cosine_sim1.shape

(18, 18)

In [34]:
df = df.reset_index()
idxs = pd.Series(df.index, index = df['title'])

In [35]:
recommend('Machine Learning for Marketing in Python', cosine_sim1)

3                  Supervised Learning with scikit-learn
1     Feature Engineering for Machine Learning in Python
10                                       AI Fundamentals
Name: title, dtype: object

## Allowing search

Thus far, we've only been able to get recommendations based on an input that's already present in our corpus. What if we wanted to be more generic and search instead by keywords rather than an existing entity?

**NOTE**: Needs refinement, maybe custom word embeddings?

In [132]:
nlp = spacy.load('en_core_web_md')

In [159]:
def get_results(res:list):
    ans = []
    for item in res:
        data = df.iloc[item]
        title, url = data['title'], data['url']
        ans.append([title, url])
    return ans

In [160]:
def search(query: str = None, npreds:int = 5):
    doc1 = nlp(query)
    similarities = {}
    for idx, feature in enumerate(df['acc_features']):
        doc2 = nlp(feature)
        similarities[idx] = doc1.similarity(doc2)
    
    preds = sorted(similarities, key=similarities.get, reverse=True)[:npreds]
    results = get_results(preds)
    return results

In [168]:
response = search("I want to learn Engineering")
response

[['Linear Classifiers in Python',
  'https://www.datacamp.com/courses/linear-classifiers-in-python'],
 ['Advanced NLP with spaCy',
  'https://www.datacamp.com/courses/advanced-nlp-with-spacy'],
 ['Natural Language Generation in Python',
  'https://www.datacamp.com/courses/natural-language-generation-in-python'],
 ['Supervised Learning with scikit-learn',
  'https://www.datacamp.com/courses/supervised-learning-with-scikit-learn'],
 ['Introduction to Natural Language Processing in Python',
  'https://www.datacamp.com/courses/introduction-to-natural-language-processing-in-python']]