# Online Course Recommender


---


## Practice Module: Intelligent Reasoning Systems (IRS)

## Data Preparation: Edx Courses



## 0. File Path & Library Setup

In [1]:
# Load All Necessary Packages

import os
from google.colab import drive

import pandas as pd
import numpy as np
import re

seed = 18

print('Versions of key libraries')
print('-------------------------')
print('pandas:  ', pd.__version__)
print('numpy:   ', np.__version__)

Versions of key libraries
-------------------------
pandas:   1.1.5
numpy:    1.19.5


In [2]:
# Mounting to Google Drive
drive.mount('/content/gdrive')

# Change Working Directory
os.chdir('/content/gdrive/My Drive/iss/irspm/')

print('Working Directory: ')
!pwd

Mounted at /content/gdrive
Working Directory: 
/content/gdrive/My Drive/iss/irspm


In [3]:
# Load Data (Raw Web Scrapped Data - Edx Courses)

rawdata = pd.read_csv('coursera_courses_eng_1.csv')

In [4]:
rawdata.shape

(5726, 19)

In [5]:
rawdata.head()

Unnamed: 0,Title,Category,Sub-category,CourseOrProject,Difficulty,Requirements,Instructor(s),Institution(s),No. of Students Enrolled,Rating,No. of ratings,Free Access?,Upgrade to Cert,Duration,Skills,Main Language,Subtitles,Description,Link
0,Add Gore to Your Game in Unity,Computer Science,Software Development,Project,Beginner,Not mentioned,Edward Falzon,Coursera Project Network,,,,No,Paid,2 hours,"Video-game Development, C sharp (C#) Programmi...",English,,"In this Guided Project, you will:Become famili...",https://www.coursera.org/learn/add-gore-game-u...
1,Advanced Topics in Derivative Pricing,Business,Finance,Course,Intermediate Level,Students should have taken intermediate to adv...,"Garud Iyengar, Ali Hirsa, Martin Haugh",Columbia University,9380 recent views,Not mentioned,Not mentioned,Yes,Paid,Approx. 16 hours to complete,"Implied Volatility, Synthetic Collateralised D...",English,English,This course discusses topics in derivative pri...,https://www.coursera.org/learn/financial-engin...
2,Approve Social Media Posts with Zapier and Trello,Business,Marketing,Project,Beginner-friendly,Familiarity with popular social media platform...,Carmen Rojas,Coursera Project Network,,,,Free Guided Project,Free,2 hours,"Marketing, Marketing Strategy, Content Marketi...",English,,"In this Free Guided Project, you will:Document...",https://www.coursera.org/learn/approve-social-...
3,Automate Blog Advertisements with Zapier,Business,Marketing,Project,Intermediate,Basic familiarity with a Blog website as well ...,Carmen Rojas,Coursera Project Network,,,,Free Guided Project,Free,2 hours,"Advertising, Social Media, Blogging, Marketing",English,,"In this Free Guided Project, you will:Register...",https://www.coursera.org/learn/automate-blog-a...
4,Basic Descriptives using R Cmdr,Data Science,Data Analysis,Project,Beginner,Not mentioned,Shalini Gopalkrishnan,Coursera Project Network,,,,No,Paid,1 hour,"Statistics, no coding analysis",English,,"In this Guided Project, you will:Create and in...",https://www.coursera.org/learn/basic-desc-r-cmdr


In [6]:
# Adding the short description column just for uniformity (since coursera only has long description)

final = pd.DataFrame(columns = ['Name','URL','Categories', 'Short Description','Long Description', 'Difficulty', 'Duration', 'Free Option', 'Rating', 'Paid Option', 'Language', 'Subtitle Language', 'Platform', 'Provider', 'Image URL'])
final

Unnamed: 0,Name,URL,Categories,Short Description,Long Description,Difficulty,Duration,Free Option,Rating,Paid Option,Language,Subtitle Language,Platform,Provider,Image URL


## Drop unwanted data

In [7]:
# drop instructors, bestseller columns
df = rawdata.drop(columns = ['CourseOrProject','Requirements','Instructor(s)','Skills'])
df.shape

(5726, 15)

## 1. Custom functions

In [8]:
# Filters for english courses

list_lang = ['English', 'english', 'ENGLISH']

def lang_eng(lang):
  return (lang in list_lang)


In [9]:
# Convert course difficulty from string to numbers in increasing difficulty order

print('All levels of difficulty:')
print(rawdata['Difficulty'].unique())

def conv_diff(diff_string):
  # first make all alphabets to lower case
  diff_string = diff_string.lower()
  if 'beginner' in diff_string:
    diff = 0
  elif 'intermediate' in diff_string:
    diff = 1
  elif 'advanced' in diff_string:
    diff = 2
  else:
    diff = 1
  return diff
  

All levels of difficulty:
['Beginner' 'Intermediate Level' 'Beginner-friendly' 'Intermediate'
 'Advanced Level' 'Beginner Level' 'Advanced' 'Not mentioned']


In [10]:
# Extract course duration in hours
# Coursera raw data examples (hours): 'one hour','1 hour','1hour','1-1.5 hours','1.25','1 week of study, 2 hours','2 Hour','2-hour course','1.03 hours'
# Coursera raw data examples (minutes): '100 Minutes','60-75 minutes'
# Coursera raw data examples (hours&minutes): '1 hr 10 minutes','2h 10m','1h 20min','1-hour 30-minutes'
# Special cases: '1 to 2 hours in total','One to two hours','5 hours (1hr video content)', 'Not mentioned'

def extract_dur(strDur):
  # first make all alphabets to lower case
  strDur = strDur.lower()
  numDur = -1
  
  # replace any numbers (1-10 only) in words to digits
  numDict = {'one':1,'two':2,'three':3,'four':4,'five':5,'six':6,'seven':7,'eight':8,'nine':9,'ten':10}
  for num in numDict.keys():
    if num in strDur:
      strDur = strDur.replace(num,str(numDict[num]))

  # handling this pattern of duration string ('1 to 2 hours in total','One to two hours')
  x = re.search(".*to.*hour", strDur)
  if x:
    numDur = 0
    durList = re.findall("\d",strDur)
    for i, dur in enumerate(durList):
      numDur = numDur + int(dur)/len(durList)

  # handling this pattern of duration string ('1 hr 10 minutes','2h 10m','1h 20min','1-hour 30-minutes')
  x = re.search(".*-h.*-m",strDur)
  if x:
    durList = [num for num in re.split(r'[-\s]\s*', x[0]) if num.isdigit()]
    if len(durList) > 0:
      numDur = int(durList[0])
  x = re.search(".*h.*m",strDur)
  if x:
    durList = [num for num in re.split(r'[hm\s]', x[0]) if num.isdigit()]
    if len(durList) > 0:
      numDur = int(durList[0])

  # handling this pattern of duration string ('100 Minutes','60-75 minutes')
  x = re.search(".*-.*min",strDur)
  if x:
    durList =  [num for num in re.split(r'[-\s]', x[0]) if num.isdigit()]
    numDur = (int(durList[0])+int(durList[-1])) / (2*60)
  x = re.search(".*min",strDur)
  if x:
    durList =  [num for num in re.split(r'[m\s]', x[0]) if num.isdigit()]

  # handling this pattern of duration string ('one hour','1 hour','1hour','1-1.5 hours','1.25','1 week of study, 2 hours','2 Hour','2-hour course','1.03 hours')
  # rounding down the lower range to int
  x = re.search(".*week.*hour",strDur)
  if x:
    durList =  [num for num in re.split(r'[-.h\s]', x[0]) if num.isdigit()]
    numDur = int(durList[-1])
  x = re.search(".*hour",strDur)
  if x:
    durList =  [num for num in re.split(r'[-.h\s]', x[0]) if num.isdigit()]
    numDur = int(durList[0])

  # if none of the above patterns are found then simply take the first number under duration
  if numDur == -1:
    numDur = [int(i) for i in re.split(r'[.\s]', strDur) if i.isdigit()]
    if len(numDur) > 0:
      numDur = numDur[0]
    else:
      numDur = 10
  return numDur


In [11]:
# Convert course duration in hours to categorical

def conv_dur(dur):
  dur = int(dur)
  if pd.isna(dur):
    cat_dur = np.nan
  elif (dur > 0) and (dur <= 10):
    cat_dur = 0
  elif (dur > 10) and (dur <= 50):
    cat_dur = 1
  else:
    cat_dur = 2
  return cat_dur
  

In [12]:
def isfree(strs):
  if 'free' or 'yes' in strs.lower():
    f = 1
  else:
    f = 0
  return f

## 2. Preprocess Data

### Set final 'categories' as comma-separated string concatenation of categories + sub-categories

In [13]:
final['Categories'] = df['Category'] + ', ' + df['Sub-category']

###Set Language as English



In [14]:
# For Coursera only English courses are scraped, so no need to filter
# Set language as English in the final df

final['Language'] = ['English' for i in range(len(final['Name']))]

### Get Course Name, Course Description, Course URL Column

In [15]:
final['Name'] = df.Title
final['Long Description'] = df.Description
final['URL'] = df.Link
final.head()

Unnamed: 0,Name,URL,Categories,Short Description,Long Description,Difficulty,Duration,Free Option,Rating,Paid Option,Language,Subtitle Language,Platform,Provider,Image URL
0,Add Gore to Your Game in Unity,https://www.coursera.org/learn/add-gore-game-u...,"Computer Science, Software Development",,"In this Guided Project, you will:Become famili...",,,,,,English,,,,
1,Advanced Topics in Derivative Pricing,https://www.coursera.org/learn/financial-engin...,"Business, Finance",,This course discusses topics in derivative pri...,,,,,,English,,,,
2,Approve Social Media Posts with Zapier and Trello,https://www.coursera.org/learn/approve-social-...,"Business, Marketing",,"In this Free Guided Project, you will:Document...",,,,,,English,,,,
3,Automate Blog Advertisements with Zapier,https://www.coursera.org/learn/automate-blog-a...,"Business, Marketing",,"In this Free Guided Project, you will:Register...",,,,,,English,,,,
4,Basic Descriptives using R Cmdr,https://www.coursera.org/learn/basic-desc-r-cmdr,"Data Science, Data Analysis",,"In this Guided Project, you will:Create and in...",,,,,,English,,,,


### v. Get Course Difficulty

In [18]:
# Extract Course Difficulty, converted to numerical category and normalised

final['Difficulty'] = df['Difficulty'].apply(lambda x: conv_diff(x)) # difficulty in numerical category

final.head()

Unnamed: 0,Name,URL,Categories,Short Description,Long Description,Difficulty,Duration,Free Option,Rating,Paid Option,Language,Subtitle Language,Platform,Provider,Image URL
0,Add Gore to Your Game in Unity,https://www.coursera.org/learn/add-gore-game-u...,"Computer Science, Software Development",,"In this Guided Project, you will:Become famili...",0,,,,,English,,,,
1,Advanced Topics in Derivative Pricing,https://www.coursera.org/learn/financial-engin...,"Business, Finance",,This course discusses topics in derivative pri...,1,,,,,English,,,,
2,Approve Social Media Posts with Zapier and Trello,https://www.coursera.org/learn/approve-social-...,"Business, Marketing",,"In this Free Guided Project, you will:Document...",0,,,,,English,,,,
3,Automate Blog Advertisements with Zapier,https://www.coursera.org/learn/automate-blog-a...,"Business, Marketing",,"In this Free Guided Project, you will:Register...",1,,,,,English,,,,
4,Basic Descriptives using R Cmdr,https://www.coursera.org/learn/basic-desc-r-cmdr,"Data Science, Data Analysis",,"In this Guided Project, you will:Create and in...",0,,,,,English,,,,


### vi. Get Course Duration

In [19]:
# Get Course Duration in unit of hours

data_dur = df['Duration'].apply(lambda x: extract_dur(x))
data_dur = data_dur.apply(lambda x: conv_dur(x)) # convert duration to categorical 0/1/2

final['Duration'] = data_dur

final.shape

(5726, 15)

### vii. Get Course Rating

In [20]:
type(df['No. of Students Enrolled'][0])

float

In [21]:
# replacing values with 'recent views' with integer number 
def get_enrolls(strs):
  if isinstance(strs, str) and 'recent views' in strs:
    num = re.findall(r'[0-9]+',strs)[0]
    # dividing the number of views by 20 (assume 1 in 20 ppl take the course)
    num = int(num) / 20
    return int(num)

num_enrolls = df['No. of Students Enrolled'].apply(lambda x:get_enrolls(x))

# replaces 'Not Mentioned' or 'N/A' with NaN and changes data type to numeric
df['No. of ratings'] = pd.to_numeric(df['No. of ratings'],errors='coerce')
df.Rating = pd.to_numeric(df.Rating,errors='coerce')
df['No. of Students Enrolled'] = pd.to_numeric(df['No. of Students Enrolled'],errors='coerce')

# calculate the median values for replacing NaN values
medianRating = df.Rating.median() 
medianNumRatings = df['No. of ratings'].median()
medianEnrolled = df['No. of Students Enrolled'].median()

# replace NaN values with mean values
df.Rating.fillna(medianRating,inplace = True)
df['No. of ratings'].fillna(medianNumRatings,inplace = True)
df['No. of Students Enrolled'].fillna(medianEnrolled,inplace = True)

num_ratings = df['No. of ratings']
ratings = df.Rating
maxenroll = df['No. of Students Enrolled'].max()
print('the maximum enroll number is:',maxenroll)
final['Rating'] = ratings/5.0*num_ratings/maxenroll

the maximum enroll number is: 4386627.0


### viii. Get Subtitle Language

In [22]:
# Get course list of subtitle language available

df.Subtitles.fillna('English(default)',inplace = True)
final['Subtitle Language'] = df['Subtitles']
final.head()

Unnamed: 0,Name,URL,Categories,Short Description,Long Description,Difficulty,Duration,Free Option,Rating,Paid Option,Language,Subtitle Language,Platform,Provider,Image URL
0,Add Gore to Your Game in Unity,https://www.coursera.org/learn/add-gore-game-u...,"Computer Science, Software Development",,"In this Guided Project, you will:Become famili...",0,0,,5.7e-05,,English,English(default),,,
1,Advanced Topics in Derivative Pricing,https://www.coursera.org/learn/financial-engin...,"Business, Finance",,This course discusses topics in derivative pri...,1,1,,5.7e-05,,English,English,,,
2,Approve Social Media Posts with Zapier and Trello,https://www.coursera.org/learn/approve-social-...,"Business, Marketing",,"In this Free Guided Project, you will:Document...",0,0,,5.7e-05,,English,English(default),,,
3,Automate Blog Advertisements with Zapier,https://www.coursera.org/learn/automate-blog-a...,"Business, Marketing",,"In this Free Guided Project, you will:Register...",1,0,,5.7e-05,,English,English(default),,,
4,Basic Descriptives using R Cmdr,https://www.coursera.org/learn/basic-desc-r-cmdr,"Data Science, Data Analysis",,"In this Guided Project, you will:Create and in...",0,0,,5.7e-05,,English,English(default),,,


### viv. Free Option

In [23]:
# Check if free option is available

final['Free Option'] = df['Free Access?'].apply(lambda x:isfree(x))
final.head()

Unnamed: 0,Name,URL,Categories,Short Description,Long Description,Difficulty,Duration,Free Option,Rating,Paid Option,Language,Subtitle Language,Platform,Provider,Image URL
0,Add Gore to Your Game in Unity,https://www.coursera.org/learn/add-gore-game-u...,"Computer Science, Software Development",,"In this Guided Project, you will:Become famili...",0,0,1,5.7e-05,,English,English(default),,,
1,Advanced Topics in Derivative Pricing,https://www.coursera.org/learn/financial-engin...,"Business, Finance",,This course discusses topics in derivative pri...,1,1,1,5.7e-05,,English,English,,,
2,Approve Social Media Posts with Zapier and Trello,https://www.coursera.org/learn/approve-social-...,"Business, Marketing",,"In this Free Guided Project, you will:Document...",0,0,1,5.7e-05,,English,English(default),,,
3,Automate Blog Advertisements with Zapier,https://www.coursera.org/learn/automate-blog-a...,"Business, Marketing",,"In this Free Guided Project, you will:Register...",1,0,1,5.7e-05,,English,English(default),,,
4,Basic Descriptives using R Cmdr,https://www.coursera.org/learn/basic-desc-r-cmdr,"Data Science, Data Analysis",,"In this Guided Project, you will:Create and in...",0,0,1,5.7e-05,,English,English(default),,,


### x. Paid Option

In [24]:
# Get Cost for paid option

final['Paid Option'] = df['Upgrade to Cert'].apply(lambda x: 0 if x == 'Free' else 'Refer to link for price')
final.head()

Unnamed: 0,Name,URL,Categories,Short Description,Long Description,Difficulty,Duration,Free Option,Rating,Paid Option,Language,Subtitle Language,Platform,Provider,Image URL
0,Add Gore to Your Game in Unity,https://www.coursera.org/learn/add-gore-game-u...,"Computer Science, Software Development",,"In this Guided Project, you will:Become famili...",0,0,1,5.7e-05,Refer to link for price,English,English(default),,,
1,Advanced Topics in Derivative Pricing,https://www.coursera.org/learn/financial-engin...,"Business, Finance",,This course discusses topics in derivative pri...,1,1,1,5.7e-05,Refer to link for price,English,English,,,
2,Approve Social Media Posts with Zapier and Trello,https://www.coursera.org/learn/approve-social-...,"Business, Marketing",,"In this Free Guided Project, you will:Document...",0,0,1,5.7e-05,0,English,English(default),,,
3,Automate Blog Advertisements with Zapier,https://www.coursera.org/learn/automate-blog-a...,"Business, Marketing",,"In this Free Guided Project, you will:Register...",1,0,1,5.7e-05,0,English,English(default),,,
4,Basic Descriptives using R Cmdr,https://www.coursera.org/learn/basic-desc-r-cmdr,"Data Science, Data Analysis",,"In this Guided Project, you will:Create and in...",0,0,1,5.7e-05,Refer to link for price,English,English(default),,,


### xi. Adding a Column for platform identifier

In [25]:
# Add Platform name
# 0 - Edx
# 1 - Udemy
# 2 - Coursera

final['Platform'] = [2 for i in range(len(final['Name']))]

### Get the Provider

In [26]:
final['Provider'] = df['Institution(s)']

### Adding the Image URL

In [27]:
courseraLogoURL = 'https://about.coursera.org/static/blueCoursera-646f855eae3d677239ea9db93d6c9e17.svg'
final['Image URL'] = [courseraLogoURL for i in range(len(final['Name']))]

## 3. Combine data 

In [28]:
final

Unnamed: 0,Name,URL,Categories,Short Description,Long Description,Difficulty,Duration,Free Option,Rating,Paid Option,Language,Subtitle Language,Platform,Provider,Image URL
0,Add Gore to Your Game in Unity,https://www.coursera.org/learn/add-gore-game-u...,"Computer Science, Software Development",,"In this Guided Project, you will:Become famili...",0,0,1,0.000057,Refer to link for price,English,English(default),2,Coursera Project Network,https://about.coursera.org/static/blueCoursera...
1,Advanced Topics in Derivative Pricing,https://www.coursera.org/learn/financial-engin...,"Business, Finance",,This course discusses topics in derivative pri...,1,1,1,0.000057,Refer to link for price,English,English,2,Columbia University,https://about.coursera.org/static/blueCoursera...
2,Approve Social Media Posts with Zapier and Trello,https://www.coursera.org/learn/approve-social-...,"Business, Marketing",,"In this Free Guided Project, you will:Document...",0,0,1,0.000057,0,English,English(default),2,Coursera Project Network,https://about.coursera.org/static/blueCoursera...
3,Automate Blog Advertisements with Zapier,https://www.coursera.org/learn/automate-blog-a...,"Business, Marketing",,"In this Free Guided Project, you will:Register...",1,0,1,0.000057,0,English,English(default),2,Coursera Project Network,https://about.coursera.org/static/blueCoursera...
4,Basic Descriptives using R Cmdr,https://www.coursera.org/learn/basic-desc-r-cmdr,"Data Science, Data Analysis",,"In this Guided Project, you will:Create and in...",0,0,1,0.000057,Refer to link for price,English,English(default),2,Coursera Project Network,https://about.coursera.org/static/blueCoursera...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5721,iPhone Application Flow with Wireframes in Miro,https://www.coursera.org/learn/iphone-applicat...,"Computer Science, Design and Product",,"In this Guided Project, you will:Identify use ...",0,0,1,0.000057,Refer to link for price,English,English(default),2,Coursera Project Network,https://about.coursera.org/static/blueCoursera...
5722,idea 2 IMPACT: An Introduction to Translating ...,https://www.coursera.org/learn/idea-2-impact,"Health, Research",,"idea 2 IMPACT (i2I) is an online, 6-week cours...",1,1,1,0.000057,Refer to link for price,English,English,2,University of Pittsburgh,https://about.coursera.org/static/blueCoursera...
5723,Â¿QuÃ© son los medios sociales?,https://www.coursera.org/learn/what-is-social-es,"Business, Marketing",,El curso en lÃ­nea abierto y masivo (Massive O...,1,0,1,0.000057,Refer to link for price,English,"English, Spanish, Arabic, Japanese",2,Northwestern University,https://about.coursera.org/static/blueCoursera...
5724,Ð¡/C++ for competitive programming,https://www.coursera.org/learn/c-for-competiti...,"Computer Science, Algorithms",,C and C++ are the most popular programming lan...,0,2,1,0.000057,Refer to link for price,English,English,2,Moscow Institute of Physics and Technology,https://about.coursera.org/static/blueCoursera...


## 4. Save Output to file

In [29]:
filename = 'Cousera_Data.csv'
final.to_csv(filename, index=False, encoding='utf_8_sig')