# Online Course Recommender


---


## Practice Module: Intelligent Reasoning Systems (IRS)

## Data Preparation: Edx Courses



## 0. File Path & Library Setup

In [1]:
# Load All Necessary Packages

import os
# from google.colab import drive

import pandas as pd
import numpy as np

seed = 18

print('Versions of key libraries')
print('-------------------------')
print('pandas:  ', pd.__version__)
print('numpy:   ', np.__version__)

Versions of key libraries
-------------------------
pandas:   1.3.3
numpy:    1.19.5


In [2]:
# # Mounting to Google Drive
# drive.mount('/content/gdrive')

# # Change Working Directory
# os.chdir('/content/gdrive/My Drive/iss/irs_pm/')

print('Working Directory: ')
os.getcwd()

Working Directory: 


'D:\\Personal\\Education\\NUS-ISS Mtech IS\\Course Materials\\1. Intelligent Reasoning Systems (IRS)\\0. IRS-PM_Practice Module\\Source Code\\Data Preparation\\Edx'

In [3]:
# Load Data (Raw Web Scrapped Data - Edx Courses)

rawdata = pd.read_csv('Edx_Courses.csv')

In [4]:
rawdata.shape

(3506, 16)

In [5]:
rawdata.head()

Unnamed: 0,Course Name,Multi Categories,Course URL,Course Provider,Short Description,Long Description,Difficulty,Duration,Commitment,Number of Enrolled,Cost,Alternative Options,Language,Video Language,Instructors,Course Img
0,Креативное предпринимательство и проектировани...,Business & Management,https://www.edx.org/course/-10,ITMO University,Научитесь творческому и предпринимательскому м...,Креативность — это феномен человеческой души. ...,Introductory,5 weeks,1–2 hours per week,,Free,$99 USD,Русский,Русский,['Olga Tikhomirova'],https://prod-discovery.edx-cdn.org/media/cours...
1,Landscape Techniques | 山水画技法,Art & Culture,https://www.edx.org/course/-15,Tsinghua University,Chinese painting is an aesthetic representatio...,The purpose of this course ： is to improve stu...,Introductory,8 weeks,4–6 hours per week,,Free,$139 USD,中文,English,['Jingwei Han'],https://prod-discovery.edx-cdn.org/media/cours...
2,Тяжелые ионы и синтез новых элементов: совреме...,"Engineering, Physics",https://www.edx.org/course/-3,National Research Nuclear University,В курсе рассматриваются основы физики тяжелых ...,Предметом курса является одно из новых направл...,Intermediate,5 weeks,3–6 hours per week,,Free,$49 USD,Русский,Русский,['Юрий Пенионжкевич'],https://prod-discovery.edx-cdn.org/media/cours...
3,Лазеры: физические основы и лазерные технологии,"Engineering, Physics",https://www.edx.org/course/-4,National Research Nuclear University,Девиз курса - дать студентам необходимые знани...,Лазерные технологии с момента своего открытия ...,Intermediate,5 weeks,3–6 hours per week,,Free,$49 USD,Русский,Русский,['Алексей Менушенков'],https://prod-discovery.edx-cdn.org/media/cours...
4,Особенности написания научно-технических текстов,"Language, Social Sciences",https://www.edx.org/course/-5,National Research Nuclear University,Курс формирует навыки правильного использовани...,В настоящее время умение понимать и создавать ...,Intermediate,7 weeks,3–6 hours per week,,Free,$49 USD,Русский,Русский,['Алевтина Сицына-Кудрявцева'],https://prod-discovery.edx-cdn.org/media/cours...


## 1. Helper functions

In [6]:
# Filter for english courses

list_lang = ['English', 'english', 'ENGLISH']

def lang_eng(lang):
  return (lang in list_lang)


In [7]:
# Convert course difficulty from string to numbers in increasing difficulty order

print('All levels of difficulty:')
print(rawdata['Difficulty'].unique())

def conv_diff(diff_string):
  if diff_string == 'Introductory':
    diff = 0
  elif diff_string == 'Intermediate':
    diff = 1
  else:
    diff = 2
  return diff
  

All levels of difficulty:
['Introductory' 'Intermediate' 'Advanced']


In [8]:
# Get course week duration given a string of 'Course Duration'

def cal_cw(course_duration):
  if pd.isna(course_duration):
    cw = np.nan
  else:
    cw = course_duration.split(' ')[0]
    cw = int(cw)
  return cw


In [9]:
# Get average hours per week given a string of 'Commitment'

def cal_ahpw(commitment):
  if pd.isna(commitment):
    ahpw = np.nan
  else:
    list_hours = commitment.split(' ')[0].split('–')
    list_hours = [int(x) for x in list_hours]
    if len(list_hours) == 1:
      ahpw = list_hours[0]
    else:
      ahpw = np.mean(list_hours)
  return ahpw
  

In [10]:
# Convert course duration in hours to categorical

def conv_dur(dur):
  if pd.isna(dur):
    cat_dur = np.nan
  elif (dur > 0) and (dur <= 10):
    cat_dur = 0
  elif (dur > 10) and (dur <= 50):
    cat_dur = 1
  else:
    cat_dur = 2
  return cat_dur

In [11]:
# Filter for non empty value Course Duration
week_nonna = np.array((rawdata['Duration'].isna() == False))

# Filter for non empty value Commitment 
com_nonna = np.array((rawdata['Commitment'].isna() == False))

# Filter for non empty value for both Course Duration and Commitment
dur_nonna = week_nonna * com_nonna

# Calculate global average course duration
gdur = (np.array(rawdata['Duration'][dur_nonna].apply(lambda x: cal_cw(x))) \
         * np.array(rawdata['Commitment'][dur_nonna].apply(lambda x: cal_ahpw(x)))).sum() \
         / dur_nonna.sum()


# Calculate course duration in hours
# cw - course week duration
# ahpw - average hours per week
# ahpw = cw * ahpw

def cal_dur(week_dur, commt):
  cw = cal_cw(week_dur)
  ahpw = cal_ahpw(commt)
  if pd.isna(cw):
    dur = np.nan
  elif (not(pd.isna(cw)) and pd.isna(ahpw)):
    dur = gdur # for courses with number of weeks but not average hours per week info
  else: 
    dur = cw*ahpw
  return conv_dur(dur)


## 2. Preprocess Data

### i. Filter English Courses Only

In [12]:
# Get language filter

lang_fil = rawdata['Language'].apply(lambda x: lang_eng(x))
data_lang = rawdata['Language']

### ii. Get Course Name Column

In [13]:
# Extract Course Name Column

data_name = rawdata['Course Name']

### iii. Get Course URLs

In [14]:
# Extract Course URLs Column

data_url = rawdata['Course URL']

### iv. Get Course Category

In [15]:
# Extract Course Categories

data_cat = rawdata['Multi Categories']

### v. Get Course Description

In [16]:
# Extract Course Short Description

data_sdesc = rawdata['Short Description'].apply(lambda x: '' if pd.isna(x) else x)


In [17]:
# Extract Course Long Description

data_ldesc = rawdata['Long Description'].apply(lambda x: '' if pd.isna(x) else x)

### vi. Get Course Difficulty

In [18]:
# Extract Course Difficulty, one-hot encoded

print('All levels of difficulty:')
print(rawdata['Difficulty'].unique())
data_diffcat = rawdata['Difficulty'].apply(lambda x: conv_diff(x))

All levels of difficulty:
['Introductory' 'Intermediate' 'Advanced']


### vii. Get Course Duration

In [19]:
# Get Course Duration in unit of hours

data_durcat = rawdata[['Duration',	'Commitment']].apply(lambda x: cal_dur(x[0],x[1]), axis=1)


### viii. Free Option

In [20]:
# Check if free option is available

data_costcat = rawdata['Cost'].apply(lambda x: 1 if x == 'Free' else 0)

### viv. Get Course Rating

In [21]:
# Get Course Rating as normalized number of enrolls


data_enroll = rawdata['Number of Enrolled'].apply(lambda x: '0' if pd.isna(x) else x)
data_enroll = data_enroll.apply(lambda x: x.replace(',', ''))
data_enroll = data_enroll.apply(lambda x: int(x))
# rat_min = data_ratcount.min()
# rat_max = data_ratcount.max()

# data_rat = (data_ratcount - rat_min) / (rat_max - rat_min) # normalized
data_rat = np.zeros(data_enroll.shape[0])

### x. Paid Option

In [22]:
# Get Cost for paid option
data_paid = rawdata['Alternative Options']

### xi. Get Subtitle Language

In [23]:
# Get course list of subtitle language available

data_sublang = rawdata['Video Language']

### xii. Adding a Column for platform identifier

In [24]:
# Add Platform name
# 0 - Edx

data_plat = np.zeros(rawdata.shape[0])

### xiii. Get Course Provider


In [25]:
# Course Provider

data_prov = rawdata['Course Provider']

### xiv. Get Course Image URL

In [26]:
# Get Course Image URL

data_img = rawdata['Course Img']

## 3. Combine data 

In [27]:
# Combine Data

data_header = ['Name', 'URL', 'Categories', 'Short Description', 'Long Description', 'Difficulty', 'Duration', 'Free Option', 'Number of Enroll', 'Rating', 'Paid Option', 'Language', 'Subtitle Language', 'Platform', 'Provider', 'Image URL']
data_comb = np.array([data_name, data_url, data_cat, data_sdesc, data_ldesc, data_diffcat, data_durcat, data_costcat, data_enroll, data_rat, data_paid, data_lang, data_sublang, data_plat, data_prov, data_img]).transpose()

In [28]:
# Filter for English Course only

data = pd.DataFrame(data_comb[lang_fil,:], columns=data_header)
data.shape

(2688, 16)

In [29]:
data.head()

Unnamed: 0,Name,URL,Categories,Short Description,Long Description,Difficulty,Duration,Free Option,Number of Enroll,Rating,Paid Option,Language,Subtitle Language,Platform,Provider,Image URL
0,创业101: 你的客户是谁？,https://www.edx.org/course/101-2,Business & Management,如果注册《创业101:谁是你的客户？》的认证证书，并且通过课程考核，将会获得有效期为1年的价...,很多看起来似乎很伟大的想法和科技在碰到一个简单、不可避免的问题时都会突然卡壳。这个问题就是：...,0,1.0,1,0,0.0,$100 USD,English,中文,0.0,Massachusetts Institute of Technology,https://prod-discovery.edx-cdn.org/media/cours...
1,创业102: 你能为客户做什么？,https://www.edx.org/course/102-2,Business & Management,如果注册《创业102:你能为客户做什么？》的认证证书，并且通过课程考核，将会获得有效期为1年...,不要在意你的客户能为你做什么——而是要关注你能为你的客户做些什么。\n在《创业101》中，我...,0,1.0,1,0,0.0,$100 USD,English,中文,0.0,Massachusetts Institute of Technology,https://prod-discovery.edx-cdn.org/media/cours...
2,18th-Century Opera: Handel & Mozart,https://www.edx.org/course/18th-century-opera-...,"Art & Culture, History, Music",Study Baroque and Classical opera through Hand...,"In this breathtaking course, you'll get to kno...",0,1.0,1,22646,0.0,$139 USD,English,English,0.0,Harvard University,https://prod-discovery.edx-cdn.org/media/cours...
3,"19th-Century Opera: Meyerbeer, Wagner, & Verdi",https://www.edx.org/course/19th-century-opera-...,"Art & Culture, History, Music",Learn the music and cultural impact of three c...,Travel through central Europe in the 1800s to ...,0,1.0,1,11619,0.0,$139 USD,English,English,0.0,Harvard University,https://prod-discovery.edx-cdn.org/media/cours...
4,3D GIS,https://www.edx.org/course/3d-gis,"Data Analysis & Statistics, Energy & Earth Sci...",Take your maps into the third dimension: Learn...,Maps are graphic representations of reality an...,1,1.0,1,0,0.0,$149 USD,English,English,0.0,University of Alaska Fairbanks,https://prod-discovery.edx-cdn.org/media/cours...


In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2688 entries, 0 to 2687
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Name               2688 non-null   object
 1   URL                2688 non-null   object
 2   Categories         2688 non-null   object
 3   Short Description  2688 non-null   object
 4   Long Description   2688 non-null   object
 5   Difficulty         2688 non-null   object
 6   Duration           2682 non-null   object
 7   Free Option        2688 non-null   object
 8   Number of Enroll   2688 non-null   object
 9   Rating             2688 non-null   object
 10  Paid Option        2574 non-null   object
 11  Language           2688 non-null   object
 12  Subtitle Language  2688 non-null   object
 13  Platform           2688 non-null   object
 14  Provider           2585 non-null   object
 15  Image URL          2688 non-null   object
dtypes: object(16)
memory usage: 336.1+ KB


In [31]:
(data.isna()).sum()

Name                   0
URL                    0
Categories             0
Short Description      0
Long Description       0
Difficulty             0
Duration               6
Free Option            0
Number of Enroll       0
Rating                 0
Paid Option          114
Language               0
Subtitle Language      0
Platform               0
Provider             103
Image URL              0
dtype: int64

## 4. Save Output to file

In [32]:
data['URL'][43]

'https://www.edx.org/course/accounting-principles-for-ma'

In [33]:
filename = 'Edx_Data.csv'
data.to_csv(filename, index=False, encoding='utf_8_sig')