In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

In [2]:
data = pd.read_table("../data/course_list.tsv",encoding='utf-8')
data = data.iloc[:, :-1]
data = data.drop(columns=['link'])

In [3]:
print(data.columns)
print(data.shape)

Index(['id', 'name', 'seller', 'price', 'original_price', 'lectures', 'hours',
       'level'],
      dtype='object')
(1013, 8)


In [4]:
data['lectures'].fillna(value="All Levels", inplace=True)

In [5]:
data["price"].replace("Free","0.0", inplace=True)
data["price"] = data["price"].astype(float)
index = data.index[data["original_price"].isna()]
data.loc[index,"original_price"] = pd.Series(data.loc[index]["price"])

In [6]:
data.head()

Unnamed: 0,id,name,seller,price,original_price,lectures,hours,level
0,1,the-complete-sql-bootcamp/,Jose Portilla,25.99,179.99,83,9.0,All Levels
1,2,tableau10/,Kirill Eremenko SuperDataScience Team,16.99,124.99,82,9.0,All Levels
2,3,powerbi-complete-introduction/,Manuel Lorenz Academind by Maximilian Schwarz...,12.99,94.99,275,23.5,Beginner
3,4,microsoft-power-bi-up-running-with-power-bi-de...,Maven Analytics Chris Dutton Aaron Parry,19.99,139.99,117,11.0,All Levels
4,5,the-business-intelligence-analyst-course-2018/,365 Careers,12.99,94.99,413,20.5,All Levels


In [7]:
import os.path
total_num = data.shape[0]
miss_course = []
for i in range(total_num):
    if not os.path.exists('../data/raw/'+str(i+1)+'_info.txt'):
        miss_course.append(i)
print(miss_course)
data = data.drop(data.index[miss_course])
# data = data.reset_index(drop=True)

[372, 494, 868, 914]


In [8]:
title = []
headline = []
enrollment = []
rating = []
num_ratings = []
last_update_date = []
# requirements = []
description = []
five_stars = []
four_stars = []
three_stars = []
two_stars = []
hours_ondemand_video = []
downloadable_resources = []

In [9]:
import codecs
import re
for i in range(total_num):
    fname = '../data/raw/'+str(i+1)+'_info.txt'
    if i not in miss_course:
        with codecs.open(fname,encoding="utf8") as f:
            content = f.readlines()
        title.append(content[1].replace('\n',''))
        headline.append(content[2].replace('\n',''))
        enrollment.append(content[3].replace('\n',''))
        rating.append(content[4].replace('\n',''))
        num_ratings.append(content[5].replace('\n',''))
        last_update_date.append(content[6].replace('\n',''))
        description.append(content[7].replace('\n','') + " "+content[8].replace('\n','')+ " "+content[9].replace('\n',''))
        five_stars.append(content[10].replace('\n',''))
        four_stars.append(content[11].replace('\n',''))
        three_stars.append(content[12].replace('\n',''))
        two_stars.append(content[13].replace('\n',''))
        try:
            hours_ondemand_video.append(re.findall(r"[-+]?\d*\.\d+|\d+", content[16])[0])
        except:
            hours_ondemand_video.append(0)
        try:
            downloadable_resources.append(re.findall(r'\d+', content[17])[0])
        except:
            downloadable_resources.append(0)

In [10]:
def convert_star(d):
    tmp = re.findall(r'\d+', d)
    if len(tmp)>0:
        return float(tmp[0])/100
    else:
        return 0

In [11]:
data['enrollment'] = pd.Series(enrollment).replace("NULL","0").replace("","0").astype(int)
data['rating'] = pd.Series(rating).replace("NULL","0").astype(float)
data['num_ratings'] = pd.Series(num_ratings).replace("NULL","0").astype(float)
data['five_stars'] = pd.Series(five_stars).apply(convert_star)
data['four_stars'] = pd.Series(four_stars).apply(convert_star)
data['three_stars'] = pd.Series(three_stars).apply(convert_star)
data['two_stars'] = pd.Series(two_stars).apply(convert_star)
data['description'] = pd.Series(description)
data['hours_ondemand_video'] = pd.Series(hours_ondemand_video).astype(float)
data['downloadable_resources'] = pd.Series(downloadable_resources).astype(int)
data['description'] = pd.Series(description)

In [12]:
import datetime as dt
def to_DT(date):
    date = date.replace("Published ","")
    date = date.replace(" ","")
    date = date.strip()
    if date == "NULL":
        date = "1/1900"
    DT = dt.datetime.strptime(date, '%m/%Y') # 11/2018
    return DT
data['last_update_date'] = pd.Series([to_DT(date) for date in last_update_date])

In [13]:
instructor_title = []
instructor_job_title = []
instructor_rating = []
instructor_no_reviews = []
instructor_no_students = []
instructor_no_courses = []
instructor_info = []

In [14]:
def convert_rating(d):
    tmp = re.findall(r"[-+]?\d*\.\d+|\d+", d)
    if len(tmp)>0:
        return tmp[0]
    else:
        return 0

def convert_number(d):
    d = d.replace(",", "")
    tmp = re.findall(r'\d+', d)
    if len(tmp)>0:
        return tmp[0]
    else:
        return 0


In [15]:
for i in range(total_num):
    fname = '../data/raw/'+str(i+1)+'_instructor.txt'
    if i not in miss_course:
        with codecs.open(fname,encoding="utf8") as f:
            content = f.readlines()
        instructor_title.append(content[0].replace('\n',''))
        instructor_job_title.append(content[1].replace('\n',''))
        instructor_rating.append(convert_rating(content[2]))
        instructor_no_courses.append(convert_number(content[3]))
        instructor_no_reviews.append(convert_number(content[4]))
        instructor_no_students.append(convert_number(content[5]))
        instructor_info.append(content[6].replace('\n',''))

In [16]:
data['instructor_title'] = pd.Series(instructor_title)
data['instructor_job_title'] = pd.Series(instructor_job_title)
data['instructor_info'] = pd.Series(instructor_info)
data['instructor_rating'] = pd.Series(instructor_rating).astype(float)
data['instructor_no_reviews'] = pd.Series(instructor_no_reviews).astype(int)
data['instructor_no_students'] = pd.Series(instructor_no_students).astype(int)
data['instructor_no_courses'] = pd.Series(instructor_no_courses).astype(int)

In [17]:
data.head()


Unnamed: 0,id,name,seller,price,original_price,lectures,hours,level,enrollment,rating,...,hours_ondemand_video,downloadable_resources,last_update_date,instructor_title,instructor_job_title,instructor_info,instructor_rating,instructor_no_reviews,instructor_no_students,instructor_no_courses
0,1,the-complete-sql-bootcamp/,Jose Portilla,25.99,179.99,83,9.0,All Levels,342288.0,4.7,...,9.0,14.0,2020-10-01,Jose Portilla,"Head of Data Science, Pierian Data Inc.",Jose Marcial Portilla has a BS and MS in Mec...,4.6,712729.0,2179902.0,31.0
1,2,tableau10/,Kirill Eremenko SuperDataScience Team,16.99,124.99,82,9.0,All Levels,231631.0,4.6,...,9.0,6.0,2021-02-01,Ligency Team,Helping Data Scientists Succeed,"Hi there, We are the Ligency PR and Marketing ...",4.5,429183.0,1472140.0,48.0
2,3,powerbi-complete-introduction/,Manuel Lorenz Academind by Maximilian Schwarz...,12.99,94.99,275,23.5,Beginner,148077.0,4.6,...,23.5,26.0,2020-11-01,Maximilian Schwarzmüller,Professional Web Developer and Instructor,Experience as (Web) Developer Starting out at ...,4.6,564184.0,1335299.0,35.0
3,4,microsoft-power-bi-up-running-with-power-bi-de...,Maven Analytics Chris Dutton Aaron Parry,19.99,139.99,117,11.0,All Levels,109303.0,4.6,...,10.5,14.0,2021-02-01,Aaron Parry,Professional Business Intelligence Trainer & C...,Aaron is a professional analytics consultant a...,4.6,34377.0,438163.0,3.0
4,5,the-business-intelligence-analyst-course-2018/,365 Careers,12.99,94.99,413,20.5,All Levels,136869.0,4.5,...,20.0,151.0,2020-11-01,365 Careers,Creating opportunities for Business & Finance ...,365 Careers is the #1 best-selling provider of...,4.5,385370.0,1322640.0,68.0


In [25]:
review_index = []
review_rating = []
review_text = []
r_i = 0

In [26]:
for i in range(total_num):
    fname = '../data/raw/'+str(i+1)+'_review.txt'
    if i not in miss_course:
        with codecs.open(fname,encoding="utf8") as f:
            content = f.readlines()
        tmp = []
        if len(content) == 0:
            review_index.append(tmp)
            continue
        for i in range(len(content)//2):
            review_rating.append(content[i*2].replace('\n','').replace("NULL","0"))
            review_text.append(content[i*2+1].replace('\n',''))
            tmp.append(r_i)
            r_i += 1
        review_index.append(tmp)


In [28]:
review_pd = pd.DataFrame(list(zip(review_rating, review_text)), columns =['review_rating', 'review_text'])
review_pd["review_rating"] = review_pd["review_rating"].astype(float)
data['review_index'] = pd.Series(review_index)

In [29]:
data.head()

Unnamed: 0,id,name,seller,price,original_price,lectures,hours,level,enrollment,rating,...,downloadable_resources,last_update_date,instructor_title,instructor_job_title,instructor_info,instructor_rating,instructor_no_reviews,instructor_no_students,instructor_no_courses,review_index
0,1,the-complete-sql-bootcamp/,Jose Portilla,25.99,179.99,83,9.0,All Levels,342288.0,4.7,...,14.0,2020-10-01,Jose Portilla,"Head of Data Science, Pierian Data Inc.",Jose Marcial Portilla has a BS and MS in Mec...,4.6,712729.0,2179902.0,31.0,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]"
1,2,tableau10/,Kirill Eremenko SuperDataScience Team,16.99,124.99,82,9.0,All Levels,231631.0,4.6,...,6.0,2021-02-01,Ligency Team,Helping Data Scientists Succeed,"Hi there, We are the Ligency PR and Marketing ...",4.5,429183.0,1472140.0,48.0,"[12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]"
2,3,powerbi-complete-introduction/,Manuel Lorenz Academind by Maximilian Schwarz...,12.99,94.99,275,23.5,Beginner,148077.0,4.6,...,26.0,2020-11-01,Maximilian Schwarzmüller,Professional Web Developer and Instructor,Experience as (Web) Developer Starting out at ...,4.6,564184.0,1335299.0,35.0,"[24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35]"
3,4,microsoft-power-bi-up-running-with-power-bi-de...,Maven Analytics Chris Dutton Aaron Parry,19.99,139.99,117,11.0,All Levels,109303.0,4.6,...,14.0,2021-02-01,Aaron Parry,Professional Business Intelligence Trainer & C...,Aaron is a professional analytics consultant a...,4.6,34377.0,438163.0,3.0,"[36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]"
4,5,the-business-intelligence-analyst-course-2018/,365 Careers,12.99,94.99,413,20.5,All Levels,136869.0,4.5,...,151.0,2020-11-01,365 Careers,Creating opportunities for Business & Finance ...,365 Careers is the #1 best-selling provider of...,4.5,385370.0,1322640.0,68.0,"[48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59]"


In [30]:
data.to_pickle("data.pkl")
review_pd.to_pickle("review_pd.pkl")
