### Load Data

In [1]:
import pandas as pd
import re

courses_prices = pd.read_csv('./Data/course_prices.csv')
courses = pd.read_csv('./Data/Coursera_courses.csv')
users = pd.read_csv('./Data/Coursera_reviews.csv')
courses_metadata = pd.read_csv('./Data/CourseraDataset-Unclean.csv')
job_skills = pd.read_csv('./Data/job_skills.csv')
job_metadata = pd.read_csv('./Data/linkedin_job_postings.csv')

### Remove Duplicate Courses

In [2]:
courses_metadata.rename(columns={'Course Title': 'name'}, inplace = True)
courses_metadata.drop_duplicates(subset=['name'], inplace=True)

### Remove Duplicate Reviews

In [3]:
users.drop_duplicates(subset=['reviews', 'reviewers', 'course_id'], inplace=True)

### Handle Null Values

In [4]:
#print(job_skills.isnull().sum())
#print(courses.isnull().sum())
#print(users.isnull().sum())
job_skills = job_skills.dropna(subset=['job_skills'])

### Merge Courses with Users by course_id

In [5]:
temp = pd.merge(users, courses, on="course_id", how="inner")
#temp.head()

### Remove non-ASCII values in 'name' columns in dataset

In [6]:
def clean_string(value):
    return value.encode('ascii', 'ignore').decode('ascii')

temp['name'] = temp['name'].apply(clean_string)
temp['institution'] = temp['institution'].apply(clean_string)
temp['reviewers'] = temp['reviewers'].apply(clean_string)
courses_metadata['name'] = courses_metadata['name'].apply(clean_string)
courses_prices['name'] = courses_prices['name'].apply(clean_string)
courses_prices['institution'] = courses_prices['institution'].apply(clean_string)

### Merge Additional Metadata with Temp dataset

In [7]:
courses_data = pd.merge(temp, courses_metadata, on="name", how="inner")

#courses_data.head()

### Filter Essential Columns in Courses Data

In [8]:
final_courses = courses_data[['reviews', 'reviewers', 'rating', 'name', 'institution', 'Rating', 'Level', 'Duration', 'Review', 'Skill gain', 'Instructor']]
final_courses = final_courses.rename(columns={ 'Rating': 'Overall Ratings', 'Review': 'Num of Reviews'})
final_courses['Skill gain'] = final_courses['Skill gain'].str.replace("'", "")
#final_courses.head()

### Merge Datasets regarding Jobs

In [9]:
jobs_data = pd.merge(job_skills, job_metadata, on="job_link", how="inner")
#jobs_data.head()

### Filter Essential Columns in Jobs Data

In [10]:
final_jobs = jobs_data[['job_skills', 'job_title', 'search_position', 'job_level']]
#final_jobs.head()
#unique_values = final_jobs['search_position'].unique()
#print(unique_values.tolist())

### Preprocess 
1) Prices
2) Remove "By' from reviewer
3) Fill in null values in level
4) get the number in duration, 
5) get the number in review, 
6) remove the list of skill gain
7) remove the list of instructor


In [11]:
final_courses['reviewers'] = final_courses['reviewers'].apply(lambda x: x[3:])
final_courses['Level'].fillna('None', inplace=True)
final_courses['Duration'] = final_courses['Duration'].str.extract('(\d+)').astype(int)
final_courses['Num of Reviews'] = final_courses['Num of Reviews'].str.extract('(\d+)').astype(int)
final_courses['Skill gain'] = final_courses['Skill gain'].apply(lambda x: x[1:-1])
final_courses['Instructor'] = final_courses['Instructor'].apply(lambda x: x[2:-2])
final_courses['Skill gain'] = final_courses['Skill gain'].replace('', 'No Specific Skills')

# final_courses.head()

### Demean Ratings

In [12]:
reviewer_average_rating = final_courses.groupby('reviewers')['rating'].transform('mean')
final_courses['Demeaned Rating'] = final_courses['rating'] - reviewer_average_rating
# final_courses.head()

### Adding Price Column to Courses Data

In [14]:
final_courses = pd.merge(final_courses, courses_prices, on=['institution', 'name'], how='inner')
print(final_courses.isnull().sum())
print(final_courses.head())
print("Count of datapoints", final_courses.shape)
print("Count of Unique Reviewers", final_courses['reviewers'].nunique())
print("Count of Unique Courses", final_courses['name'].nunique())

reviews            44
reviewers           0
rating              0
name                0
institution         0
Overall Ratings     0
Level               0
Duration            0
Num of Reviews      0
Skill gain          0
Instructor          0
Demeaned Rating     0
Fee_x               0
Fee_y               0
dtype: int64
                                             reviews    reviewers  rating  \
0  Pretty dry, but I was able to pass with just t...     Robert S       4   
1  would be a better experience if the video and ...  Gabriel E R       4   
2  Information was perfect! The program itself wa...      Jacob D       4   
3  A few grammatical mistakes on test made me do ...       Dale B       4   
4  Excellent course and the training provided was...       Sean G       4   

                                                name  \
0  Become a CBRS Certified Professional Installer...   
1  Become a CBRS Certified Professional Installer...   
2  Become a CBRS Certified Professional Installe