# The MOOC Pivot: Base code

Author: José A. Ruipérez Valiente
Email: jruipere@mit.edu

## Libraries

In [1]:
import pandas as pd
import numpy as np

## Data loading

In [44]:
# Importing the person_course dataset from the working directory
# NOTE(review): the stray text printed right after this cell looks like the
# tail of a pandas DtypeWarning (mixed-type columns) -- confirm whether explicit
# dtypes are needed, since later cells compare survey columns to integers.
all_person_course = pd.read_csv('person_course.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [45]:
# Importing course metadata; it is joined to the registrations on 'course_id'
# in the "Data merging" section
course_metadata = pd.read_csv('course_metadata.csv')

In [46]:
# Importing country metadata from the United Nations; it is joined to the
# registrations on the 'alpha-2' country code in the "Data merging" section
country_metadata = pd.read_csv('country_metadata.csv')

## Data cleaning and filtering

In [47]:
# Renaming 'cc_by_ip' to 'alpha-2' (in place) so the column carries the same
# name as the key used later to merge with country_metadata
all_person_course.rename(columns = {'cc_by_ip':'alpha-2'}, inplace = True)

In [48]:
# Course staff are removed: only rows whose role is exactly 'Student' are kept
all_person_course = all_person_course.loc[all_person_course['roles'].eq('Student')]

In [49]:
# Consolidating the country code of each user: the first non-missing 'alpha-2'
# value found among a user's rows is attached to every row of that user, which
# fills the gaps where some rows have no value. Only the columns listed here
# (plus the consolidated 'alpha-2') survive this step.
all_person_course = all_person_course[[
    'course_id', 'user_id', 'completed', 'roles', 'viewed', 'explored',
    'certified', 'mode', 'prs_reason_lc', 'prs_intent_verified',
    'prs_intent_lecture', 'prs_intent_assess', 'prs_intent',
]].merge(
    all_person_course.groupby('user_id')['alpha-2'].first().reset_index(),
    on='user_id',
    how='left',
)

## Data merging

In [50]:
# Attaching the course metadata to the main person_course frame; a left join on
# 'course_id' keeps registrations even when no metadata row matches
all_person_course = all_person_course.merge(course_metadata, how='left', on='course_id')

In [51]:
# Attaching the country metadata through the 'alpha-2' country code (left join,
# so rows without a matching country are kept)
all_person_course = all_person_course.merge(country_metadata, how='left', on='alpha-2')

## Analysis

### Table S2 - HarvardX and MITx Number of Enrolments and Certifications by Year

In [10]:
# One row per (year, HDI category): enrolments are counted as the number of
# non-missing user_id values and certifications as the sum of 'certified'
enrolments_certifications_per_year_hdi = (
    all_person_course
    .groupby(['year', 'human_development_category'], as_index=False)
    .agg({'user_id': 'count', 'certified': 'sum'})
    .rename(columns={'user_id': 'n_enrolments', 'certified': 'n_certified'})
)

In [11]:
# Displaying the Table S2 data
enrolments_certifications_per_year_hdi

Unnamed: 0,year,human_development_category,n_enrolments,n_certified
0,Year 1,high_human_development,54483,2476
1,Year 1,low_human_development,12616,796
2,Year 1,medium_human_development,91773,3384
3,Year 1,very_high_human_development,205955,10798
4,Year 2,high_human_development,318683,8429
5,Year 2,low_human_development,37780,1075
6,Year 2,medium_human_development,501898,7742
7,Year 2,very_high_human_development,1359191,43235
8,Year 3,high_human_development,414912,10903
9,Year 3,low_human_development,47543,1244


In [12]:
# Saving Table S2; the row index is a plain counter, so it is not written
enrolments_certifications_per_year_hdi.to_csv('Table_S2.csv', index = False)

### Table S4 - Mean, Median and Std. Deviation of Participation/Completion by HarvardX and MITx Course

In [13]:
# Per-course totals: registrations (count of non-missing user_id) and the sums
# of the viewed / explored / completed / certified flags
data_by_course = (
    all_person_course
    .groupby('course_id', as_index=False)
    .agg({'user_id': 'count', 'viewed': 'sum', 'explored': 'sum',
          'completed': 'sum', 'certified': 'sum'})
    .rename(columns={'user_id': 'registrations'})
)

In [14]:
# Percentage of registrants that became participants (viewed), two decimals
data_by_course['p_participants'] = (
    data_by_course['viewed']
    .mul(100)
    .div(data_by_course['registrations'])
    .round(2)
)

In [15]:
# Percentages relative to participants: each count column is divided row-wise
# by 'viewed', scaled to percent and rounded to two decimals
data_by_course[['p_explored', 'p_completed', 'p_certified']] = (
    data_by_course[['explored', 'completed', 'certified']]
    .div(data_by_course['viewed'], axis=0)
    .mul(100)
    .round(2)
)

In [16]:
# Dropping courses whose completion percentage is +infinity (a division by zero
# participants caused by data issues); only 'p_completed' is checked
data_by_course = data_by_course[data_by_course['p_completed'] != np.inf]

In [17]:
# Mean, median and std. deviation across courses for each percentage column,
# rounded to two decimals; the same three statistics are requested per column
agg_data_by_course = data_by_course.agg(
    {column: ['mean', 'median', 'std']
     for column in ('p_participants', 'p_explored', 'p_completed', 'p_certified')}
).round(2)

In [18]:
# Displaying the Table S4 data (rows are the statistics, columns the measures)
agg_data_by_course

Unnamed: 0,p_participants,p_explored,p_completed,p_certified
mean,48.02,25.59,6.39,5.03
median,51.01,19.84,4.42,2.88
std,15.49,19.54,6.62,6.04


In [19]:
# Saving Table S4. The row index holds the statistic names ('mean', 'median',
# 'std'), so it is written out under a 'statistic' header; dropping the index
# would leave three unlabeled rows in the CSV.
agg_data_by_course.to_csv('Table_S4.csv', index_label = 'statistic')

### Table S1 - HarvardX and MITx Churn Rate

In [20]:
# keeping only participants, i.e. rows with viewed == 1
# NOTE: this overwrites all_person_course, so everything computed from here on
# (Tables S1 and S3) is restricted to participants
all_person_course = all_person_course[all_person_course['viewed'] == 1]

In [21]:
# Moving user_id into the index (in place); the per-user registration year
# computed below is grouped and aligned on this index
all_person_course.set_index('user_id', inplace=True)

In [22]:
# need to turn into a numeric to do the min afterwards
# 'year' holds labels such as 'Year 1' (see the Table S2 output); the piece
# after the space is taken and converted to a number
all_person_course['year2'] = pd.to_numeric(all_person_course['year'].str.split(" ", expand=True)[1])

In [23]:
# registration year of a user = the smallest 'year2' among that user's rows;
# user_id is index level 0, so the per-user minimum is broadcast back to
# every row of the same user
all_person_course['registration_year'] = (
    all_person_course.groupby(level=0)['year2'].transform('min')
)

In [24]:
# group by registration year and running year, counting the distinct users
# found in each combination
attrition_by_year = (
    all_person_course
    .reset_index()
    .groupby(['registration_year', 'year2'])['user_id']
    .nunique()
    .reset_index()
)

In [25]:
# Giving the columns their reporting names: the distinct-user count becomes
# 'unique_learners' and the numeric running year becomes 'year'
attrition_by_year = attrition_by_year.rename(columns={'user_id': 'unique_learners', 'year2': 'year'})

In [26]:
# add the initial size of each registration year cohort: the first
# 'unique_learners' value of every registration_year group is taken as the
# cohort size (this relies on the rows being ordered by running year within
# each cohort) and joined back onto all rows of that cohort
attrition_by_year = attrition_by_year.merge(
    attrition_by_year.groupby('registration_year')['unique_learners']
    .first()
    .rename('cohort_learners')
    .reset_index(),
    on='registration_year',
    how='left',
)

In [27]:
# retention per cohort and running year: unique learners as a percentage of the
# cohort's initial size, rounded to two decimals
attrition_by_year['percentage_retention'] = (
    attrition_by_year['unique_learners']
    .mul(100)
    .div(attrition_by_year['cohort_learners'])
    .round(2)
)

In [28]:
# The helper column is only needed to compute the percentage; removing it
# (in place) before the table is shown and exported
attrition_by_year.drop(columns = 'cohort_learners', inplace=True)

In [29]:
# Displaying the Table S1 data
attrition_by_year

Unnamed: 0,registration_year,year,unique_learners,percentage_retention
0,1.0,1.0,192255,100.0
1,1.0,2.0,72093,37.5
2,1.0,3.0,46494,24.18
3,1.0,4.0,36089,18.77
4,1.0,5.0,21690,11.28
5,1.0,6.0,15059,7.83
6,2.0,2.0,755800,100.0
7,2.0,3.0,205814,27.23
8,2.0,4.0,146713,19.41
9,2.0,5.0,76780,10.16


In [30]:
# Saving Table S1; the row index is a plain counter, so it is not written
attrition_by_year.to_csv('Table_S1.csv', index = False)

### Table S3 - HarvardX and MITx Course Completion by Year and Cohort of Learners

In [31]:
# cohort of participants: all_person_course was already restricted to
# participants earlier, so the yearly sums of 'viewed' and 'completed' can be
# taken directly
completion_participants = (
    all_person_course
    .groupby('year')[['viewed', 'completed']]
    .sum()
    .reset_index()
)

In [32]:
# completion rate in percent (two decimals) and the label of this cohort
completion_participants['p_completed'] = (
    completion_participants['completed']
    .div(completion_participants['viewed'])
    .mul(100)
    .round(2)
)
completion_participants['cohort'] = 'participants'

In [33]:
# subsetting rows where the student self-reported the intention to complete;
# the survey fields changed over the years, so a row qualifies when ANY of the
# fields carries the matching code
# NOTE(review): codes 3 / 1 presumably mean "intends to complete" -- verify
# against the survey codebook
completion_intended = (
    all_person_course.loc[
        all_person_course['prs_reason_lc'].eq(3)
        | all_person_course['prs_intent'].eq(3)
        | all_person_course['prs_intent_verified'].eq(1)
        | all_person_course['prs_intent_assess'].eq(3)
        | all_person_course['prs_intent_lecture'].eq(3)
    ]
    .groupby('year')[['viewed', 'completed']]
    .sum()
    .reset_index()
)

In [34]:
# completion rate in percent (two decimals) and the label of this cohort
completion_intended['p_completed'] = (
    completion_intended['completed']
    .div(completion_intended['viewed'])
    .mul(100)
    .round(2)
)
completion_intended['cohort'] = 'intend_complete'

In [35]:
# subsetting rows of students in the paid verified track (mode == 'verified')
# and summing 'viewed' and 'completed' per year
completition_verified = (
    all_person_course.loc[all_person_course['mode'].eq('verified')]
    .groupby('year')[['viewed', 'completed']]
    .sum()
    .reset_index()
)

In [36]:
# completion rate in percent (two decimals) and the label of this cohort
completition_verified['p_completed'] = (
    completition_verified['completed']
    .div(completition_verified['viewed'])
    .mul(100)
    .round(2)
)
completition_verified['cohort'] = 'verified'

In [37]:
# stacking the three cohorts (participants, intend_complete, verified) and
# giving the count columns their reporting names; each cohort keeps its own
# row labels, so index values repeat across cohorts
completion_by_cohort = pd.concat(
    [completion_participants, completion_intended, completition_verified]
).rename(columns=dict(viewed='n_participants', completed='n_completed'))

In [38]:
# Year 1 had neither surveys nor a verified track, so its rows are dropped
completion_by_cohort = completion_by_cohort.loc[completion_by_cohort['year'] != 'Year 1']

In [39]:
# Displaying the Table S3 data
completion_by_cohort

Unnamed: 0,year,n_participants,n_completed,p_completed,cohort
1,Year 2,1219030,60285.0,4.95,participants
2,Year 3,1333780,78085.0,5.85,participants
3,Year 4,1958222,80495.0,4.11,participants
4,Year 5,1097759,43726.0,3.98,participants
5,Year 6,980589,30650.0,3.13,participants
0,Year 2,178909,20593.0,11.51,intend_complete
1,Year 3,191578,33986.0,17.74,intend_complete
2,Year 4,209333,34416.0,16.44,intend_complete
3,Year 5,74116,11952.0,16.13,intend_complete
4,Year 6,42005,6478.0,15.42,intend_complete


In [40]:
# Saving Table S3; the (repeating) row labels carry no information, so the
# index is not written
completion_by_cohort.to_csv('Table_S3.csv', index = False)