In [211]:
import unicodecsv

In [212]:

def read_csv(filename):
    """Load a CSV file and return its rows as a list of dicts.

    The file is opened in binary mode and handed to
    ``unicodecsv.DictReader``; all rows are materialised before the file
    is closed.
    """
    with open(filename, 'rb') as csv_file:
        rows = list(unicodecsv.DictReader(csv_file))
    return rows


# Load the three raw tables; each one becomes a list of row dicts.
enrollments, daily_engagement, project_submissions = (
    read_csv(csv_name)
    for csv_name in (
        'enrollments.csv',
        'daily_engagement.csv',
        'project_submissions.csv',
    )
)

In [213]:
# Peek at the first enrollment row to see the column names and raw values.
enrollments[0]

{'account_key': '448',
 'cancel_date': '2015-01-14',
 'days_to_cancel': '65',
 'is_canceled': 'True',
 'is_udacity': 'True',
 'join_date': '2014-11-10',
 'status': 'canceled'}

In [214]:
# Peek at the first daily-engagement row to see the column names and raw values.
daily_engagement[0]

{'acct': '0',
 'lessons_completed': '0.0',
 'num_courses_visited': '1.0',
 'projects_completed': '0.0',
 'total_minutes_visited': '11.6793745',
 'utc_date': '2015-01-09'}

In [215]:
# Peek at the first project-submission row to see the column names and raw values.
project_submissions[0]

{'account_key': '256',
 'assigned_rating': 'UNGRADED',
 'completion_date': '2015-01-16',
 'creation_date': '2015-01-14',
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

In [216]:
import datetime

def parse_maybe_int(data):
    """Convert a numeric string to ``int``; an empty string becomes ``None``.

    The value goes through ``float`` first, so strings such as ``'1.0'``
    are accepted and truncated toward zero.
    """
    return None if data == '' else int(float(data))

def parse_boolean(data):
    """Convert the string ``'True'`` to ``True``; an empty string becomes ``None``.

    Any other value, including ``'False'``, yields ``False``.
    """
    if data == '':
        return None
    return data == 'True'
    
def parse_Datetime(data):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime.datetime``.

    An empty string becomes ``None``.
    """
    if data == '':
        return None
    parsed = datetime.datetime.strptime(data, "%Y-%m-%d")
    return parsed
    
# Replace the raw CSV strings with typed values, in place, table by table.
for enrollment in enrollments:
    enrollment.update(
        cancel_date=parse_Datetime(enrollment['cancel_date']),
        days_to_cancel=parse_maybe_int(enrollment['days_to_cancel']),
        is_canceled=parse_boolean(enrollment['is_canceled']),
        is_udacity=parse_boolean(enrollment['is_udacity']),
        join_date=parse_Datetime(enrollment['join_date']),
    )

for engagement in daily_engagement:
    engagement.update(
        lessons_completed=parse_maybe_int(engagement['lessons_completed']),
        num_courses_visited=parse_maybe_int(engagement['num_courses_visited']),
        projects_completed=parse_maybe_int(engagement['projects_completed']),
        total_minutes_visited=float(engagement['total_minutes_visited']),
        utc_date=parse_Datetime(engagement['utc_date']),
    )

for submission in project_submissions:
    submission.update(
        completion_date=parse_Datetime(submission['completion_date']),
        creation_date=parse_Datetime(submission['creation_date']),
    )
    

In [217]:
# Rename the 'acct' column to 'account_key' so the engagement rows can be
# looked up with the same key name as the other two tables.
# The membership check makes this cell safe to re-run: rows that were already
# renamed are left untouched instead of raising KeyError on the missing 'acct'.
for engagement_record in daily_engagement:
    if 'acct' in engagement_record:
        engagement_record['account_key'] = engagement_record.pop('acct')
    


In [218]:
def get_unique_students(data):
    """Return the set of distinct ``'account_key'`` values found in ``data``.

    ``data`` is an iterable of dict-like records; every record must have
    an ``'account_key'`` entry.
    """
    return {row['account_key'] for row in data}

In [219]:
# Distinct account keys appearing in each of the three tables.
(
    unique_enrollments_students,
    unique_engagement_students,
    unique_submissions_students,
) = map(
    get_unique_students,
    (enrollments, daily_engagement, project_submissions),
)


In [220]:
# Report how many distinct students each table contains.
for report_line, student_set in (
    ('unique enrollments students: %d', unique_enrollments_students),
    ('unique engagement students: %d', unique_engagement_students),
    ('unique submissions students: %d', unique_submissions_students),
):
    print(report_line % (len(student_set)))

unique enrollments students: 1302
unique engagement students: 1237
unique submissions students: 743


In [221]:
# Print and count enrollments that have no engagement rows, ignoring those
# whose join date equals the cancel date.
num_problem_student = 0
for enrollment in enrollments:
    joined_and_cancelled_same_day = enrollment['join_date'] == enrollment['cancel_date']
    has_engagement = enrollment['account_key'] in unique_engagement_students
    if has_engagement or joined_and_cancelled_same_day:
        continue
    print(enrollment)
    num_problem_student += 1

print(num_problem_student)

{'days_to_cancel': 59, 'join_date': datetime.datetime(2015, 1, 10, 0, 0), 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), 'status': 'canceled', 'account_key': '1304', 'is_canceled': True}
{'days_to_cancel': 99, 'join_date': datetime.datetime(2015, 3, 10, 0, 0), 'is_udacity': True, 'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), 'status': 'canceled', 'account_key': '1304', 'is_canceled': True}
{'days_to_cancel': None, 'join_date': datetime.datetime(2015, 2, 25, 0, 0), 'is_udacity': True, 'cancel_date': None, 'status': 'current', 'account_key': '1101', 'is_canceled': False}
3


In [222]:
# Account keys of every enrollment flagged with a truthy 'is_udacity'.
udacity_test_accout = {
    enrollment['account_key']
    for enrollment in enrollments
    if enrollment['is_udacity']
}
len(udacity_test_accout)

6

In [223]:
def remove_udacity_account(data):
    """Return the records of ``data`` that do not belong to a test account.

    A record is dropped when its ``'account_key'`` is a member of the
    module-level ``udacity_test_accout`` set. The input is not modified;
    a new list is returned with the original order preserved.
    """
    return [
        record
        for record in data
        if record['account_key'] not in udacity_test_accout
    ]



In [224]:
# Drop the test-account rows from each of the three tables.
(
    non_udacity_enrollments,
    non_udacity_daily_engagement,
    non_udacity_project_submissions,
) = map(
    remove_udacity_account,
    (enrollments, daily_engagement, project_submissions),
)

In [225]:
# Row counts after dropping the test accounts.
# Fixed: the filtered enrollment list is named `non_udacity_enrollments`
# (plural). The singular spelling used before is never assigned in the visible
# notebook, so this cell raised NameError on a fresh kernel.
print(len(non_udacity_enrollments))
print(len(non_udacity_daily_engagement))
print(len(non_udacity_project_submissions))

1622
135656
3634


In [226]:
# Map each qualifying account key to its latest join date. An enrollment
# qualifies when it is not cancelled, or was cancelled after more than 7 days.
paid_students = {}
for enrollment in non_udacity_enrollments:
    cancelled_within_trial = (
        enrollment['is_canceled'] and not (enrollment['days_to_cancel'] > 7)
    )
    if cancelled_within_trial:
        continue
    account_key = enrollment['account_key']
    enrollment_date = enrollment['join_date']
    # Keep an existing entry unless this enrollment started later.
    if (account_key in paid_students
            and not (enrollment_date > paid_students[account_key])):
        continue
    paid_students[account_key] = enrollment_date
len(paid_students)

995

In [227]:
# Inspect the first enrollment left after dropping the test accounts.
# Fixed: uses `non_udacity_enrollments` (plural); the singular spelling is
# never assigned in the visible notebook and raised NameError on a fresh kernel.
non_udacity_enrollments[0]

{'account_key': '700',
 'cancel_date': datetime.datetime(2014, 11, 16, 0, 0),
 'days_to_cancel': 6,
 'is_canceled': True,
 'is_udacity': False,
 'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 'status': 'canceled'}

In [278]:
def within_one_week(join_date, engagement_date):
    """Return True when ``engagement_date`` is 0 to 6 whole days after ``join_date``.

    Dates before ``join_date`` (negative day difference) and dates seven
    or more days later both yield False.
    """
    days_since_join = (engagement_date - join_date).days
    return 0 <= days_since_join < 7

def remove_free_trial_cancels(data):
    """Return the records of ``data`` whose account is a key of ``paid_students``.

    ``paid_students`` is the module-level mapping built earlier in the
    notebook. The input is not modified; a new list is returned with the
    original order preserved.
    """
    return [
        data_point
        for data_point in data
        if data_point['account_key'] in paid_students
    ]



In [279]:
# Keep only the rows that belong to accounts present in paid_students.
paid_enrollments, paid_engagement, paid_submissions = map(
    remove_free_trial_cancels,
    (
        non_udacity_enrollments,
        non_udacity_daily_engagement,
        non_udacity_project_submissions,
    ),
)

# Row counts of the three filtered tables, in the same order.
for paid_table in (paid_enrollments, paid_engagement, paid_submissions):
    print(len(paid_table))

1293
134549
3618


In [280]:
# Engagement rows dated within one week of the join date stored in
# paid_students for the same account.
paid_engagement_in_first_week = [
    engagement_record
    for engagement_record in paid_engagement
    if within_one_week(
        paid_students[engagement_record['account_key']],
        engagement_record['utc_date'],
    )
]

len(paid_engagement_in_first_week)

6919

In [281]:
# Inspect the first record that passed the first-week filter.
paid_engagement_in_first_week[0]

{'account_key': '0',
 'lessons_completed': 0,
 'num_courses_visited': 1,
 'projects_completed': 0,
 'total_minutes_visited': 11.6793745,
 'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}

In [282]:
from collections import defaultdict
# Group the first-week engagement rows by account key.
engagement_by_account = defaultdict(list)

for engagement in paid_engagement_in_first_week:
    engagement_by_account[engagement['account_key']].append(engagement)
    
    

In [283]:
# Sum 'total_minutes_visited' over each account's first-week rows.
total_minutes_by_account = {
    student_key: sum(row['total_minutes_visited'] for row in student_rows)
    for student_key, student_rows in engagement_by_account.items()
}

In [284]:
import numpy as np

# Summary statistics of the per-account totals.
# The earlier bare `sum(total_minutes)/len(total_minutes)` expression was
# removed: its value was discarded (it was not the cell's last expression)
# and the mean is already printed below.
total_minutes = list(total_minutes_by_account.values())

print('Mean:', np.mean(total_minutes))
print('std:', np.std(total_minutes))
print('min:', np.min(total_minutes))
print('max:', np.max(total_minutes))

Mean: 306.708326753
std: 412.996933409
min: 0.0
max: 3564.7332645


In [285]:
# (account_key, total_minutes) pair with the largest total.
student_with_max_minutes = max(
    total_minutes_by_account.items(),
    key=lambda key_and_total: key_and_total[1],
)

# Print every first-week record of that account; `s` ends up holding the last
# one printed (or None when there is none).
s = None
for d in paid_engagement_in_first_week:
    if d['account_key'] != student_with_max_minutes[0]:
        continue
    print(d)
    s = d
    

{'num_courses_visited': 4, 'total_minutes_visited': 850.519339666, 'account_key': '163', 'utc_date': datetime.datetime(2015, 7, 9, 0, 0), 'projects_completed': 0, 'lessons_completed': 4}
{'num_courses_visited': 6, 'total_minutes_visited': 872.633923334, 'account_key': '163', 'utc_date': datetime.datetime(2015, 7, 10, 0, 0), 'projects_completed': 0, 'lessons_completed': 6}
{'num_courses_visited': 2, 'total_minutes_visited': 777.018903666, 'account_key': '163', 'utc_date': datetime.datetime(2015, 7, 11, 0, 0), 'projects_completed': 0, 'lessons_completed': 6}
{'num_courses_visited': 1, 'total_minutes_visited': 294.568774, 'account_key': '163', 'utc_date': datetime.datetime(2015, 7, 12, 0, 0), 'projects_completed': 0, 'lessons_completed': 2}
{'num_courses_visited': 3, 'total_minutes_visited': 471.2139785, 'account_key': '163', 'utc_date': datetime.datetime(2015, 7, 13, 0, 0), 'projects_completed': 0, 'lessons_completed': 1}
{'num_courses_visited': 2, 'total_minutes_visited': 298.778345333,

In [288]:
# Join date stored in paid_students for account '163'.
paid_students['163']

datetime.datetime(2015, 7, 9, 0, 0)

In [287]:
# Check whether the date of record `s` is within a week of account '108''s join date.
# NOTE(review): `s` is the last record printed for student_with_max_minutes,
# while the join date is looked up for the hard-coded key '108' — confirm the
# two are meant to refer to different accounts. This cell's execution count
# (287) also precedes the cell shown before it (288), so it may be stale.
within_one_week(paid_students['108'], s['utc_date'])

False