Data Analysis basic steps
* Question phase
* Data wrangling (data acquisition, data cleaning)
* Data exploration (build intuition, find patterns)
* Once farmiliar may want to draw some conclusions or make predictions - statistics, machine learning, etc)
* Communicate findings, blogs, papers, email, presentations, conversations (includes data viz, etc.)

Steps may mix around

In [1]:
# open a file
import unicodecsv

with open ('enrollments.csv', 'rb') as f_in:
    reader = unicodecsv.DictReader(f_in)
    enrollments = list(reader)
# print(enrollments)

# note, this creates a list of dicts
print(type(enrollments))
print(type(enrollments[0]))
# note, cancel_date is just a string (will be changed later)
print(enrollments[0])

<class 'list'>
<class 'dict'>
{'cancel_date': '2015-01-14', 'status': 'canceled', 'is_canceled': 'True', 'account_key': '448', 'is_udacity': 'True', 'days_to_cancel': '65', 'join_date': '2014-11-10'}


In [2]:
# make a function to open files
import unicodecsv

def read_csv(filename):
    with open(filename, 'rb') as f:
        reader = unicodecsv.DictReader(f)
        return list(reader)

enrollments = read_csv('enrollments.csv')
daily_engagement = read_csv('daily_engagement.csv')
project_submissions = read_csv('project_submissions.csv')
# print(enrollments)

In [3]:
# functions for fixing data
from datetime import datetime as dt

# converts string to datetime obj. If no date, returns none
def parse_date(date):
    if date == '':
        return None
    else:
        return dt.strptime(date, '%Y-%m-%d')
    
def parse_maybe_int(i):
    if i == '':
        return None
    else:
        return int(i)

In [4]:
# enrollments before conversion
enrollments[0]

{'account_key': '448',
 'cancel_date': '2015-01-14',
 'days_to_cancel': '65',
 'is_canceled': 'True',
 'is_udacity': 'True',
 'join_date': '2014-11-10',
 'status': 'canceled'}

In [5]:
# clean up enrollments table
for enrollment in enrollments:
    enrollment['cancel_date'] = parse_date(enrollment['cancel_date'])
    enrollment['days_to_cancel'] = parse_maybe_int(enrollment['days_to_cancel'])
    enrollment['is_canceled'] = enrollment['is_canceled'] == 'True'
    enrollment['is_udacity'] = enrollment['is_udacity'] == 'True'
    enrollment['join_date'] = parse_date(enrollment['join_date'])

In [6]:
# enrollments after conversions
enrollments[0]

{'account_key': '448',
 'cancel_date': datetime.datetime(2015, 1, 14, 0, 0),
 'days_to_cancel': 65,
 'is_canceled': True,
 'is_udacity': True,
 'join_date': datetime.datetime(2014, 11, 10, 0, 0),
 'status': 'canceled'}

In [7]:
# daily_engagement before
daily_engagement[0]

{'acct': '0',
 'lessons_completed': '0.0',
 'num_courses_visited': '1.0',
 'projects_completed': '0.0',
 'total_minutes_visited': '11.6793745',
 'utc_date': '2015-01-09'}

In [8]:
# clean up the engagements table
for engagement_record in daily_engagement:
    engagement_record['lessons_completed'] = int(float(engagement_record['lessons_completed']))
    engagement_record['num_courses_visited'] = int(float(engagement_record['num_courses_visited']))
    engagement_record['projects_completed'] = int(float(engagement_record['projects_completed']))
    engagement_record['total_minutes_visited'] = float(engagement_record['total_minutes_visited'])
    engagement_record['utc_date'] = parse_date(engagement_record['utc_date'])

In [9]:
# daily_engagement after
daily_engagement[0]

{'acct': '0',
 'lessons_completed': 0,
 'num_courses_visited': 1,
 'projects_completed': 0,
 'total_minutes_visited': 11.6793745,
 'utc_date': datetime.datetime(2015, 1, 9, 0, 0)}

In [10]:
# project_submissions before
project_submissions[0]

{'account_key': '256',
 'assigned_rating': 'UNGRADED',
 'completion_date': '2015-01-16',
 'creation_date': '2015-01-14',
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

In [11]:
# clean up the submissions table
for submission in project_submissions:
    submission['completion_date'] = parse_date(submission['completion_date'])
    submission['creation_date'] = parse_date(submission['creation_date'])

In [12]:
# project_submissions after
project_submissions[0]

{'account_key': '256',
 'assigned_rating': 'UNGRADED',
 'completion_date': datetime.datetime(2015, 1, 16, 0, 0),
 'creation_date': datetime.datetime(2015, 1, 14, 0, 0),
 'lesson_key': '3176718735',
 'processing_state': 'EVALUATED'}

In [13]:
# change 'acct' in the daily_engagements table to 'account_key'
for engagement_record in daily_engagement:
    engagement_record['account_key'] = engagement_record['acct']
    del(engagement_record['acct'])
daily_engagement[12]

{'account_key': '0',
 'lessons_completed': 0,
 'num_courses_visited': 0,
 'projects_completed': 0,
 'total_minutes_visited': 0.0,
 'utc_date': datetime.datetime(2015, 1, 21, 0, 0)}

In [14]:
# find number of enrollments
len(enrollments)

1640

In [15]:
# find number of unique enrollments using pure python
unique_enrolled_students = set()
for enrollment in enrollments:
    unique_enrolled_students.add(enrollment['account_key'])
len(unique_enrolled_students)

1302

In [16]:
# create function to find unique enrollments
def get_unique_students(data):
    unique_students = set()
    for data_point in data:
        unique_students.add(data_point['account_key'])
    return unique_students

In [17]:
# use the function to find unique enrollments
print("unique enrollments: %s" %len(get_unique_students(enrollments)))

unique enrollments: 1302


In [18]:
# use the function to find unique engagements
print("unique engagements: %s" %len(get_unique_students(daily_engagement)))

unique engagements: 1237


Why are there less in daily_engement than in enrollments? We'd expect them to be the same. Printing out a row that is in enrollments but not in daily_engagement may help identify what's going on:

In [19]:
# first make a set with all the unique identifiers for daily_engagement
unique_engagement_students = get_unique_students(daily_engagement)

In [20]:
# use a loop to find what I'm looking for
for enrollment in enrollments:
    student = enrollment['account_key']
    if student not in unique_engagement_students:
        print(enrollment)
        break

{'cancel_date': datetime.datetime(2014, 11, 12, 0, 0), 'status': 'canceled', 'is_canceled': True, 'account_key': '1219', 'is_udacity': False, 'days_to_cancel': 0, 'join_date': datetime.datetime(2014, 11, 12, 0, 0)}


Notice that the join_date and cancel_date are the same, and the days to cancel is equal to zero. That could explain why there is no record in the engagement table for this student. The student may need to be enrolled for a full day before their engagement is recorded. 

So how many students are in the enrollment table but not in the engagement table, and enrolled and canceled on the same day? The below function finds this out:

In [21]:
num_problem_students = 0
for enrollment in enrollments:
    student = enrollment['account_key']
    if (student not in unique_engagement_students and enrollment['join_date'] != enrollment['cancel_date']):
        print(enrollment)
        num_problem_students += 1

num_problem_students

{'cancel_date': datetime.datetime(2015, 3, 10, 0, 0), 'status': 'canceled', 'is_canceled': True, 'account_key': '1304', 'is_udacity': True, 'days_to_cancel': 59, 'join_date': datetime.datetime(2015, 1, 10, 0, 0)}
{'cancel_date': datetime.datetime(2015, 6, 17, 0, 0), 'status': 'canceled', 'is_canceled': True, 'account_key': '1304', 'is_udacity': True, 'days_to_cancel': 99, 'join_date': datetime.datetime(2015, 3, 10, 0, 0)}
{'cancel_date': None, 'status': 'current', 'is_canceled': False, 'account_key': '1101', 'is_udacity': True, 'days_to_cancel': None, 'join_date': datetime.datetime(2015, 2, 25, 0, 0)}


3

Turns out that is_udacity here indicates it's a test account. Removing them:

In [22]:
# create set of Udacity test accounts
udacity_test_accounts = set()
for enrollment in enrollments:
    if enrollment['is_udacity']:
        udacity_test_accounts.add(enrollment['account_key'])
len(udacity_test_accounts)

6

In [23]:
# remove the test accounts
def remove_udacity_accounts(data):
    non_udacity_data = []
    for data_point in data:
        if data_point['account_key'] not in udacity_test_accounts:
            non_udacity_data.append(data_point)
    return non_udacity_data

In [24]:
# call function on all three tables and check how many records left in each
non_udacity_enrollments = remove_udacity_accounts(enrollments)
non_udacity_engagement = remove_udacity_accounts(daily_engagement)
non_udacity_submissions = remove_udacity_accounts(project_submissions)

print(len(non_udacity_enrollments))
print(len(non_udacity_engagement))
print(len(non_udacity_submissions))

1622
135656
3634


In [25]:
# create a dict of students who are not canceled and enrolled for 7+ days
paid_students = {}
for enrollment in non_udacity_enrollments:
    if not enrollment['is_canceled'] or enrollment['days_to_cancel'] > 7:
        account_key = enrollment['account_key']
        enrollment_date = enrollment['join_date']
        paid_students[account_key] = enrollment_date
        
        if account_key not in paid_students or enrollment_date > paid_students[account_key]:
            paid_students[account_key] = enrollment_date
        
len(paid_students)

995

In [26]:
# tests whether sutdent was enrolled less than 7 days
def within_one_week(join_date, engagement_date):
    time_delta = engagement_date - join_date
    return time_delta.days < 7

In [27]:
# removes free trial cancels
def remove_free_trial_cancels(data):
    new_data = []
    for data_point in data:
        if data_point['account_key'] in paid_students:
            new_data.append(data_point)
    return new_data

In [28]:
paid_enrollments = remove_free_trial_cancels(non_udacity_enrollments)
paid_engagement = remove_free_trial_cancels(non_udacity_engagement)
paid_submissions = remove_free_trial_cancels(non_udacity_submissions)

print(len(paid_enrollments))
print(len(paid_engagement))
print(len(paid_submissions))

1293
134549
3618


In [29]:
paid_engagement_in_first_week = []

for engagement_record in paid_engagement:
    account_key = engagement_record['account_key']
    join_date = paid_students[account_key]
    engagement_record_date = engagement_record['utc_date']
    
    if within_one_week(join_date, engagement_record_date):
        paid_engagement_in_first_week.append(engagement_record)
len(paid_engagement_in_first_week)

# note, I'm getting lower than the 21508 in video

17210