## Data Acquisition
### – Load Data from CSVs

In [1]:
import unicodecsv

In [2]:
import unicodecsv

def readCSV(filename):
    """
    Load every row of a CSV file into memory.

    The file is opened in binary mode and parsed with unicodecsv.DictReader,
    so each row is a mapping keyed by the column names of the file.

    INPUT : path of the csv file
    OUTPUT: list of row mappings
    """
    with open(filename, 'rb') as csv_file:
        # the reader is lazy, so the rows must be collected before the file closes
        rows = list(unicodecsv.DictReader(csv_file))
    return rows

In [3]:
enrollments = readCSV('enrollments.csv') # data type: list
enrollments[0] # shows first row with col name from data file 

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', '2014-11-10'),
             ('cancel_date', '2015-01-14'),
             ('days_to_cancel', '65'),
             ('is_udacity', 'True'),
             ('is_canceled', 'True')])

In [4]:
enrollments[0]['account_key'] # shows first row x first col data

'448'

In [5]:
daily_engagement = readCSV('daily_engagement.csv')
daily_engagement[0]

OrderedDict([('acct', '0'),
             ('utc_date', '2015-01-09'),
             ('num_courses_visited', '1.0'),
             ('total_minutes_visited', '11.6793745'),
             ('lessons_completed', '0.0'),
             ('projects_completed', '0.0')])

In [6]:
project_submissions = readCSV('project_submissions.csv')
project_submissions[0] 

OrderedDict([('creation_date', '2015-01-14'),
             ('completion_date', '2015-01-16'),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

## Data Cleaning 
### – Fix Data Types to Suitable Format

In [7]:
from datetime import datetime as dt

In [8]:
def convertDatetime(date):
    """
    Parses a 'YYYY-MM-DD' string into a Python datetime object.
    An empty string stands for a missing date and gives None.
    
    INPUT : string
    OUTPUT: datetime obj or None
    """
    return dt.strptime(date, '%Y-%m-%d') if date != '' else None

In [9]:
def convertInt(i):
    """
    Turns the text of a whole number into an int.
    An empty string, which marks a missing value, becomes None.
    
    INPUT : string
    OUTPUT: int or None
    """
    if i != '':
        return int(i)
    return None

In [10]:
# Convert the raw CSV strings of every enrollment row to proper Python types.
# Run this cell only once per load: on a second pass the values are no longer
# strings and the date conversion fails.

for student in enrollments:
    # dates: str -> datetime obj (None when the field is empty)
    student['join_date'] = convertDatetime(student['join_date'])
    student['cancel_date'] = convertDatetime(student['cancel_date'])
    # day count: str -> int (None when the field is empty)
    student['days_to_cancel'] = convertInt(student['days_to_cancel'])
    # flags: str -> bool, True only for the exact text 'True'
    student['is_udacity'] = student['is_udacity'] == 'True'
    student['is_canceled'] = student['is_canceled'] == 'True'

In [11]:
enrollments[0] # shows data in suitable/meaningful format

OrderedDict([('account_key', '448'),
             ('status', 'canceled'),
             ('join_date', datetime.datetime(2014, 11, 10, 0, 0)),
             ('cancel_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('days_to_cancel', 65),
             ('is_udacity', True),
             ('is_canceled', True)])

In [12]:
# Convert the raw CSV strings of every daily engagement row to proper Python types.

for record in daily_engagement:
    record['utc_date'] = convertDatetime(record['utc_date'])                  # str -> datetime obj
    record['total_minutes_visited'] = float(record['total_minutes_visited'])  # str -> float
    # the counts are stored as text such as '1.0': str -> float -> int
    for column in ('num_courses_visited', 'lessons_completed', 'projects_completed'):
        record[column] = int(float(record[column]))

In [13]:
daily_engagement[0]

OrderedDict([('acct', '0'),
             ('utc_date', datetime.datetime(2015, 1, 9, 0, 0)),
             ('num_courses_visited', 1),
             ('total_minutes_visited', 11.6793745),
             ('lessons_completed', 0),
             ('projects_completed', 0)])

In [14]:
# Convert the two date fields of every submission row from str to datetime obj.

for submission in project_submissions:
    for date_field in ('completion_date', 'creation_date'):
        submission[date_field] = convertDatetime(submission[date_field])

In [15]:
project_submissions[0]

OrderedDict([('creation_date', datetime.datetime(2015, 1, 14, 0, 0)),
             ('completion_date', datetime.datetime(2015, 1, 16, 0, 0)),
             ('assigned_rating', 'UNGRADED'),
             ('account_key', '256'),
             ('lesson_key', '3176718735'),
             ('processing_state', 'EVALUATED')])

## Questions
### – After briefly looking at the data, what interesting question can we ask?

- How long does each student take to submit their projects?
- How do students who pass their projects differ from those who don't in terms of daily engagement?
- How many courses do students visit, and for how long, before they cancel?
- Among visited courses, how many lessons and projects were completed?
- Is there any relationship between the number of completed lessons and the number of completed projects?
- How does the daily engagement of each student change over time?
- How many times do students submit their projects before they pass?

## Explore Data

### – A closer look at the data

In [None]:
# Show the type of the table and the type of a single row.
# The row is taken from the table itself rather than from a loop variable left
# over from another cell, whose value depends on which cell ran last.
print(type(enrollments))
print(type(enrollments[0]))

In [None]:
# Show the size of the table: rows in the list, columns in one row.
# The row is taken from the table itself rather than from a loop variable left
# over from another cell, whose value depends on which cell ran last.
print("total number of rows:", len(enrollments))
print("total number of columns:", len(enrollments[0]))

In [23]:
def studentCounter(data):
    """
    Returns the total number of rows in the data set
    and the number of unique account keys.

    The account key is read from the 'account_key' column; a row that lacks
    that column falls back to 'acct', so the same function works on all
    three tables. A row with neither column raises KeyError.
    
    INPUT : list of dict-like rows
    OUTPUT: tuple (number of rows, number of unique account keys)
    """
    
    unique = set() # "set" is an unordered collection of unique elements

    for row in data:
        try:
            key = row['account_key']
        except KeyError:
            # Only a missing column triggers the fallback; any other error
            # (for example an unhashable key) is a real problem and must surface.
            key = row['acct']
        unique.add(key)

    return (len(data), len(unique))

In [24]:
studentCounter(enrollments) # among 1640 student info, there are 338(1640-1302) duplicated ones

(1640, 1302)

In [25]:
studentCounter(daily_engagement) # 136240 daily records info is about 1237 unique students

(136240, 1237)

In [26]:
studentCounter(project_submissions) # total of 3642 submissions are from 743 students

(3642, 743)

### – Observations

- The number of rows is bigger than the number of unique students.
    - This might be because some students enrolled, cancelled, and re-enrolled with the same account info.
- __The number of unique students in daily engagement is smaller than the number of enrolled students.__
    - This might be because some students enrolled in the course but haven't started anything yet.
    - Still, the number of unique students in enrollments and daily engagement should match!
    - _We are going to fix this below_
- Only about half of the enrolled students submitted projects.
- __There are two different column names that hold the same type of info: 'account_key' and 'acct'__
    - _We are going to fix this below_


## More Data Cleaning
### – Multiple Column Names for the Same Data

In [16]:
# Rename the "acct" column in the daily_engagement table to "account_key".
# Rows that no longer have an "acct" column are left alone, so this cell can be
# run again without raising KeyError.

for record in daily_engagement:
    if 'acct' in record:
        # pop() removes the old column and hands back its value in one step
        record['account_key'] = record.pop('acct')

In [17]:
daily_engagement[0]['account_key'] # getting data with the new column name works

'0'

__Now, the "counter" function doesn't have to be hacky. <br> It can simply look for data points in "account_key" columns.__

In [18]:
def uniqueStudent(data):
    """
    Collects the distinct 'account_key' values found in the rows of data.
    
    INPUT : list
    OUTPUT: set
    """
    return {row['account_key'] for row in data}

In [19]:
type(uniqueStudent(enrollments))

set

### – Find enrollment records without corresponding daily engagement data and output that enrollment

In [20]:
uniqueEnrolledStudents = uniqueStudent(enrollments)
uniqueActiveStudents = uniqueStudent(daily_engagement)

In [21]:
len(uniqueEnrolledStudents)

1302

In [22]:
len(uniqueActiveStudents)

1237

In [30]:
# nonActiveAccount list holds the unique account_key of every student who is
# enrolled but doesn't have a daily engagement record.
# A comprehension is used so that (in Python 3) no notebook-level loop variable
# is rebound to an account key string as a side effect.
nonActiveAccount = [
    account_key
    for account_key in uniqueEnrolledStudents
    if account_key not in uniqueActiveStudents
]

In [31]:
len(nonActiveAccount) # there are 65 enrolled students that has no activity records, 1302-1237 = 65 

65

In [60]:
print(sorted(nonActiveAccount)) # shows every non-active account key, sorted as strings (so '1010' comes before '654')

['1010', '1025', '1044', '1063', '1069', '1079', '1086', '1101', '1120', '1125', '1129', '1145', '1148', '1155', '1171', '1186', '1190', '1191', '1213', '1218', '1219', '1222', '1237', '1238', '1241', '1270', '1273', '1284', '1291', '1304', '654', '664', '707', '711', '717', '725', '727', '728', '733', '737', '739', '749', '750', '766', '789', '799', '802', '803', '817', '819', '841', '870', '871', '875', '878', '884', '889', '902', '914', '926', '964', '968', '981', '996', '997']


In [None]:
# Scratch notes (not runnable code): account keys that appear more than once
# among the enrollment rows of students without engagement records.
#   '1129' - two rows
#   '1304' - two rows
#   '654'  - three rows
#   '819'  - two rows
#   '914'  - two rows
# These 6 extra rows account for the gap between the 71 rows and the 65 unique keys.




In [38]:
# Collect the full enrollment row of every student without engagement data.
nonActiveAccount2 = []

for student in enrollments:
    if student['account_key'] in uniqueActiveStudents:
        continue  # this account has engagement data
    nonActiveAccount2.append(student)

In [39]:
len(nonActiveAccount2) # why is this length different from the 65 unique keys?
                    # do some students with no activity record have multiple enrollment records?

71

In [54]:
# Pull the account key out of each collected enrollment row.
# The rows are iterated directly instead of through range(len(...)) indices.
temp = [row['account_key'] for row in nonActiveAccount2]

In [61]:
sorted(temp)

['1010',
 '1025',
 '1044',
 '1063',
 '1069',
 '1079',
 '1086',
 '1101',
 '1120',
 '1125',
 '1129',
 '1129',
 '1145',
 '1148',
 '1155',
 '1171',
 '1186',
 '1190',
 '1191',
 '1213',
 '1218',
 '1219',
 '1222',
 '1237',
 '1238',
 '1241',
 '1270',
 '1273',
 '1284',
 '1291',
 '1304',
 '1304',
 '654',
 '654',
 '654',
 '664',
 '707',
 '711',
 '717',
 '725',
 '727',
 '728',
 '733',
 '737',
 '739',
 '749',
 '750',
 '766',
 '789',
 '799',
 '802',
 '803',
 '817',
 '819',
 '819',
 '841',
 '870',
 '871',
 '875',
 '878',
 '884',
 '889',
 '902',
 '914',
 '914',
 '926',
 '964',
 '968',
 '981',
 '996',
 '997']

In [57]:
# Count how many of the collected account keys (duplicates included)
# also appear among the unique non-active account keys.
known_keys = set(nonActiveAccount)  # built once so each membership test is O(1)
cnt_there = sum(1 for key in temp if key in known_keys)

print(cnt_there)

71


In [58]:
# Count how many of the collected account keys (duplicates included)
# are missing from the unique non-active account keys.
known_keys = set(nonActiveAccount)  # built once so each membership test is O(1)
cnt_notthere = sum(1 for key in temp if key not in known_keys)

print(cnt_notthere)

0


__Just looking at the account keys won't tell us much. <br>
Having the enrollment information for those non-active students, as below, might be more helpful.__

In [33]:
# nonActiveStudents list holds the enrollment rows of students who are enrolled
# but don't have any daily engagement record.
nonActiveStudents = []

for student in enrollments:
    if student['account_key'] in uniqueActiveStudents:
        continue  # this account has engagement data
    nonActiveStudents.append(student)

In [34]:
len(nonActiveStudents)  # 71 rather than 65: this counts enrollment rows, and one account key can appear on several rows

71

In [35]:
print(nonActiveStudents[:10]) 

[OrderedDict([('account_key', '1219'), ('status', 'canceled'), ('join_date', datetime.datetime(2014, 11, 12, 0, 0)), ('cancel_date', datetime.datetime(2014, 11, 12, 0, 0)), ('days_to_cancel', 0), ('is_udacity', False), ('is_canceled', True)]), OrderedDict([('account_key', '871'), ('status', 'canceled'), ('join_date', datetime.datetime(2014, 11, 13, 0, 0)), ('cancel_date', datetime.datetime(2014, 11, 13, 0, 0)), ('days_to_cancel', 0), ('is_udacity', False), ('is_canceled', True)]), OrderedDict([('account_key', '1218'), ('status', 'canceled'), ('join_date', datetime.datetime(2014, 11, 15, 0, 0)), ('cancel_date', datetime.datetime(2014, 11, 15, 0, 0)), ('days_to_cancel', 0), ('is_udacity', False), ('is_canceled', True)]), OrderedDict([('account_key', '654'), ('status', 'canceled'), ('join_date', datetime.datetime(2014, 12, 4, 0, 0)), ('cancel_date', datetime.datetime(2014, 12, 4, 0, 0)), ('days_to_cancel', 0), ('is_udacity', False), ('is_canceled', True)]), OrderedDict([('account_key', '6