In [46]:
import csv

def read_csv(filename):
    """Load a CSV file into a list of per-row dictionaries.

    The first line of the file is treated as the header; each following
    record becomes one dictionary mapping column name to the cell's string
    value.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one dictionary per data row, in file order.
    """
    # newline='' hands raw line endings to the csv module, as its
    # documentation requires; otherwise universal-newline translation
    # rewrites line breaks embedded inside quoted fields.
    with open(filename, newline='') as f:
        return list(csv.DictReader(f))

def update_key(rows, orig, new):
    """Rename the ``orig`` key to ``new`` in each row with a non-empty value.

    Rows are modified in place. A row whose ``orig`` value is falsy (for
    example an empty string) is left out of the result. The input list
    itself is emptied before returning, so the caller must use the returned
    list from then on.

    Args:
        rows: Mutable list of dictionaries; emptied in place by this call.
        orig: Existing key to rename. Must be present in every row.
        new: Key name that replaces ``orig``.

    Returns:
        A new list holding the renamed row dictionaries, in input order.

    Raises:
        KeyError: If a row does not contain ``orig``.
    """
    updated = []
    for row in rows:
        if row[orig]:
            # Pop first, then assign: this stays correct when orig == new,
            # whereas assigning and then deleting would drop the key.
            row[new] = row.pop(orig)
            updated.append(row)
    # Empty the caller's list in place (existing behaviour, kept as-is).
    del rows[:]
    return updated

def get_unique(data, key):
    """Collect the distinct values stored under ``key`` across ``data``.

    Args:
        data: Iterable of dictionaries, each containing ``key``.
        key: Dictionary key whose values are gathered.

    Returns:
        A set of the values found under ``key``.
    """
    return {record[key] for record in data}

def check_enrollments_with_no_engagement(enrollment, engagement):
    """Find enrollments without engagement that were not cancelled same-day.

    An enrollment is kept when its ``account_key`` is absent from
    ``engagement`` and its ``days_to_cancel`` is either the empty string or
    an integer string greater than zero.

    Args:
        enrollment: Iterable of enrollment dictionaries with the keys
            ``account_key`` and ``days_to_cancel``.
        engagement: Container of account keys that have engagement records.

    Returns:
        A list of the matching enrollment dictionaries, in input order.
    """
    flagged = []
    for record in enrollment:
        # Accounts that do have engagement data are never flagged.
        if record['account_key'] in engagement:
            continue
        days = record['days_to_cancel']
        # A non-empty value of zero or less is skipped; '' is kept.
        if days != '' and int(days) <= 0:
            continue
        flagged.append(record)
    return flagged

# Enrollments: total rows versus distinct account keys.
enrollments = read_csv('data/enrollments.csv')
unique_enrollments = get_unique(enrollments, 'account_key')
print(len(enrollments), len(unique_enrollments), sep='\n')

# Daily engagement: rename the 'acct' column to 'account_key' (the key used
# for the other tables here), then compare total rows to distinct accounts.
daily_engagements = read_csv('data/daily_engagement.csv')
daily_engagements = update_key(daily_engagements, 'acct', 'account_key')
unique_daily_engagements = get_unique(daily_engagements, 'account_key')
print(len(daily_engagements), len(unique_daily_engagements), sep='\n')

# Enrollments with no matching engagement account, excluding those whose
# days_to_cancel is zero or less.
problematic_enrollments = check_enrollments_with_no_engagement(
    enrollments, unique_daily_engagements
)
print(
    "Problematic Enrollments: ",
    len(problematic_enrollments),
    problematic_enrollments,
)

# Project submissions: total rows versus distinct account keys.
project_submissions = read_csv('data/project_submissions.csv')
unique_project_submissions = get_unique(project_submissions, 'account_key')
print(len(project_submissions), len(unique_project_submissions), sep='\n')



1640
1302
136240
1237
[{'account_key': '1304', 'status': 'canceled', 'join_date': '2015-01-10', 'cancel_date': '2015-03-10', 'days_to_cancel': '59', 'is_udacity': 'True', 'is_canceled': 'True'}, {'account_key': '1304', 'status': 'canceled', 'join_date': '2015-03-10', 'cancel_date': '2015-06-17', 'days_to_cancel': '99', 'is_udacity': 'True', 'is_canceled': 'True'}, {'account_key': '1101', 'status': 'current', 'join_date': '2015-02-25', 'cancel_date': '', 'days_to_cancel': '', 'is_udacity': 'True', 'is_canceled': 'False'}]
Anomaly:  3
3642
743
