In [None]:
import sys, os
import numpy as np
import csv
from datetime import datetime, timedelta
# for plotting
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('classic')
%matplotlib inline

In [None]:
# Root of the project data. NOTE: this is a machine-specific absolute path;
# change it to point at your local copy before running the notebook.
edu_dir = '/Users/jihyun/research/education/'
course_id = 2755
# Every input file for this course is read from <edu_dir>/data/<course_id>.
data_dir = os.path.join(edu_dir, 'data',str(course_id))
# Function-call form of print works on both Python 2 and Python 3.
print(data_dir)

In [None]:
# Midnight at the start of the observation window (day index 0).
first_day = datetime(year=2016, month=9, day=15)
# Length of the observation window, in days.
days_limit = 90

# Load csv clickstream data

In [None]:
def get_cats_from_url(url, depth=1):
    """
    Map a clickstream URL to a coarse category label.

    Parameters
    ----------
    url : str
        Requested URL. A trailing '/' and any 'api/v1/' fragment are removed
        before the URL is split on '/'.
    depth : int
        Must be 1, 2 or 3. It is validated but not otherwise used by the
        code in this cell.

    Returns
    -------
        str or None
        'homepage' when the normalized URL has fewer than 6 '/'-separated
        parts; None otherwise.

    Raises
    ------
        ValueError
        If depth is not between 1 and 3.
    """
    if not 0 < depth < 4:
        # Raising a bare string is itself a TypeError; raise a real exception.
        raise ValueError('ERROR! depth should be integers between 1 and 3!')

    if url.endswith('/'):
        url = url[:-1]
    url = url.replace('api/v1/', '')

    if len(url.split('/')) < 6:
        return 'homepage'

    # NOTE(review): no category is derived for deeper URLs here, so callers
    # receive None -- confirm whether depth-based extraction is missing.
    return None


def load_csv(deidentified_data_dir):
    """
    Load every de-identified clickstream CSV found in a directory.

    Each file whose name ends with 'csv' is read after skipping its header
    row. Columns are read by position:
        0: random_id, 1: url, 2: action, 3: created_at,
        4: interaction_seconds, last column: ip_address

    The id taken from the last data row of a file is used as the key for the
    whole file (presumably one student per file -- TODO confirm). Files with
    no data rows are skipped.

    Parameters
    ----------
    deidentified_data_dir : str
        Directory containing the CSV files.

    Returns
    -------
        tuple
        (data, n_students, id2idx, idx2id, ip2id) where
        data maps id -> dict of per-click lists ('url', 'category', 'action',
        'created_at', 'interact_secs', 'remote_ip'), each in reverse file
        order; n_students is the number of files loaded; id2idx / idx2id map
        between ids and load-order indices; ip2id maps an ip address to the
        list of ids seen with it.
    """

    print('Loading csv files')
    data = {}
    index = 0
    idx2id = []
    id2idx = {}
    ip2id = {}
    date_format = "%Y-%m-%dT%H:%M:%SZ"
    # you need to subtract 7 hours to get the california time
    # NOTE(review): a fixed 7-hour offset ignores daylight-saving changes -- confirm.
    california_offset = timedelta(0, 25200)

    for fname in os.listdir(deidentified_data_dir):
        if not fname.endswith('csv'):
            continue

        url = []
        category = []
        action = []
        created_at = []
        interact_secs = []
        remote_ip = []
        student_id = None

        csv_file = os.path.join(deidentified_data_dir, fname)
        # 'with' guarantees the file handle is closed, even on a parse error.
        with open(csv_file, 'r') as csv_handle:
            csv_reader = csv.reader(csv_handle)
            next(csv_reader, None)  # skip the header row
            for line in csv_reader:
                if not line:
                    # csv.reader yields [] for blank lines; nothing to record.
                    continue
                student_id = int(line[0])
                ip = line[-1]
                url.append(line[1])
                category.append(get_cats_from_url(line[1]))
                action.append(line[2])
                created_at.append(datetime.strptime(line[3], date_format) - california_offset)
                interact_secs.append(line[4])
                remote_ip.append(ip)
                ids_for_ip = ip2id.setdefault(ip, [])
                if student_id not in ids_for_ip:
                    ids_for_ip.append(student_id)

        if student_id is None:
            # No data rows: there is no id to store this file under.
            continue

        # Lists are stored in reverse file order.
        data[student_id] = {"url": url[::-1], "category": category[::-1], "action": action[::-1],
                            "created_at": created_at[::-1],
                            "interact_secs": interact_secs[::-1], "remote_ip": remote_ip[::-1]}
        id2idx[student_id] = index
        idx2id.append(student_id)
        index += 1
    n_students = index
    print('Finished Loading')
    return data, n_students, id2idx, idx2id, ip2id

In [None]:
# Directory holding the de-identified click data, one level below data_dir.
deid_data_dir = os.path.join(data_dir, 'deidentified')
# Load all click logs together with the id/index/ip lookup tables.
(csv_data,
 n_students,
 id2idx,
 idx2id,
 ip2id) = load_csv(deid_data_dir)

In [None]:
# data checking: list the loaded ids, then dump the record of one known id
# (function-call form of print works on both Python 2 and Python 3)
print(csv_data.keys())
print(csv_data[211456])

# Get number of clicks per day matrix

In [None]:
def get_num_clicks_per_day(student_data, days_limit, first_day, type='all'):
    """
    Count one student's click events per day.

    Parameters
    ----------
    student_data : dict
        dictionary for each student. (One entry of canvas_data.) It should have 'created_at' entry,
        and a parallel 'url' entry when `type` is not 'all'.
    days_limit : int
        Number of days (histogram bins) counted from `first_day`.
    first_day : datetime
        Start of day 0.
    type : str
        'all' (default) counts every click; any other value counts only the
        clicks whose get_cats_from_url(url, depth=1) equals it.

    Returns
    -------
        np.array
        numpy array with length 'days_limit'
        Histogram (counts) of the student as a function of time.
        Clicks before `first_day` or on/after day `days_limit` are ignored.

    """
    hist_array = np.zeros(days_limit, dtype=np.int32)
    count_all = (type == 'all')
    for i, time in enumerate(student_data['created_at']):
        if not count_all and get_cats_from_url(student_data['url'][i], depth=1) != type:
            continue
        day = (time - first_day).days
        # A click before first_day has a negative day number, which would
        # otherwise index from the END of the array and be miscounted.
        if 0 <= day < days_limit:
            hist_array[day] += 1
    return hist_array


def get_num_clicks_per_day_mat(csvdata, n_students, days_limit, first_day, idx2id, type='all'):
    """
    Stack the per-student daily click histograms into one matrix.

    Row k holds the clicks-per-day counts of the student whose id is
    idx2id[k], as computed by get_num_clicks_per_day with the same
    days_limit, first_day and type.

    Returns
    -------
        np.array
        int32 array of shape (n_students, days_limit)

    """
    mat_shape = (n_students, days_limit)
    clicks_per_day_mat = np.zeros(mat_shape, dtype=np.int32)
    row = 0
    for random_id in idx2id:
        clicks_per_day_mat[row] = get_num_clicks_per_day(
            csvdata[random_id], days_limit, first_day, type)
        row += 1
    return clicks_per_day_mat

In [None]:
# (n_students x days_limit) matrix of click counts over all categories.
click_mat = get_num_clicks_per_day_mat(csv_data, n_students, days_limit, first_day, idx2id)
# Function-call form of print works on both Python 2 and Python 3.
print(click_mat)

In [None]:
# Bar chart of the mean number of clicks per student for each day.
plt.bar(range(days_limit), click_mat.mean(axis=0))

# Load the quiz_url description file

In [None]:
qname2closedate = {}  # quiz name -> close date (datetime)
qname2qidx = {}       # quiz name -> quiz index (q_num - 1)

quiz_url_file = os.path.join(data_dir, 'quiz_url_descriptions.csv')
with open(quiz_url_file, 'r') as f:
    reader = csv.reader(f, delimiter=',')
    # next(reader) works on both Python 2 and 3 (reader.next() is Python 2 only).
    header = next(reader)
    for line in reader:
        q_num = int(line[0])
        # Quiz name is the text before the first ':' in the second column.
        q_name = line[1].split(":")[0]
        q_pnts = line[2]  # read but not used in this cell
        q_open_date = datetime.strptime(line[3], "%m/%d/%Y %H:%M:%S")  # parsed but not used in this cell
        q_close_date = datetime.strptime(line[4], "%m/%d/%Y %H:%M:%S")
        qname2closedate[q_name] = q_close_date
        qname2qidx[q_name] = q_num-1

# Build the inverse lookup from the indices themselves, so qidx2qname[i] is
# always the quiz whose index is i even if the rows are not sorted by quiz
# number. Indices with no quiz are left as None.
qidx2qname = [None] * (max(qname2qidx.values()) + 1 if qname2qidx else 0)
for q_name, q_idx in qname2qidx.items():
    qidx2qname[q_idx] = q_name

In [90]:
# Inspect the quiz-name -> close-date mapping.
qname2closedate

{'Lesson 1A': datetime.datetime(2016, 9, 26, 10, 0),
 'Lesson 1B': datetime.datetime(2016, 9, 26, 10, 0),
 'Lesson 1D': datetime.datetime(2016, 9, 28, 10, 0),
 'Lesson 2A': datetime.datetime(2016, 9, 30, 10, 0),
 'Lesson 2B': datetime.datetime(2016, 10, 3, 10, 0),
 'Lesson 2C': datetime.datetime(2016, 10, 5, 10, 0),
 'Lesson 2D': datetime.datetime(2016, 10, 7, 10, 0),
 'Lesson 2E': datetime.datetime(2016, 10, 12, 10, 0),
 'Lesson 3A': datetime.datetime(2016, 10, 19, 10, 0),
 'Lesson 3B': datetime.datetime(2016, 10, 21, 10, 0),
 'Lesson 3C': datetime.datetime(2016, 10, 24, 10, 0),
 'Lesson 3D': datetime.datetime(2016, 10, 26, 10, 0),
 'Lesson 3E': datetime.datetime(2016, 10, 28, 10, 0),
 'Lesson 3F': datetime.datetime(2016, 10, 31, 10, 0),
 'Lesson 3G': datetime.datetime(2016, 11, 2, 10, 0),
 'Lesson 4A': datetime.datetime(2016, 11, 9, 10, 0),
 'Lesson 4B': datetime.datetime(2016, 11, 14, 10, 0),
 'Lesson 4C': datetime.datetime(2016, 11, 18, 10, 0),
 'Lesson 4D': datetime.datetime(2016,

In [91]:
# Inspect the quiz-name -> quiz-index mapping (index is q_num - 1).
qname2qidx

{'Lesson 1A': 0,
 'Lesson 1B': 1,
 'Lesson 1D': 2,
 'Lesson 2A': 3,
 'Lesson 2B': 4,
 'Lesson 2C': 5,
 'Lesson 2D': 6,
 'Lesson 2E': 7,
 'Lesson 3A': 8,
 'Lesson 3B': 9,
 'Lesson 3C': 10,
 'Lesson 3D': 11,
 'Lesson 3E': 12,
 'Lesson 3F': 13,
 'Lesson 3G': 14,
 'Lesson 4A': 15,
 'Lesson 4B': 16,
 'Lesson 4C': 18,
 'Lesson 4D': 17,
 'Lesson 4E': 19,
 'Lesson 4F': 20,
 'Lesson 4G': 21}

# Load the quiz data 

In [None]:
# cid -> {quiz name -> [submit time of attempt 1, attempt 2, attempt 3]};
# a slot stays None when that attempt was not found in the files.
cid2qsubs = {}
max_attempts = 3  # only the first three attempts are stored
quiz_data_dir = os.path.join(data_dir, 'Quiz Data')
for fname in os.listdir(quiz_data_dir):
    # The quiz name is the part of the file name before the first "- ".
    qname = fname.split("- ")[0]
    filepath = os.path.join(quiz_data_dir, fname)

    with open(filepath, 'r') as f:
        reader = csv.reader(f)
        # next(reader) works on both Python 2 and 3 (reader.next() is Python 2 only).
        header = next(reader)
        for line in reader:
            cid = int(line[1])
            tstr = line[6]
            # Shift the UTC timestamp by -7 hours to get local (Pacific) time.
            # NOTE(review): a fixed 7-hour offset ignores daylight-saving changes -- confirm.
            submit_time = datetime.strptime(tstr, "%Y-%m-%d %H:%M:%S UTC") - timedelta(hours=7)
            attempt = int(line[7])-1 # to use as an index, start with 0

            if cid not in cid2qsubs:
                cid2qsubs[cid] = {}
            if qname not in cid2qsubs[cid]:
                cid2qsubs[cid][qname] = [None] * max_attempts

            # The lower bound matters: an attempt number of 0 would give
            # index -1 and silently overwrite the last slot.
            if 0 <= attempt < max_attempts:
                cid2qsubs[cid][qname][attempt] = submit_time
            else:
                print("Skipping  %s %s %s - attempt: %s" % (qname, cid, line[0], attempt+1))

In [93]:
# Sample output for a single student with id 6849:
# quiz name -> list of three submit-time slots (None where no attempt was recorded).
cid2qsubs[6849]

{'Lesson 1A': [datetime.datetime(2016, 9, 26, 13, 54, 18),
  datetime.datetime(2016, 9, 26, 13, 57, 13),
  None],
 'Lesson 1B': [datetime.datetime(2016, 9, 26, 13, 58, 28),
  datetime.datetime(2016, 9, 26, 13, 59, 19),
  None],
 'Lesson 1D': [datetime.datetime(2016, 9, 28, 12, 46, 35),
  datetime.datetime(2016, 9, 28, 12, 54, 23),
  datetime.datetime(2016, 9, 28, 13, 57, 41)],
 'Lesson 2A': [datetime.datetime(2016, 9, 30, 13, 3, 20), None, None],
 'Lesson 2B': [datetime.datetime(2016, 10, 2, 21, 32, 21),
  datetime.datetime(2016, 10, 2, 21, 33, 6),
  None],
 'Lesson 2C': [datetime.datetime(2016, 10, 4, 21, 3, 29),
  datetime.datetime(2016, 10, 4, 21, 5, 45),
  None],
 'Lesson 2D': [datetime.datetime(2016, 10, 5, 19, 49, 18), None, None],
 'Lesson 2E': [datetime.datetime(2016, 10, 10, 2, 14, 19), None, None],
 'Lesson 3A': [datetime.datetime(2016, 10, 19, 12, 57, 50),
  datetime.datetime(2016, 10, 19, 13, 0, 39),
  None],
 'Lesson 3B': [datetime.datetime(2016, 10, 21, 11, 17, 26),
  dat

# Create a quiz submission matrix to save

In [None]:
Nquiz = len(qname2qidx)
slots_per_quiz = 3  # each quiz contributes three submit-time columns
subdates = []
for cid in cid2qsubs.keys():
    # One row per student: the cid followed by three slots for every quiz.
    # The row must be pre-sized to the full width: a slice assignment past
    # the end of a list appends instead of writing at the requested column,
    # which would misalign the quizzes.
    row = [cid] + [None] * (Nquiz * slots_per_quiz)
    for qname, submit_times in cid2qsubs[cid].items():
        qidx = qname2qidx[qname]
        col_st_idx = qidx*slots_per_quiz + 1
        col_end_idx = col_st_idx + slots_per_quiz
        row[col_st_idx:col_end_idx] = submit_times
    subdates.append(row)

In [94]:
# Sample output for row 0: the student's cid followed by the
# submit-time slots of each quiz (None where no attempt was recorded).
subdates[0]

[202240,
 datetime.datetime(2016, 9, 24, 17, 25, 16),
 None,
 None,
 datetime.datetime(2016, 9, 24, 17, 39, 17),
 datetime.datetime(2016, 9, 24, 17, 41, 3),
 datetime.datetime(2016, 9, 24, 17, 42, 54),
 datetime.datetime(2016, 9, 27, 15, 3, 53),
 datetime.datetime(2016, 9, 27, 15, 4, 33),
 None,
 datetime.datetime(2016, 9, 29, 17, 5, 34),
 datetime.datetime(2016, 9, 29, 17, 6, 38),
 None,
 datetime.datetime(2016, 10, 2, 4, 39, 57),
 None,
 None,
 datetime.datetime(2016, 10, 3, 20, 39, 4),
 datetime.datetime(2016, 10, 3, 20, 39, 31),
 None,
 datetime.datetime(2016, 10, 6, 2, 31, 34),
 datetime.datetime(2016, 10, 6, 2, 32, 29),
 None,
 datetime.datetime(2016, 10, 11, 18, 12, 22),
 None,
 None,
 datetime.datetime(2016, 10, 19, 0, 7, 25),
 datetime.datetime(2016, 10, 19, 0, 7, 59),
 None,
 datetime.datetime(2016, 10, 20, 21, 48, 52),
 datetime.datetime(2016, 10, 20, 21, 52, 41),
 datetime.datetime(2016, 10, 20, 21, 55, 12),
 datetime.datetime(2016, 10, 21, 21, 46, 10),
 None,
 None,
 datet

In [95]:
# create header: 'canvasid' followed by quiz<i>submit<j> for each quiz i
# (1..Nquiz) and each of its three submission slots j (1..3)
header = ["canvasid"] + ["quiz"+str(i+1)+"submit"+str(j+1) for i in range(Nquiz) for j in range(3)]
# Function-call form of print works on both Python 2 and Python 3.
print(header)

['canvasid', 'quiz1submit1', 'quiz1submit2', 'quiz1submit3', 'quiz2submit1', 'quiz2submit2', 'quiz2submit3', 'quiz3submit1', 'quiz3submit2', 'quiz3submit3', 'quiz4submit1', 'quiz4submit2', 'quiz4submit3', 'quiz5submit1', 'quiz5submit2', 'quiz5submit3', 'quiz6submit1', 'quiz6submit2', 'quiz6submit3', 'quiz7submit1', 'quiz7submit2', 'quiz7submit3', 'quiz8submit1', 'quiz8submit2', 'quiz8submit3', 'quiz9submit1', 'quiz9submit2', 'quiz9submit3', 'quiz10submit1', 'quiz10submit2', 'quiz10submit3', 'quiz11submit1', 'quiz11submit2', 'quiz11submit3', 'quiz12submit1', 'quiz12submit2', 'quiz12submit3', 'quiz13submit1', 'quiz13submit2', 'quiz13submit3', 'quiz14submit1', 'quiz14submit2', 'quiz14submit3', 'quiz15submit1', 'quiz15submit2', 'quiz15submit3', 'quiz16submit1', 'quiz16submit2', 'quiz16submit3', 'quiz17submit1', 'quiz17submit2', 'quiz17submit3', 'quiz18submit1', 'quiz18submit2', 'quiz18submit3', 'quiz19submit1', 'quiz19submit2', 'quiz19submit3', 'quiz20submit1', 'quiz20submit2', 'quiz20subm

In [None]:
# Write the submission matrix to disk: one header line, then one line per student.
outfile = "./quiz_submissions.csv"
with open(outfile, 'w') as out_handle:
    csv_out = csv.writer(out_handle, delimiter=",")
    csv_out.writerow(header)
    for submission_row in subdates:
        csv_out.writerow(submission_row)