# IVLE Python Workshop
-------------------------------
By Jihyun Park (`jihyunp@ics.uci.edu`)<br>
Department of Computer Science, University of California, Irvine<br>
June 2, 2017

# Outline
-----------------------
## Part 1 : Load Data
## Part 2 : Convert the Data into a Matrix
## Part 3 : Save Matrix into a csv File
## Part 4 : Plotting with `matplotlib.pyplot` 
## Part 5 : Different Click Matrices

# Requirements
----------------------------

# Part 1 : Load Data
---------------------------

## 1.1 URL, String
### Split function

In [None]:
# Two example URLs: a site root and a page underneath it.
soe_url = "http://education.uci.edu"
soe_phd_url = "http://education.uci.edu/phd-in-education.html"

# str.split("/") cuts the string at every "/" and returns the pieces as a list.
# The "//" after "http:" yields an empty string between the two slashes.
print soe_url.split("/")
print soe_phd_url.split("/")

In [None]:
url = "http://education.uci.edu"
split_list = url.split("/")  # -> ['http:', '', 'education.uci.edu']
print split_list
print len(split_list)  # number of pieces in the list: 3

In [None]:
# Function that returns the length of the URL, split with "/"
# Make url as an argument

def get_slash_split_len(url):
    """Return how many pieces `url` breaks into when it is split on "/"."""
    return len(url.split("/"))

In [None]:
# Test: "a/b/c/d/e" contains 4 slashes, so it splits into 5 pieces.
get_slash_split_len("a/b/c/d/e")

Take a look at the real URLs from the csv files. <br>
We want to categorize the URLs into categories such as 'homepage', 'files', 'assignments', etc. <br>
The easiest way to do this is to take the 6th element (index 5) of the list obtained by splitting the URL on the delimiter "/".

In [None]:
# Three example Canvas URLs, each annotated with the category we want to assign to it.
url_file = "https://canvas.eee.uci.edu/courses/2230/files/742190?module_item_id=62039"  # -> categorize as "files"
url_assignment = "https://canvas.eee.uci.edu/courses/2230/assignments/49367/submissions"  # -> categorize as "assignments"
url_home = "https://canvas.eee.uci.edu/courses/2230"  # -> categorize as "homepage"

In [None]:
# Homepage : Only 4 slashes exist (the URL splits into 5 pieces). We can filter this case out.

# Others : 5 or more slashes exist (6 or more pieces); the category is the piece at index 5.

In [None]:
def get_cats_from_url(url):
    """Return the activity category encoded in a Canvas course URL.

    The URL is expected to look like
    ``https://<host>/courses/<course_id>/<category>/...``. Splitting it on
    "/" puts ``<category>`` at index 5.

    Parameters
    ----------
    url : str
        URL of a click event.

    Returns
    -------
    str
        ``'homepage'`` when nothing follows the course id, otherwise the
        path piece that comes right after the course id (e.g. ``'files'``).
    """
    # Ignore the query string / fragment: ".../2230/files?x=1" is 'files',
    # not 'files?x=1', and slashes inside a query must not count as path pieces.
    path = url.split('?', 1)[0].split('#', 1)[0]
    # Ignore trailing slashes: ".../courses/2230/" is still the homepage,
    # not a category named ''.
    parts = path.rstrip('/').split('/')
    if len(parts) < 6:
        return 'homepage'
    return parts[5]

In [None]:
# Test
print get_cats_from_url(url_file)  # expected: files
print get_cats_from_url(url_home)  # expected: homepage

## 1.2 `datetime`

More information at : https://docs.python.org/2/library/datetime.html#strftime-and-strptime-behavior

In [None]:
from datetime import datetime, timedelta

In [None]:
# Build a datetime by hand from year, month, day (the time part defaults to midnight).
dt1 = datetime(2016, 7, 20)
print dt1  # human-readable form
dt1  # a bare expression on the last line of a notebook cell is displayed as its repr

### Extracting date time information and create a datetime object

In [None]:
# From the string, we want to extract the date and time information.
example_date = "2016-07-24T21:36:07Z"
# Format codes: %Y year, %m month, %d day, %H hour, %M minute, %S second.
# The "T" and "Z" are plain characters that must appear at those positions in the string.
dt_format = "%Y-%m-%dT%H:%M:%SZ"

# strptime parses the string according to the format and returns a datetime object.
dt2 = datetime.strptime(example_date, dt_format)
print dt2
dt2

### `timedelta`

In [None]:
# Subtracting one datetime from another gives a timedelta (the time elapsed between them).
dt_delta = dt2 - dt1
print dt_delta
# total_seconds() expresses the whole difference as a number of seconds.
tot_seconds = dt_delta.total_seconds()
print tot_seconds

## 1.3 Accessing File/Folder Paths

All the folders and files have paths that are similar to URLs. <br>
We can access the files and folders using the paths. 

### List files in a directory :  `os.listdir`

In [None]:
import os
# Returns the names (not the full paths) of the entries inside the given directory.
os.listdir("/Users/jihyun/research/education/data/2230/deidentified")

### Concatenate folder/file paths : `os.path.join`

You can also do this by string concatenation (using `+`), but this is less confusing and is less prone to errors.

In [None]:
directory = '/Users/jihyun/research/education/data/2230/deidentified'
file_name = '199241.csv'
# Joins the pieces with the path separator of the operating system to form the full path of the file.
os.path.join(directory, file_name)

## 1.4 Dictionary

Access `value` using `key`. <br>
Unlike a list index, a `key` doesn't have to be an integer.<br>
Let's say we want to have a dictionary where the `key` is a `string` and the `value` is a `list`

In [None]:
# Different ways to define the same dictionary
# 1. Write all key:value pairs at once inside curly braces.
d = {"a":[1,2,3], "b":[4,5,6]}

# 2. Start from an empty dictionary and add one key at a time.
d = {} # equivalent to d = dict()
d["a"] = [1,2,3]
d["b"] = [4,5,6]

### Example
Store the following data into a dictionary

| random_id  | URL           | created_at  |
| - |------------- | ----|
| 123  |  https://canvas.eee.uci.edu/courses/2230/files |2016-07-24T21:36:07Z |
| 123  |  https://canvas.eee.uci.edu/courses/2230/    |2016-08-01T17:29:47Z |

In [None]:
# One key per table column; each value is a list with one item per table row,
# so item i of every list belongs to the same row.
d1 = {} 
d1["url"] = ["https://canvas.eee.uci.edu/courses/2230/files", "https://canvas.eee.uci.edu/courses/2230/"]
d1["created_at"] = ["2016-07-24T21:36:07Z", "2016-08-01T17:29:47Z"]
print d1

## 1.5 Load data from a csv file

In [None]:
import csv

In [None]:
# The full path could also be written out directly:
# csv_file = '/Users/jihyun/research/education/data/2230/deidentified/104500.csv'

deidentified_data_dir = '/Users/jihyun/research/education/data/2230/deidentified'
csv_file = os.path.join(deidentified_data_dir, '104500.csv')  

# "with" closes the file automatically when the indented block ends.
with open(csv_file, 'r') as f:
    # csv.reader yields each row of the file as a list of strings, one string per column.
    reader = csv.reader(f, delimiter=',')
    for line in reader:
        print line
#         print line[1]

In [None]:
with open(csv_file, 'r') as f:
    reader = csv.reader(f, delimiter=',')
    header = reader.next()
    for line in reader:
        rid = int(line[0])
        url = line[1]
        timestamp = line[3]
        ip = line[5]
        print rid, url, timestamp, ip

### Things to add
1. Create another key `category` and store the categories using the URL and the function `get_cats_from_url()`.<br>
2. Use `datetime.strptime()` to convert the string into a datetime object, and then store that datetime object instead of string.

In [None]:
dt_format = "%Y-%m-%dT%H:%M:%SZ"

# One list per stored field; item i of every list describes the same click event.
url_list, category_list, time_list, ip_list = [], [], [], []

with open(csv_file, 'r') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)  # column names, not data
    for line in reader:
        rid = int(line[0])
        url, timestamp, ip = line[1], line[3], line[5]
        # Derived fields: category from the URL, datetime object from the timestamp string.
        cat = get_cats_from_url(url)
        dt = datetime.strptime(timestamp, dt_format)

        url_list.append(url)
        time_list.append(dt)  # store the datetime object instead of the raw string
        ip_list.append(ip)
        category_list.append(cat)

student_data = {
    "url": url_list,
    "created_at": time_list,
    "ip": ip_list,
    "category": category_list,
}
student_data
student_data.keys()

In [None]:
# Display the whole dictionary built for this student.
student_data

## 1.6 Load multiple csv files from a folder

We have a .csv file for each student.<br>
Each csv file has columns such as `url`, `created_at` (timestamp), `remote_ip` (IP address), etc.

`student_data1 = {"url":[url1, url2, ...], "created_at":[time1, time2, ...], "remote_ip":[ip1, ip2, ip3, ..], ...}`

The total data will be a dictionary, where the keys are the student IDs and the values are the data of each student (the `student_data` above). <br>
If the ID of the first three students were 1,2 and 3, the total data would look something like below.

`data = {1:student_data1, 2:student_data2, 3:student_data3, ...}`

In [None]:
deidentified_data_dir = '/Users/jihyun/research/education/data/2230/deidentified'

# os.listdir gives bare file names; join each with the folder to get a path that open() can use.
for filename in os.listdir(deidentified_data_dir):
    csv_file = os.path.join(deidentified_data_dir, filename)
    print csv_file

In [None]:
deidentified_data_dir = '/Users/jihyun/research/education/data/2230/deidentified'
data = {}
dt_format = "%Y-%m-%dT%H:%M:%SZ"

# Build data = {student id: student_data}, reading one csv file per loop iteration.
for filename in os.listdir(deidentified_data_dir):
    # os.listdir returns every entry of the folder; skip anything that is not a csv file.
    if not filename.lower().endswith('.csv'):
        continue
    csv_file = os.path.join(deidentified_data_dir, filename)

    # Reset the id for every file, so that a file without data rows can never
    # reuse the id that was read from the previous file.
    rid = None
    url_list = []
    category_list = []
    time_list = []
    ip_list = []

    with open(csv_file, 'r') as f:
        reader = csv.reader(f, delimiter=',')
        header = next(reader, None)  # column names; None if the file is completely empty
        for line in reader:
            rid = int(line[0])
            url = line[1]
            timestamp = line[3]
            ip = line[5]
            cat = get_cats_from_url(url)
            dt = datetime.strptime(timestamp, dt_format)

            url_list.append(url)
            time_list.append(dt)
            ip_list.append(ip)
            category_list.append(cat)

    if rid is None:
        # No click rows were read, so there is no student id to store the data under.
        continue

    # NOTE(review): assumes every row of one file carries the same student id,
    # so the id of the last row is used as the key -- confirm against the data.
    student_data = {"url": url_list, "created_at":time_list, "ip":ip_list, "category":category_list}
    data[rid] = student_data

In [None]:
# The keys of data are the student ids that were read from the csv files.
data.keys()

In [None]:
# Look up the data of one student by id (667648 is presumably one of the keys listed above).
data[667648]



# Part 2 : Convert the data into a matrix
-------------------------------

We want to convert the whole data into a matrix where the rows are the students and the columns are the days,<br>
and each value in the matrix is the **number of click events per day for each student**.

## `numpy` array

In [None]:
import numpy as np

In [None]:
# A list of lists: 2 inner lists (rows) with 3 numbers each.
list_2d = [[1,2,3], [4,5,6]]
list_2d

In [None]:
# np.array converts the nested list into a 2-dimensional numpy array (2 rows, 3 columns).
nparray_2d = np.array(list_2d)
nparray_2d

#### `np.array` is useful when initialization is needed
- `np.zeros()` : Initializing numpy arrays to zeros
- `np.ones()` : Initializing numpy arrays to ones

In [None]:
# The shape is passed as a tuple: 2 rows and 3 columns, all filled with zeros.
np.zeros((2,3))

In [None]:
# For a single student
student_data = data[667648]

max_days = 50  # number of days to count, starting at first_day
first_day = datetime(2016, 6, 13, 0, 0, 0)  # day 0
nclicks_per_day = np.zeros(max_days, dtype=np.int32)  # one counter per day

# number of all clicks
for time in student_data['created_at']:
    delta = time - first_day
    # delta.days is the day index of the click. It is negative for a click made
    # before first_day; a negative index would count from the END of the array
    # and put the click on the wrong day, so such clicks are skipped as well.
    if 0 <= delta.days < max_days:
        nclicks_per_day[delta.days] += 1

In [None]:
# Make it as a function

def get_nclicks_per_day(student_data, max_days=50, first_day=datetime(2016,6,13,0,0,0)):
    """Count the click events of one student for each day.

    Parameters
    ----------
    student_data : dict
        Data of one student; ``student_data['created_at']`` must be an
        iterable of datetime objects, one per click.
    max_days : int
        Number of days to count, starting at `first_day`.
    first_day : datetime
        Start of day 0.

    Returns
    -------
    numpy.ndarray
        int32 array of length `max_days`; item ``d`` is the number of clicks
        made ``d`` whole days after `first_day`. Clicks before `first_day` or
        on day `max_days` and later are not counted.
    """
    nclicks_per_day = np.zeros(max_days, dtype=np.int32)

    # number of all clicks
    for timestamp in student_data['created_at']:
        day = (timestamp - first_day).days
        # The lower bound matters: `day` is negative for clicks made before
        # first_day, and a negative index would count from the END of the
        # array, i.e. the click would be added to the wrong day.
        if 0 <= day < max_days:
            nclicks_per_day[day] += 1

    return nclicks_per_day

In [None]:
# Use the above function and generate a matrix with size (n_students, max_days) for all student data.

n_students = len(data)
max_days = 50

nclicks_per_day_mat = np.zeros((n_students, max_days), dtype=np.int32)
# Row idx of the matrix belongs to the idx-th student id when iterating over data.
for idx, rid in enumerate(data.keys()):
    # Pass max_days on, so that the length of the returned row always matches
    # the number of columns of the matrix, even if max_days is changed above.
    nclicks_per_day_mat[idx] = get_nclicks_per_day(data[rid], max_days=max_days)

In [None]:
# Display the matrix: one row per student, one column per day.
nclicks_per_day_mat

# Part 3 : Save matrix into a csv file

In [None]:
#import csv
outfile = './nclicks_per_day_mat.csv'
with open(outfile, 'w') as f:
    writer = csv.writer(f, delimiter=',')
    # Each row of the matrix becomes one comma-separated line of the file.
    for row in nclicks_per_day_mat:
        writer.writerow(row)

In [None]:
# if you want to add a column header ['day0', 'day1', 'day2', ...]
outfile = './nclicks_per_day_mat_with_header.csv'
header = ['day%d' % i for i in range(max_days)]
with open(outfile, 'w') as f:
    writer = csv.writer(f, delimiter=',')
    # The header line goes first, then one line per matrix row.
    writer.writerow(header)
    for row in nclicks_per_day_mat:
        writer.writerow(row)

# Part 4 : Plotting with `matplotlib.pyplot`

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

### Barplot using `plt.bar()`
documentation : http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.bar <br>
Plot the average number of clicks per day by student using the above `nclicks_per_day_mat` matrix

In [None]:
# axis=0 averages down the rows, giving one mean value per column of the matrix.
avg_nclicks = np.mean(nclicks_per_day_mat, axis=0)
fig = plt.figure(figsize=(6,4))
# One bar per day: x positions 0 .. max_days-1, bar heights are the averages.
plt.bar(range(max_days), avg_nclicks, linewidth=0, figure=fig)
plt.xlabel('DAYS')
plt.ylabel('NUMBER OF CLICKS PER DAY BY STUDENT')
plt.grid(alpha=0.2)  # faint grid lines

# Save figure
barplot_fname = './barplot.pdf'
plt.savefig(barplot_fname)

### Plot heatmap using `plt.imshow()`
Documentation : http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.imshow <br>
Plot `nclicks_per_day_mat` matrix

In [None]:
fig = plt.figure(figsize=(6,6))
# Draw the matrix as an image: every matrix entry becomes one cell whose shade encodes its value.
# interpolation='nearest' keeps the cells sharp (no smoothing between neighbours);
# aspect='auto' lets the image fill the axes instead of forcing square cells.
plt.imshow(nclicks_per_day_mat, interpolation='nearest', aspect='auto', cmap='Greys', figure=fig)
plt.xlabel('DAYS', fontsize=13)
plt.ylabel('STUDENTS', fontsize=13)

## To remove x and y ticks
# plt.xticks([])
# plt.yticks([])

# Part 5 : Different Click Matrices

Taking a subset of the data by data type (e.g. by category, by preview activity, ...).

### Number of *category* clicks for each student 

In [None]:
# For each student, return a row
def get_nclicks_per_day_cat(student_data, category='homepage', max_days=50, first_day=datetime(2016,6,13,0,0,0)):
    """Count the click events of one student for each day, for one category only.

    Parameters
    ----------
    student_data : dict
        Data of one student. ``student_data['url']`` and
        ``student_data['created_at']`` are parallel sequences: item i of each
        describes the same click (URL string and datetime object).
    category : str
        Only clicks for which ``get_cats_from_url(url)`` equals this value
        are counted.
    max_days : int
        Number of days to count, starting at `first_day`.
    first_day : datetime
        Start of day 0.

    Returns
    -------
    numpy.ndarray
        int32 array of length `max_days`; item ``d`` is the number of matching
        clicks made ``d`` whole days after `first_day`. Clicks before
        `first_day` or on day `max_days` and later are not counted.
    """
    nclicks_per_day_1 = np.zeros(max_days, dtype=np.int32)

    for url, timestamp in zip(student_data['url'], student_data['created_at']):
        if get_cats_from_url(url) != category:
            continue
        day = (timestamp - first_day).days
        # The lower bound matters: `day` is negative for clicks made before
        # first_day, and a negative index would count from the END of the
        # array, i.e. the click would be added to the wrong day.
        if 0 <= day < max_days:
            nclicks_per_day_1[day] += 1
    return nclicks_per_day_1

### Get a matrix of number of *homepage* clicks

In [None]:
# Try the function on a single student: number of homepage clicks for each day.
get_nclicks_per_day_cat(student_data, category='homepage')

In [None]:
n_students = len(data)
max_days = 50

# Same layout as the matrix of all clicks (one row per student, one column per day),
# but only homepage clicks are counted.
nclicks_per_day_mat_hp = np.zeros((n_students, max_days), dtype=np.int32)
# Row idx of the matrix belongs to the idx-th student id when iterating over data.
for idx, rid in enumerate(data.keys()):
    # Pass max_days on, so that the length of the returned row always matches
    # the number of columns of the matrix, even if max_days is changed above.
    nclicks_per_day_mat_hp[idx] = get_nclicks_per_day_cat(data[rid], category='homepage', max_days=max_days)

### Exercise : Save the matrix

### Exercise : Plot the matrix using `plt.imshow()`