# Preprocessing

# Importing Spark

In [1]:
from pyspark import SparkContext

import pprint
# Pretty-printer used further down to display nested dictionaries readably
pp = pprint.PrettyPrinter(indent=4)

# Defining the Spark context (master "local[*]", application name "ADA")
sc = SparkContext("local[*]", "ADA")
# Display the default parallelism of the current context
# (last expression of the cell, so the notebook renders its value)
sc.defaultParallelism

8

In [2]:
import utils

# Configuration

In [3]:
# Course session identifier; it is used to build the names of the CSV files read below
config = {
    "session":"progfun-002"
}

# Hand-picked students (SessionUserID hash -> free-text description) whose
# extracted patterns are printed later in the notebook as a sanity check
TEST_STUDENTS={
    '6ea6949ca133acede360d3573f9d1168b3d70b51':'very good student',
    '3b305429f93de02637949578a5f9e23f13eb0726':'did two problems',
    '865981d5b40a693bafbadae4b1df769be03a25c3':'watched 8 videos',
    '67cdae6073d1089b695e2a615a01187586ad7ba6':'normal student',
    'fb4c81b3df430d1f0fbb8d0ca3e470ac6bf92a2f':'the very best student',
    '295b3496278626b6d337812b1882b756336fd633':'whatdup with this guy?'
}

# Importing the data

### Importing Event Data

In [4]:
# Reading csv files: create one RDD per event table, with one string entry per line in the file.
# The header line of each file is still part of the RDD at this point.
rdd_problem_events = sc.textFile("data/"+config['session']+"_Problem_Events_with_Info.csv")
rdd_video_events = sc.textFile("data/"+config['session']+"_Video_Events.csv")
rdd_forum_events = sc.textFile("data/"+config['session']+"_Forum_Events.csv")

In [5]:
def printHeader(rdd):
    """Print the column names of a CSV-backed RDD together with their positions.

    The first element of `rdd` is taken as the header line, split on commas,
    and printed as a list of (index, column name) pairs followed by a blank line.
    """
    header_line = rdd.first()
    indexed_columns = [(position, name) for position, name in enumerate(header_line.split(','))]
    print(indexed_columns, '\n')

# Show the (index, column name) layout of each event table, one banner per table
for banner, events_rdd in [
    ("--- PROBLEM_EVENTS ---", rdd_problem_events),
    ("--- VIDEO_EVENTS ---", rdd_video_events),
    ("--- FORUM_EVENTS ---", rdd_forum_events),
]:
    print(banner)
    printHeader(events_rdd)

--- PROBLEM_EVENTS ---
[(0, 'EventID'), (1, 'ForumUserID'), (2, 'MaximumSubmissions'), (3, 'AccountUserID'), (4, 'SubmissionNumber'), (5, 'Grade'), (6, 'TimeStamp'), (7, 'DataPackageID'), (8, 'ProblemID'), (9, 'SoftCloseTime'), (10, 'ProblemType'), (11, 'HardCloseTime'), (12, 'Platform'), (13, 'OpenTime'), (14, 'EventType'), (15, 'Title'), (16, 'SessionUserID'), (17, 'UniqueProblemID'), (18, 'UniqueUserID')] 

--- VIDEO_EVENTS ---
[(0, 'EventID'), (1, 'ForumUserID'), (2, 'OldTime'), (3, 'AccountUserID'), (4, 'CurrentTime'), (5, 'SeekType'), (6, 'TimeStamp'), (7, 'DataPackageID'), (8, 'UniqueRowID'), (9, 'TableName'), (10, 'VideoID'), (11, 'Platform'), (12, 'NewSpeed'), (13, 'EventSource'), (14, 'EventType'), (15, 'SessionUserID'), (16, 'NewTime'), (17, 'OldSpeed')] 

--- FORUM_EVENTS ---
[(0, 'EventID'), (1, 'ForumUserID'), (2, 'PostID'), (3, 'AccountUserID'), (4, 'TimeStamp'), (5, 'DataPackageID'), (6, 'UniqueRowID'), (7, 'TableName'), (8, 'Platform'), (9, 'EventSource'), (10, 'PostTy

In [6]:
def removeHeader(row):
    """Filter predicate: True for data rows, False for a row starting with 'EventID'.

    Despite its name it removes nothing itself; it is meant to be passed to
    `filter`, which then drops the header row.
    """
    is_header = row.startswith('EventID')
    return not is_header

# Drop the header row of each event table (rows starting with 'EventID' are filtered out)
rdd_problem_events = rdd_problem_events.filter(removeHeader)
rdd_video_events = rdd_video_events.filter(removeHeader)
rdd_forum_events = rdd_forum_events.filter(removeHeader)

In [7]:
# INPUT: Comma separated string
def parse_problems(x):
    """Turn one comma separated problem-event row into an event dict.

    Columns are read by position (see the PROBLEM_EVENTS header printed above):
    5 Grade, 6 TimeStamp, 8 ProblemID, 10 ProblemType, 14 EventType,
    15 Title, 16 SessionUserID. Column 14 has the form "<type>.<subtype>" and
    is split into 'EventType' and 'EventSubType'.

    Raises ValueError if Grade/TimeStamp/ProblemID are not numeric and
    IndexError if the row is too short or column 14 contains no '.'.
    """
    fields = x.split(',')
    grade = float(fields[5])
    timestamp = int(fields[6])
    problem_id = int(fields[8])
    problem_type = fields[10]
    # "<type>.<subtype>" -> ["<type>", "<subtype>", ...]
    type_parts = fields[14].split('.')
    event = {}
    event['Grade'] = grade
    event['TimeStamp'] = timestamp
    event['ProblemID'] = problem_id
    event['ProblemType'] = problem_type
    event['EventType'] = type_parts[0]
    event['EventSubType'] = type_parts[1]
    event['Title'] = fields[15]
    event['SessionUserID'] = fields[16]
    return event

# INPUT: Comma separated string
def parse_videos(x):
    """Turn one comma separated video-event row into an event dict.

    Columns are read by position (see the VIDEO_EVENTS header printed above):
    6 TimeStamp, 10 VideoID, 14 EventType, 15 SessionUserID. Column 14 has the
    form "<type>.<subtype>" and is split into 'EventType' and 'EventSubType'.

    Raises ValueError if TimeStamp/VideoID are not integers and IndexError if
    the row is too short or column 14 contains no '.'.
    """
    fields = x.split(',')
    timestamp = int(fields[6])
    video_id = int(fields[10])
    # "<type>.<subtype>" -> ["<type>", "<subtype>", ...]
    type_parts = fields[14].split('.')
    event = {}
    event['TimeStamp'] = timestamp
    event['VideoID'] = video_id
    event['EventType'] = type_parts[0]
    event['EventSubType'] = type_parts[1]
    event['SessionUserID'] = fields[15]
    return event

# INPUT: Comma separated string
def parse_forums(x):
    """Turn one comma separated forum-event row into an event dict.

    Columns are read by position: 3 AccountUserID, 4 TimeStamp and 11, which
    is treated as "<type>.<subtype>" and split into 'EventType' and
    'EventSubType'. Forum rows carry an AccountUserID rather than a
    SessionUserID; the session id is attached in a later cell.

    Raises ValueError if TimeStamp is not an integer and IndexError if the row
    is too short or column 11 contains no '.'.
    """
    fields = x.split(',')
    account_user_id = fields[3]
    timestamp = int(fields[4])
    # "<type>.<subtype>" -> ["<type>", "<subtype>", ...]
    type_parts = fields[11].split('.')
    event = {}
    event['AccountUserID'] = account_user_id
    event['TimeStamp'] = timestamp
    event['EventType'] = type_parts[0]
    event['EventSubType'] = type_parts[1]
    return event

# INPUT: Comma separated string
def filter_problems(x):
    """Filter predicate for raw problem-event rows (comma separated strings).

    A row is kept only when all three conditions hold:
    * column 10 (ProblemType) is 'Assignment',
    * column 8 (ProblemID) is not one of '1'..'5',
    * column 5 (Grade) is not the string 'None'.
    """
    fields = x.split(',')
    # Some problem events are quizzes inside videos; these are not considered
    if fields[10] != 'Assignment':
        return False
    # IDs of assignments used by the teaching staff for testing the platform
    if fields[8] in ('1', '2', '3', '4', '5'):
        return False
    # Discard assignments with no grade
    return fields[5] != 'None'

# INPUT: Comma separated string
def filter_videos(x):
    """Filter predicate for raw video-event rows (comma separated strings).

    A row is kept unless its column 10 (VideoID) is one of the excluded ids.
    """
    # Videos that do not belong to the MOOC lectures or are just setup videos
    excluded_video_ids = ('9', '12', '11', '10', '13', '2', '29', '25', '21', '27', '23')
    video_id = x.split(',')[10]
    # Disabled extra condition: keep only the "Play"/"Load" EventSubType for videos
    # (distinguishes opening the page from starting the video)
    # and (data[14].split('.')[1] in ['Play','Load'])
    return video_id not in excluded_video_ids

# INPUT: Comma separated string
def filter_forums(x):
    """Filter predicate for raw forum-event rows: every row is kept.

    The function exists so that the forum pipeline has the same
    filter-then-parse shape as the problem and video pipelines. The row is no
    longer split into columns, because the result was never used.
    """
    return True

# For each table: filter the raw CSV lines first, then parse the surviving lines into event dicts
rdd_problem_events_parsed = rdd_problem_events.filter(filter_problems).map(parse_problems)
rdd_video_events_parsed = rdd_video_events.filter(filter_videos).map(parse_videos)
rdd_forum_events_parsed = rdd_forum_events.filter(filter_forums).map(parse_forums)

In [8]:
# Handles the issue of the forum events table having 'AccountUserID' instead of 'SessionUserID',
# using the table progfun-002_User_Hash_Mapping (one string entry per line, header included)
rdd_user_mapping = sc.textFile("data/"+config['session']+"_User_Hash_Mapping.csv")

def f(x):
    """Attach the session user id to a joined forum event.

    INPUT: (AccountUserId (str), (SessionUserId (str), ForumEvent (dict)))
    OUTPUT: a new ForumEvent dict equal to the input event plus
            'SessionUserID' set to the joined session id.

    The event is copied instead of being modified in place: if the same event
    dict is paired with more than one mapping row, in-place assignment would
    make those results share one dict and overwrite each other's
    'SessionUserID'.
    """
    joined = x[1]
    session_user_id = joined[0]
    forum_event = joined[1]
    # Shallow copy with 'SessionUserID' added (or replaced if already present)
    return dict(forum_event, SessionUserID=session_user_id)

# Attach the SessionUserID to every forum event by joining with the user mapping on AccountUserID
rdd_forum_events_parsed = (rdd_user_mapping
    # Drop the header line of the mapping file
    .filter(lambda line: not line.startswith("ForumUserID"))
    # Comma separated string -> list of columns
    .map(lambda line: line.split(","))
    # list of columns -> (key=AccountUserId (str), value=SessionUserId (str))
    .map(lambda columns: (columns[1], columns[6]))
    # join with (key=AccountUserId (str), value=ForumEvent (dict))
    .join(rdd_forum_events_parsed.keyBy(lambda event: event['AccountUserID']))
    # (AccountUserId (str), (SessionUserId (str), ForumEvent (dict))) -> ForumEvent (dict)
    .map(f)
)

print(rdd_forum_events_parsed.count())
print(rdd_forum_events_parsed.take(1)[0])

297650
{'SessionUserID': '16e564ed1e9b7104acde8f075c26a1872695fb5e', 'AccountUserID': '2009208', 'EventType': 'Forum', 'TimeStamp': 1368130954, 'EventSubType': 'Thread'}


In [9]:
# Concatenates all three tables into one big table of event dicts
rdd_events = (rdd_problem_events_parsed
    .union(rdd_video_events_parsed)
    .union(rdd_forum_events_parsed)
)
# Persist the combined RDD: it is counted here and used again by later cells
rdd_events.persist()
print(rdd_events.count())
print(rdd_events.first())

3519061
{'SessionUserID': 'd8f79efa32a560b8a46ea2b12d9bed97c9e39b4b', 'Title': 'Functional Sets / Functional Sets', 'TimeStamp': 1366220878, 'EventSubType': 'Check', 'ProblemID': 6, 'ProblemType': 'Assignment', 'EventType': 'Problem', 'Grade': 9.32999992371}


# Meta data

In [10]:
# For each tracked assignment (key = ProblemID), the VideoIDs of the lecture videos attached to it.
# NOTE(review): the trailing "Lecture N" labels skip Lecture 5 — confirm this is intentional.
LECTURES_PER_PROBLEM = {
    7: [3,4,5,6,7,8,33], # Lecture 1
    6: [35,37,39,41,43,47,49], # Lecture 2
    12: [51,53,75], # Lecture 3
    14: [71,81,85,79,87,77], # Lecture 4
    17: [109,105,115,107,103,113,111], # Lecture 6
    20: [123,117,125,121,127,119] # Lecture 7
}
pp.pprint(LECTURES_PER_PROBLEM)

{   6: [35, 37, 39, 41, 43, 47, 49],
    7: [3, 4, 5, 6, 7, 8, 33],
    12: [51, 53, 75],
    14: [71, 81, 85, 79, 87, 77],
    17: [109, 105, 115, 107, 103, 113, 111],
    20: [123, 117, 125, 121, 127, 119]}


In [11]:
# Invert LECTURES_PER_PROBLEM: map each lecture VideoID to the ProblemID it is attached to
PROBLEM_PER_LECTURE = {
    lecture_id: problem_id
    for problem_id, lecture_ids in LECTURES_PER_PROBLEM.items()
    for lecture_id in lecture_ids
}
print(PROBLEM_PER_LECTURE)

{51: 12, 3: 7, 4: 7, 49: 6, 6: 7, 7: 7, 8: 7, 119: 20, 77: 14, 79: 14, 43: 6, 75: 12, 85: 14, 87: 14, 39: 6, 71: 14, 111: 17, 81: 14, 5: 7, 33: 7, 35: 6, 37: 6, 103: 17, 105: 17, 107: 17, 109: 17, 47: 6, 113: 17, 53: 12, 115: 17, 117: 20, 41: 6, 121: 20, 123: 20, 125: 20, 127: 20}


# Patterns from event sequences

In [12]:
# Collect every event under its student, then put each student's events in chronological order
rdd_events_by_students = (rdd_events
    # Event (dict) -> (SessionUserId (str), Event (dict))
    .keyBy(lambda event: event['SessionUserID'])
    # (SessionUserId (str), Event (dict)) -> (SessionUserId (str), [ Event (dict), ... ])
    .groupByKey()
    # Sort each student's events by TimeStamp; the key is left untouched
    .mapValues(lambda student_events: sorted(student_events, key=lambda event: event['TimeStamp']))
)

rdd_events_by_students.persist()
print("Number of students: %d" % rdd_events_by_students.count())

Number of students: 26989


In [13]:
# INPUT: (SessionStudentId (str), [ Event (dict), ... ])
def extractPatterns(x):
    """Split one student's event list into one event pattern per tracked problem.

    INPUT: (SessionStudentId (str), [ Event (dict), ... ]); events are processed
    in the order given.
    OUTPUT: a list with one dict per key of LECTURES_PER_PROBLEM (in that
    dict's order), each of the form
        {'studentId': ..., 'problemId': ..., 'pattern': [ Event (dict), ... ]}

    Events are attributed as follows:
    * 'Problem' events go to the pattern of their ProblemID, which becomes the
      current problem.
    * 'Video' events go to the pattern of the problem their VideoID is attached
      to (PROBLEM_PER_LECTURE), which becomes the current problem.
    * 'Forum' events go to the pattern of the current problem, if there is one.

    Problem and video events whose id is not tracked are ignored and leave the
    current problem unchanged. Previously an untracked ProblemID raised a
    KeyError.
    """
    studentId = x[0]
    events = x[1]

    # One (initially empty) pattern per tracked problem
    patterns = {problemID: [] for problemID in LECTURES_PER_PROBLEM.keys()}

    # Problem the student is currently working on; -1 until a tracked event is seen
    curProblem = -1
    for event in events:
        eventType = event['EventType']
        if eventType == 'Problem':
            # Only problems with a pattern bucket are tracked
            if event['ProblemID'] in patterns:
                curProblem = event['ProblemID']
                patterns[curProblem].append(event)
        elif eventType == 'Video':
            if event['VideoID'] in PROBLEM_PER_LECTURE:
                curProblem = PROBLEM_PER_LECTURE[event['VideoID']]
                patterns[curProblem].append(event)
        elif eventType == 'Forum':
            # Forum events carry no problem id; attach them to the current problem
            if curProblem > 0:
                patterns[curProblem].append(event)

    return [ {
        'studentId': studentId,
        'problemId': problemID,
        'pattern': patterns[problemID]
    } for problemID in LECTURES_PER_PROBLEM.keys()]


# Flatten to one record per (student, tracked problem): each student yields a list of pattern dicts
rdd_patterns = (rdd_events_by_students
    # INPUT: (SessionStudentID (str), [ Event (dict), ... ] )
    .flatMap(extractPatterns)
)

# Persist the patterns: they are reused by the display and export cells below
rdd_patterns.persist()

PythonRDD[34] at RDD at PythonRDD.scala:48

In [14]:
def displayStudentsPatterns(rdd, students):
    """Print the event patterns of selected students.

    rdd      -- RDD of pattern dicts with keys 'studentId', 'problemId', 'pattern'
    students -- dict mapping a student id to a free-text description

    For each student a header line "<id> <description>" is printed, followed by
    one line per pattern ("<problemId>\\t>>>  " and then each event rendered by
    utils.eventToString), and finally a blank line. One filter+collect is run
    on `rdd` per student.
    """
    for student_id, description in students.items():
        print(student_id, description)
        student_records = rdd.filter(lambda record: record['studentId'] == student_id).collect()
        for record in student_records:
            print(record['problemId'], end='\t>>>  ')
            for event in record['pattern']:
                print(utils.eventToString(event), end=" ")
            print()
        print()

# Sanity check: print the extracted patterns of the hand-picked test students
displayStudentsPatterns(rdd_patterns,TEST_STUDENTS)

865981d5b40a693bafbadae4b1df769be03a25c3 watched 8 videos
17	>>>  
20	>>>  
6	>>>  (V 35) (V 35) (V 35) (V 35) (V 35) (V 35) 
7	>>>  (V 8) (V 3) (V 4) (V 5) (V 6) (V 7) (V 7) (V 33) 
12	>>>  
14	>>>  

3b305429f93de02637949578a5f9e23f13eb0726 did two problems
17	>>>  
20	>>>  
6	>>>  (P 6) 
7	>>>  (P 7) 
12	>>>  
14	>>>  

295b3496278626b6d337812b1882b756336fd633 whatdup with this guy?
17	>>>  (V 103) (V 103) (V 103) (V 103) (V 103) (V 103) (V 103) (V 103) (V 105) (V 105) (V 105) (V 105) (V 105) (V 105) (V 107) (V 107) (V 107) (V 107) (V 107) (V 107) (V 107) (V 107) (V 109) (V 109) (V 109) (V 109) (V 109) (V 109) (V 109) (V 109) (V 109) (V 109) (V 109) (V 111) (V 111) (V 111) (V 111) (V 111) (V 111) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 113) (V 

In [15]:
import json
import os
from os import walk
import shutil

# Output directory for the preprocessed patterns
directory='data/spark/preprocessed'
# Remove any previous export before writing
# NOTE(review): presumably required because saveAsTextFile does not overwrite an existing directory — confirm
if os.path.exists(directory):
    shutil.rmtree(directory)

# Serialise each pattern record with json.dumps and write the resulting strings as text
rdd_patterns.map(json.dumps).saveAsTextFile(directory)