# ADA - Project
## With Spark

## Getting started

In [1]:
import csv

from pyspark import SparkConf, SparkContext

In [2]:
# Reuse the running context when this cell is executed again: calling the
# SparkContext constructor a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
# Note: if a context already exists, it is returned as-is and this conf is ignored.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("ADA"))

In [3]:
# Smoke test: distribute the integers 0..99 as an RDD and add them up with Spark.
data = sc.parallelize(range(0, 100))
data.sum()

4950

If you received 4950 as a result, your Spark installation is working well :) Good job!

## Configuration

In [4]:
# Notebook-wide settings; 'session' is the prefix of every data file name read below.
config = dict(session="progfun-002")

## Importing and parsing the data

In [5]:
# Load the three raw event exports; each RDD holds one string per line of its CSV file
rdd_problem_events = sc.textFile("../data/"+config['session']+"_Problem_Events_with_Info.csv")
rdd_video_events = sc.textFile("../data/"+config['session']+"_Video_Events.csv")
rdd_forum_events = sc.textFile("../data/"+config['session']+"_Forum_Events.csv")

# For each table, print the first line (the header) as "index name" pairs
for banner, raw_rdd in (
    ("\n\n--- PROBLEM_EVENTS ---", rdd_problem_events),
    ("\n\n--- VIDEO_EVENTS ---", rdd_video_events),
    ("\n\n--- FORUM_EVENTS ---", rdd_forum_events),
):
    print(banner)
    header = raw_rdd.take(1)[0]
    for idx,field in enumerate(header.split(",")):
        print(idx,field, end=" / ")



--- PROBLEM_EVENTS ---
0 EventID / 1 ForumUserID / 2 MaximumSubmissions / 3 AccountUserID / 4 SubmissionNumber / 5 Grade / 6 TimeStamp / 7 DataPackageID / 8 ProblemID / 9 SoftCloseTime / 10 ProblemType / 11 HardCloseTime / 12 Platform / 13 OpenTime / 14 EventType / 15 Title / 16 SessionUserID / 17 UniqueProblemID / 18 UniqueUserID / 

--- VIDEO_EVENTS ---
0 EventID / 1 ForumUserID / 2 OldTime / 3 AccountUserID / 4 CurrentTime / 5 SeekType / 6 TimeStamp / 7 DataPackageID / 8 UniqueRowID / 9 TableName / 10 VideoID / 11 Platform / 12 NewSpeed / 13 EventSource / 14 EventType / 15 SessionUserID / 16 NewTime / 17 OldSpeed / 

--- FORUM_EVENTS ---
0 EventID / 1 ForumUserID / 2 PostID / 3 AccountUserID / 4 TimeStamp / 5 DataPackageID / 6 UniqueRowID / 7 TableName / 8 Platform / 9 EventSource / 10 PostType / 11 EventType / 12 JoinID / 13 SessionUserID / 

In [6]:
# Drop every line that begins with 'EventID', i.e. the header row of each table
rdd_problem_events = rdd_problem_events.filter(lambda line: not line.startswith('EventID'))
rdd_video_events = rdd_video_events.filter(lambda line: not line.startswith('EventID'))
rdd_forum_events = rdd_forum_events.filter(lambda line: not line.startswith('EventID'))

# Show the first remaining record of each table
for table_rdd in (rdd_problem_events, rdd_video_events, rdd_forum_events):
    print(table_rdd.take(1)[0],"\n")

3bcd1a54ed6ddb04b4a6fb2906110a01,None,0,None,7,None,1365344171,progfun-002,7,2147483640,Video,2147483640,Coursera,32400,Problem.Check,Lecture 1.2 - Elements of Programming (14:25),c4d4e5fcd2feba9f3234ee8d852dc7b22fbc07e4,f322944718b2ee0e53292118111533c7,21f13b3f6b50a83343b57d2f1d07dbdf 

db75adce6b87e7ab79242ea0af4b82d4,None,154.696,None,154.697,None,1372391638,progfun-002,00000078c0f0685cc50a25a8d5734a88,Video_Events,33,coursera,1.0,None,Video.Play,ef64fb7b096008f7eaf8441684afdf99af9af54a,None,1.0 

f3fdb52859b2511308aee554a573194e,None,17,4108315,1376254235,progfun-002,000006c12322ca29c7013dac42ef1a6a,Forum_Events,coursera,None,Thread,Forum.Thread.View,03b1fa287de5ef57d9c8482195b5167f,None 



In [7]:
# Function to parse the string entries from previous dataset
def parse_problems(x):
    """Parse one line of the problem events CSV into a dict of the fields used later.

    The line is split with the ``csv`` module instead of ``str.split(',')`` so
    that a quoted field containing a comma does not shift the columns after it
    (the free-text ``Title`` column sits right before ``SessionUserID``).
    Lines without quotes are split exactly as before.

    Parameters
    ----------
    x : str
        One data line (not the header) of the problem events file.

    Returns
    -------
    dict
        Keys ``Grade``, ``TimeStamp``, ``ProblemID``, ``ProblemType``,
        ``EventType``, ``EventSubType`` and ``SessionUserID``; every value is
        a string.

    Raises
    ------
    IndexError
        If the line has fewer than 17 columns, or if the event type column
        contains no '.'.
    """
    data = next(csv.reader([x]))
    # Column 14 looks like 'Problem.Check': first part is the type, second the sub-type
    event_parts = data[14].split('.')
    return {
        'Grade':data[5],
        'TimeStamp':data[6],
        'ProblemID':data[8],
        'ProblemType':data[10],
        'EventType':event_parts[0],
        'EventSubType':event_parts[1],
        'SessionUserID':data[16]
    }

def parse_videos(x):
    """Turn one comma-separated line of the video events file into a dict.

    Keeps the timestamp (column 6), the video ID (column 10), the two first
    dot-separated parts of the event type (column 14) and the session user ID
    (column 15). All values stay strings.
    """
    columns = x.split(',')
    # e.g. 'Video.Play' -> ['Video', 'Play']
    type_parts = columns[14].split('.')
    parsed = {}
    parsed['TimeStamp'] = columns[6]
    parsed['VideoID'] = columns[10]
    parsed['EventType'] = type_parts[0]
    parsed['EventSubType'] = type_parts[1]
    parsed['SessionUserID'] = columns[15]
    return parsed

def parse_forums(x):
    """Turn one comma-separated line of the forum events file into a dict.

    Keeps the account user ID (column 3), the timestamp (column 4) and the two
    first dot-separated parts of the event type (column 11). All values stay
    strings.
    """
    columns = x.split(',')
    # e.g. 'Forum.Thread.View' -> type 'Forum', sub-type 'Thread' (any further part is dropped)
    type_parts = columns[11].split('.')
    parsed = {}
    parsed['AccountUserID'] = columns[3]
    parsed['TimeStamp'] = columns[4]
    parsed['EventType'] = type_parts[0]
    parsed['EventSubType'] = type_parts[1]
    return parsed

# Turn every raw line into a dict of named fields
rdd_problem_events_parsed = rdd_problem_events.map(parse_problems)
rdd_video_events_parsed = rdd_video_events.map(parse_videos)
rdd_forum_events_parsed = rdd_forum_events.map(parse_forums)

# For each table: number of parsed records, then the first parsed record
for parsed_rdd in (rdd_problem_events_parsed, rdd_video_events_parsed, rdd_forum_events_parsed):
    print(parsed_rdd.count())
    print(parsed_rdd.take(1)[0],"\n")


458888
{'EventType': 'Problem', 'EventSubType': 'Check', 'Grade': 'None', 'ProblemID': '7', 'ProblemType': 'Video', 'SessionUserID': 'c4d4e5fcd2feba9f3234ee8d852dc7b22fbc07e4', 'TimeStamp': '1365344171'} 

3471666
{'EventType': 'Video', 'EventSubType': 'Play', 'SessionUserID': 'ef64fb7b096008f7eaf8441684afdf99af9af54a', 'TimeStamp': '1372391638', 'VideoID': '33'} 

297650
{'EventType': 'Forum', 'EventSubType': 'Thread', 'AccountUserID': '4108315', 'TimeStamp': '1376254235'} 



In [8]:
# The parsed forum events carry an 'AccountUserID' but no 'SessionUserID'.
# The <session>_User_Hash_Mapping table is loaded to translate one into the other.
rdd_user_mapping = sc.textFile("../data/"+config['session']+"_User_Hash_Mapping.csv")
# Show the header line of the mapping table
print(rdd_user_mapping.take(1)[0],"\n")

def f(x):
    """Attach the joined session user ID to a forum event.

    ``x`` is a ``(key, (session_user_id, event))`` pair as produced by the
    join below. The event dict is updated in place with a 'SessionUserID'
    entry and returned.
    """
    joined = x[1]
    event = joined[1]
    event['SessionUserID'] = joined[0]
    return event

rdd_forum_events_parsed = (rdd_user_mapping
    # drop the header line of the mapping table
    .filter(lambda line: not line.startswith("ForumUserID"))
    # split each mapping line once, then keep (Key=AccountUserID, Value=SessionUserID)
    .map(lambda line: line.split(","))
    .map(lambda cols: (cols[1],cols[6]))
    # join with the forum events keyed by AccountUserID, giving
    # (Key=AccountUserID, Value=(SessionUserID, EventObject))
    .join(rdd_forum_events_parsed.keyBy(lambda event: event['AccountUserID']))
    # f copies the joined SessionUserID into the EventObject and returns it
    .map(f)
)

print(rdd_forum_events_parsed.count())
print(rdd_forum_events_parsed.take(1)[0])

ForumUserID,AccountUserID,DataPackageID,UniqueRowID,TableName,Platform,SessionUserID 

297650
{'EventType': 'Forum', 'EventSubType': 'ThreadSubscribe', 'SessionUserID': 'a97848d806f0d88cf80dd154845693af40cf559f', 'AccountUserID': '1932792', 'TimeStamp': '1364416322'}


In [9]:
# Keep only the problem events whose ProblemType is 'Assignment'
rdd_problem_events_parsed = rdd_problem_events_parsed.filter(
    lambda event: event['ProblemType']=='Assignment'
)
print(rdd_problem_events_parsed.count())
print(rdd_problem_events_parsed.take(1)[0])

105270
{'EventType': 'Problem', 'EventSubType': 'Check', 'Grade': '9.32999992371', 'ProblemID': '6', 'ProblemType': 'Assignment', 'SessionUserID': 'd8f79efa32a560b8a46ea2b12d9bed97c9e39b4b', 'TimeStamp': '1366220878'}


In [10]:
# Concatenates the three parsed tables (problems, then videos, then forums) into one RDD
rdd_events = rdd_problem_events_parsed.union(
    rdd_video_events_parsed
).union(
    rdd_forum_events_parsed
)
print(rdd_events.count())
print(rdd_events.take(1)[0])

3874586
{'EventType': 'Problem', 'EventSubType': 'Check', 'TimeStamp': '1366220878', 'ProblemID': '6', 'ProblemType': 'Assignment', 'SessionUserID': 'd8f79efa32a560b8a46ea2b12d9bed97c9e39b4b', 'Grade': '9.32999992371'}


## Working the data

### Grouping & Sorting

In [11]:
# Key every event by its 'SessionUserID', then gather all events of the same student
rdd_events_by_students = (rdd_events
    .keyBy(lambda event: event['SessionUserID'])
    .groupByKey()
)
print(rdd_events_by_students.count()) # one entry per student, so this is the number of students
print(rdd_events_by_students.take(1)[0])

31269
('75046568bfb19ce33846e4b7c384db7a8809d578', <pyspark.resultiterable.ResultIterable object at 0x1097ccda0>)


In [12]:
# Function to have a friendly way to print the events
def eventToString(event):
    """Return a short printable tag for an event.

    Problem events become "(P <ProblemID> <Grade>)", video events "(V)" and
    forum events "(F)". Any other 'EventType' raises KeyError.
    """
    kind = event['EventType']
    if kind == "Problem":
        return "(P "+event['ProblemID']+" "+event['Grade']+")"
    fixed_tags = {"Video": "(V)", "Forum": "(F)"}
    return fixed_tags[kind]

In [13]:
# Print the (still unsorted) event tags of the first three students, one line per student
for studentID,events in rdd_events_by_students.take(3):
    print("\n",studentID,"\n")
    print("".join(eventToString(event)+" " for event in events))


 75046568bfb19ce33846e4b7c384db7a8809d578 

(V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) 

 3a3aea293c5a658c4d2cc8c0aaf07694b985bbec 

(V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) 

 03ca065880520199acd8dbd9c743212cf0ba8861 

(V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) 


In [14]:
# Order each student's event list by 'TimeStamp'; the student ID key is left untouched.
# NOTE(review): TimeStamp values are strings here, so the ordering is lexicographic;
# it matches chronological order only if all timestamps have the same number of digits - TODO confirm.
rdd_events_by_students_sorted = rdd_events_by_students.mapValues(
    lambda student_events: sorted(student_events, key=lambda event: event['TimeStamp'])
)
print(rdd_events_by_students_sorted.count())
# Show the first student as (student ID, list of event types)
print((lambda pair: (pair[0],[event['EventType'] for event in pair[1]]))(rdd_events_by_students_sorted.take(1)[0]))

31269
('75046568bfb19ce33846e4b7c384db7a8809d578', ['Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video', 'Video'])


In [15]:
# Print the event tags of one chosen student, now ordered by TimeStamp
for studentID,events in rdd_events_by_students_sorted.filter(lambda student: student[0]=='6ea6949ca133acede360d3573f9d1168b3d70b51').take(1):
    print("\n",studentID,"\n")
    print("".join(eventToString(event)+" " for event in events))


 6ea6949ca133acede360d3573f9d1168b3d70b51 

(V) (V) (V) (V) (V) (V) (V) (P 5 7.32999992371) (V) (P 5 10.0) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (P 7 9.92000007629) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (P 6 9.67000007629) (P 6 9.32999992371) (P 6 10.0) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (P 12 9.22999954224) (V) (V) (V) (V) (V) (V) (V) (P 14 4.5) (P 14 5.09000015259) (V) (V) (V) (V) (V) (V) (V) (V) (P 14 6.88000011444) (P 14 8.06999969482) (P 14 9.55000019073) (P 14 9.55000019073) (P 14 9.55000019073) (P 14 10.0) (P 14 10.0) (V) (V) (V) (V) (V) (V) (V) (F) (F) (F) (F) (F) (F) (F) (F) (P 17 5.57000017166) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (V) (F) (F) (P 20 5.82999992371) (F) (F) (V) (V) (

### Extracting learning patterns
The goal is to extract what a student does between a failed attempt at an assignment and a successful attempt.

In [16]:
# The goal is to concatenate small sub events into one big event
# For example Play/Pause/Play/Pause/Play/Pause on a video counts as only a Video event
def eventConcat(events):
    """Collapse runs of consecutive same-type events into a single event.

    For every run of consecutive events sharing the same 'EventType', only the
    first event of the run is kept. Problem events are the exception: each one
    is kept, even when it directly follows another Problem event.

    Parameters
    ----------
    events : list of dict
        Events of one student, each with an 'EventType' key. May be empty.

    Returns
    -------
    list of dict
        A new list with the collapsed events; an empty list for empty input
        (previously this raised IndexError).
    """
    if not events:
        return []
    res = [events[0]]
    for event in events[1:]:
        # Problem events are deliberately not merged here: the first and last
        # problems of a pattern must survive. Merging the ones in between is
        # left as a to-do for after pattern extraction.
        if event['EventType'] == 'Problem' or event['EventType'] != res[-1]['EventType']:
            res.append(event)
    return res

In [17]:
# Collapse consecutive same-type events in every student's sorted event list
rdd_events_by_students_concat = rdd_events_by_students_sorted.mapValues(eventConcat)

# Print the collapsed event tags of one chosen student
for studentID,events in rdd_events_by_students_concat.filter(lambda student: student[0]=='6ea6949ca133acede360d3573f9d1168b3d70b51').take(1):
    print("\n",studentID,"\n")
    print("".join(eventToString(event)+" " for event in events))


 6ea6949ca133acede360d3573f9d1168b3d70b51 

(V) (P 5 7.32999992371) (V) (P 5 10.0) (V) (P 7 9.92000007629) (V) (P 6 9.67000007629) (P 6 9.32999992371) (P 6 10.0) (V) (P 12 9.22999954224) (V) (P 14 4.5) (P 14 5.09000015259) (V) (P 14 6.88000011444) (P 14 8.06999969482) (P 14 9.55000019073) (P 14 9.55000019073) (P 14 9.55000019073) (P 14 10.0) (P 14 10.0) (V) (F) (P 17 5.57000017166) (V) (F) (P 20 5.82999992371) (F) (V) (P 20 10.0) (F) 


In [18]:
# The goal is to extract the list of event in between the first and last attempts at each problem
def extractPatterns(events):
    """Group a student's events into one event list per problem.

    A Problem event whose 'ProblemID' differs from the currently open problem
    opens (or re-opens) the list of that problem. While a problem is open,
    every event is appended to its list. A Problem event graded '10.0' closes
    the open problem, after being appended itself.

    Returns a dict mapping 'ProblemID' to the list of events collected for it.
    """
    patterns = {}
    active = None
    for event in events:
        is_problem = event['EventType']=='Problem'
        if is_problem and event['ProblemID']!=active:
            # switch to this problem, keeping events already collected for it
            active = event['ProblemID']
            patterns.setdefault(active, [])
        if active is not None:
            patterns[active].append(event)
        if is_problem and event['Grade']=='10.0':
            # perfect grade: stop collecting until the next Problem event
            active = None
    return patterns

In [19]:
# Split every student's collapsed event list into per-problem patterns
rdd_events_by_students_extract = rdd_events_by_students_concat.mapValues(extractPatterns)

# Print the extracted patterns of one chosen student, one pattern per line
for studentID,patterns in rdd_events_by_students_extract.filter(lambda student: student[0]=='6ea6949ca133acede360d3573f9d1168b3d70b51').take(1):
    print("\n",studentID,"\n")
    for pattern in patterns.values():
        print("".join(eventToString(event)+" " for event in pattern))
    print()


 6ea6949ca133acede360d3573f9d1168b3d70b51 

(P 14 4.5) (P 14 5.09000015259) (V) (P 14 6.88000011444) (P 14 8.06999969482) (P 14 9.55000019073) (P 14 9.55000019073) (P 14 9.55000019073) (P 14 10.0) (P 14 10.0) 
(P 7 9.92000007629) (V) 
(P 17 5.57000017166) (V) (F) 
(P 5 7.32999992371) (V) (P 5 10.0) 
(P 12 9.22999954224) (V) 
(P 20 5.82999992371) (F) (V) (P 20 10.0) 
(P 6 9.67000007629) (P 6 9.32999992371) (P 6 10.0) 



In [20]:
def eventToLetter(event):
    """Return the one-letter code of an event: 'P' (Problem), 'V' (Video) or 'F' (Forum).

    Any other 'EventType' raises KeyError.
    """
    letters = {"Problem": "P", "Video": "V", "Forum": "F"}
    return letters[event['EventType']]

# Flatten the per-student pattern dicts into a single RDD of patterns, keep only the
# patterns that end with a Problem event, and key each one by its letter sequence
rdd_patterns = (rdd_events_by_students_extract
    .flatMap(lambda student: student[1].values())
    .filter(lambda pattern: pattern[-1]['EventType']=='Problem')
    .map(lambda pattern: (
        "".join(map(eventToLetter, pattern)),
        {
            # grades of the first and last event of the pattern (None when absent)
            "from":pattern[0].get('Grade'),
            "to":pattern[-1].get('Grade')
        }
    ))
)

# Number of occurrences of each letter sequence; show ten of them
patterns_counts = rdd_patterns.countByKey()
for pattern in list(patterns_counts)[:10]:
    print(pattern, patterns_counts[pattern])

PPFPP 32
PPVPPVP 2
PPPPPPPPVPPPPP 1
PPVPPPP 8
PFVFVFVFVFPFPFP 1
PVFVPFP 2
PPPPPPPPVPVPPP 1
PPPPPFPPP 1
PFVFPP 6
PFVPVFP 1
