# ADA - Project
## With Spark

## Getting started

In [1]:
from pyspark import SparkConf, SparkContext

In [2]:
# Reuse the already-running SparkContext when this cell is executed again:
# calling the SparkContext constructor a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
# Master ("local") and application name ("ADA") are unchanged.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("ADA"))

In [3]:
# Smoke test: distribute the integers 0..99 as an RDD and sum them with Spark.
# The expected result is 4950.
data = sc.parallelize(range(100))
data.sum()

4950

If you received 4950 as a result, your Spark installation is working well :) Good job!

## Importing and parsing the data

In [4]:
# Load each CSV file as an RDD holding one string per line of the file
rdd_problem_events = sc.textFile("../data/progfun-002_Problem_Events_with_Info.csv")
rdd_video_events = sc.textFile("../data/progfun-002_Video_Events.csv")
rdd_forum_events = sc.textFile("../data/progfun-002_Forum_Events.csv")

# Print the first line (the header) of every table as "index name" pairs,
# so that the position of each column can be read off the output
for banner, table_rdd in (
    ("\n--- PROBLEM_EVENTS ---\n", rdd_problem_events),
    ("\n--- VIDEO_EVENTS ---\n", rdd_video_events),
    ("\n--- FORUM_EVENTS ---\n", rdd_forum_events),
):
    print(banner)
    header_line = table_rdd.take(1)[0]
    for position, column in enumerate(header_line.split(",")):
        print(position, column, end=" / ")


--- PROBLEM_EVENTS ---

0 EventID / 1 ForumUserID / 2 MaximumSubmissions / 3 AccountUserID / 4 SubmissionNumber / 5 Grade / 6 TimeStamp / 7 DataPackageID / 8 ProblemID / 9 SoftCloseTime / 10 ProblemType / 11 HardCloseTime / 12 Platform / 13 OpenTime / 14 EventType / 15 Title / 16 SessionUserID / 17 UniqueProblemID / 18 UniqueUserID / 
--- VIDEO_EVENTS ---

0 EventID / 1 ForumUserID / 2 OldTime / 3 AccountUserID / 4 CurrentTime / 5 SeekType / 6 TimeStamp / 7 DataPackageID / 8 UniqueRowID / 9 TableName / 10 VideoID / 11 Platform / 12 NewSpeed / 13 EventSource / 14 EventType / 15 SessionUserID / 16 NewTime / 17 OldSpeed / 
--- FORUM_EVENTS ---

0 EventID / 1 ForumUserID / 2 PostID / 3 AccountUserID / 4 TimeStamp / 5 DataPackageID / 6 UniqueRowID / 7 TableName / 8 Platform / 9 EventSource / 10 PostType / 11 EventType / 12 JoinID / 13 SessionUserID / 

In [5]:
def _is_data_row(line):
    """Return False for a header line (one starting with 'EventID'), True otherwise."""
    return not line.startswith('EventID')

# Drop the header line from every table
rdd_problem_events = rdd_problem_events.filter(_is_data_row)
rdd_video_events = rdd_video_events.filter(_is_data_row)
rdd_forum_events = rdd_forum_events.filter(_is_data_row)

# Show the first remaining record of each table
for table_rdd in (rdd_problem_events, rdd_video_events, rdd_forum_events):
    print(table_rdd.take(1)[0], "\n")

3bcd1a54ed6ddb04b4a6fb2906110a01,None,0,None,7,None,1365344171,progfun-002,7,2147483640,Video,2147483640,Coursera,32400,Problem.Check,Lecture 1.2 - Elements of Programming (14:25),c4d4e5fcd2feba9f3234ee8d852dc7b22fbc07e4,f322944718b2ee0e53292118111533c7,21f13b3f6b50a83343b57d2f1d07dbdf 

db75adce6b87e7ab79242ea0af4b82d4,None,154.696,None,154.697,None,1372391638,progfun-002,00000078c0f0685cc50a25a8d5734a88,Video_Events,33,coursera,1.0,None,Video.Play,ef64fb7b096008f7eaf8441684afdf99af9af54a,None,1.0 

f3fdb52859b2511308aee554a573194e,None,17,4108315,1376254235,progfun-002,000006c12322ca29c7013dac42ef1a6a,Forum_Events,coursera,None,Thread,Forum.Thread.View,03b1fa287de5ef57d9c8482195b5167f,None 



In [6]:
# Function to parse the string entries from previous dataset
def parse_problems(x):
    """Parse one data line of the problem-events CSV into a dict.

    The line is decoded with the ``csv`` module rather than ``str.split(',')``:
    the free-text ``Title`` column (index 15) sits before ``SessionUserID``
    (index 16), so a quoted title containing a comma would otherwise shift
    the following columns and yield a wrong ``SessionUserID``.

    Args:
        x: One line of the file as a string (header line excluded).

    Returns:
        A dict of strings with the keys ``Grade``, ``TimeStamp``,
        ``ProblemID``, ``ProblemType``, ``EventType``, ``EventSubType`` and
        ``SessionUserID``. ``EventType`` and ``EventSubType`` are the first
        and second dot-separated parts of column 14 (e.g. ``Problem.Check``
        gives ``Problem`` and ``Check``).

    Raises:
        IndexError: If the line has fewer than 17 columns or column 14
            contains no ``.``.
    """
    # Local import: the notebook's import cell does not import csv.
    import csv

    data = next(csv.reader([x]))
    # Split the event type once instead of once per extracted part.
    event_parts = data[14].split('.')
    return {
        'Grade': data[5],
        'TimeStamp': data[6],
        'ProblemID': data[8],
        'ProblemType': data[10],
        'EventType': event_parts[0],
        'EventSubType': event_parts[1],
        'SessionUserID': data[16]
    }

def parse_videos(x):
    """Parse one comma-separated data line of the video-events table.

    Args:
        x: One line of the file as a string (header line excluded).

    Returns:
        A dict of strings with the keys ``TimeStamp`` (column 6), ``VideoID``
        (column 10), ``EventType`` / ``EventSubType`` (first and second
        dot-separated parts of column 14) and ``SessionUserID`` (column 15).
    """
    columns = x.split(',')
    event_parts = columns[14].split('.')
    record = {}
    record['TimeStamp'] = columns[6]
    record['VideoID'] = columns[10]
    record['EventType'] = event_parts[0]
    record['EventSubType'] = event_parts[1]
    record['SessionUserID'] = columns[15]
    return record

def parse_forums(x):
    """Parse one comma-separated data line of the forum-events table.

    Args:
        x: One line of the file as a string (header line excluded).

    Returns:
        A dict of strings with the keys ``AccountUserID`` (column 3),
        ``TimeStamp`` (column 4) and ``EventType`` / ``EventSubType`` (first
        and second dot-separated parts of column 11). Any further
        dot-separated part is dropped, e.g. ``Forum.Thread.View`` gives
        ``Forum`` and ``Thread``.
    """
    columns = x.split(',')
    event_parts = columns[11].split('.')
    record = {}
    record['AccountUserID'] = columns[3]
    record['TimeStamp'] = columns[4]
    record['EventType'] = event_parts[0]
    record['EventSubType'] = event_parts[1]
    return record

# Turn every raw line into a dict holding only the fields of interest
rdd_problem_events_parsed = rdd_problem_events.map(parse_problems)
rdd_video_events_parsed = rdd_video_events.map(parse_videos)
rdd_forum_events_parsed = rdd_forum_events.map(parse_forums)

# For each table, print the number of records followed by the first record
for parsed_rdd in (
    rdd_problem_events_parsed,
    rdd_video_events_parsed,
    rdd_forum_events_parsed,
):
    print(parsed_rdd.count())
    print(parsed_rdd.take(1)[0], "\n")


458888
{'ProblemType': 'Video', 'TimeStamp': '1365344171', 'EventSubType': 'Check', 'Grade': 'None', 'ProblemID': '7', 'SessionUserID': 'c4d4e5fcd2feba9f3234ee8d852dc7b22fbc07e4', 'EventType': 'Problem'} 

3471666
{'TimeStamp': '1372391638', 'VideoID': '33', 'EventSubType': 'Play', 'SessionUserID': 'ef64fb7b096008f7eaf8441684afdf99af9af54a', 'EventType': 'Video'} 

297650
{'AccountUserID': '4108315', 'EventSubType': 'Thread', 'TimeStamp': '1376254235', 'EventType': 'Forum'} 



In [7]:
# The forum events table identifies users by 'AccountUserID' instead of
# 'SessionUserID'; the table progfun-002_User_Hash_Mapping links the two.
rdd_user_mapping = sc.textFile("../data/progfun-002_User_Hash_Mapping.csv")
print(rdd_user_mapping.take(1)[0])

def _account_and_session(line):
    """Return the (column 1, column 6) pair of a comma-separated mapping line."""
    columns = line.split(",")
    return (columns[1], columns[6])

# Drop the header line and keep (AccountUserID, SessionUserID) pairs
rdd_user_mapping = rdd_user_mapping.filter(lambda line: not line.startswith("ForumUserID"))
rdd_user_mapping = rdd_user_mapping.map(_account_and_session)
print(rdd_user_mapping.take(1)[0])

# Key the forum events by AccountUserID and join them with the mapping, giving
# (AccountUserID, (SessionUserID, forum event dict)) tuples
forum_events_by_account = rdd_forum_events_parsed.keyBy(lambda event: event['AccountUserID'])
rdd_user_mapping = rdd_user_mapping.join(forum_events_by_account)
print(rdd_user_mapping.take(1)[0])

def f(x):
    """Attach the session user id to a joined forum event.

    Args:
        x: A ``(key, (session_user_id, event))`` tuple, where ``event`` is a
            dict describing one forum event.

    Returns:
        A new dict containing the entries of ``event`` plus the key
        ``'SessionUserID'`` set to ``session_user_id``. The input dict is
        copied rather than modified, so the function has no side effect on
        the record it receives.
    """
    joined = x[1]
    enriched = dict(joined[1])
    enriched['SessionUserID'] = joined[0]
    return enriched
# Replace the forum events by the joined records carrying a 'SessionUserID'
rdd_forum_events_parsed = rdd_user_mapping.map(f)
forum_event_count = rdd_forum_events_parsed.count()
print(forum_event_count)
first_forum_event = rdd_forum_events_parsed.take(1)[0]
print(first_forum_event, "\n")

ForumUserID,AccountUserID,DataPackageID,UniqueRowID,TableName,Platform,SessionUserID
('2751555', '3b578c343cf9c2ab6526e12d047403d18182f01a')
('1932792', ('a97848d806f0d88cf80dd154845693af40cf559f', {'AccountUserID': '1932792', 'EventSubType': 'ThreadSubscribe', 'TimeStamp': '1364416322', 'EventType': 'Forum'}))
297650
{'AccountUserID': '1932792', 'TimeStamp': '1364416322', 'EventSubType': 'ThreadSubscribe', 'SessionUserID': 'a97848d806f0d88cf80dd154845693af40cf559f', 'EventType': 'Forum'} 



In [8]:
# Concatenate all three tables into one big table:
# problem events first, then video events, then forum events
rdd_events = rdd_problem_events_parsed.union(rdd_video_events_parsed)
rdd_events = rdd_events.union(rdd_forum_events_parsed)

# Total number of events, followed by the first record
print(rdd_events.count())
first_event = rdd_events.take(1)[0]
print(first_event, "\n")

4228204
{'ProblemType': 'Video', 'SessionUserID': 'c4d4e5fcd2feba9f3234ee8d852dc7b22fbc07e4', 'TimeStamp': '1365344171', 'Grade': 'None', 'EventSubType': 'Check', 'ProblemID': '7', 'EventType': 'Problem'} 

