# ADA - Project

# Getting started

In [1]:
from pyspark import SparkContext
import json
import utils

# Create the Spark context used by the rest of the notebook:
# "local[*]" is the master URL and "ADA" the application name.
sc = SparkContext("local[*]", "ADA")
# Trailing expression: its value is what the notebook shows as the cell output.
sc.defaultParallelism

8

# Configuration

In [2]:
# Notebook-wide settings, gathered in a single dictionary.
config = {}
# NOTE(review): presumably the identifier of the course session being analysed - confirm.
config["session"] = "progfun-002"

# Importing and parsing the data

In [3]:
# Read the preprocessed data directory as a text RDD.
# NOTE(review): presumably one JSON record per line, since the next cell maps json.loads over it - confirm.
rdd = sc.textFile('data/spark/preprocessed/')

In [4]:
# Parse every record with json.loads and count them; the count is the cell output.
rdd.map(json.loads).count()

161934

In [None]:
def eventToString(event,verbosity):
    """Render a single event as a short text code.

    Args:
        event: mapping with an 'EventType' key equal to 'Problem', 'Video'
            or 'Forum'. Depending on the verbosity, 'ProblemID', 'VideoID'
            and 'Date' are read as well.
        verbosity: one of
            'minimal' - one letter per event ('P', 'V' or 'F');
            'normal'  - the letter followed by the problem/video ID
                        (Forum events stay a bare 'F');
            'dates'   - the 'normal' form followed by '.' and the date.

    Returns:
        The string code of the event.

    Raises:
        ValueError: if verbosity is not one of the three supported values.
        KeyError: if the event type is unknown or a required field is missing.
    """
    if verbosity == 'minimal':
        out = {
            "Problem": lambda x:"P",
            "Video": lambda x:"V",
            "Forum": lambda x:"F",
        }
    elif verbosity == 'normal':
        out = {
            "Problem": lambda x:"P"+str(x['ProblemID']),
            "Video": lambda x:"V"+str(x['VideoID']),
            "Forum": lambda x:"F",
        }
    elif verbosity == 'dates':
        out = {
            "Problem": lambda x:"P"+str(x['ProblemID'])+"."+str(x['Date']),
            "Video": lambda x:"V"+str(x['VideoID'])+"."+str(x['Date']),
            "Forum": lambda x:"F"+"."+str(x['Date']),
        }
    else:
        # Without this branch `out` would be unbound and the caller would get
        # an opaque UnboundLocalError instead of a meaningful message.
        raise ValueError(
            "Unknown verbosity %r (expected 'minimal', 'normal' or 'dates')" % (verbosity,)
        )
    return out[event['EventType']](event)

def patternToString(pattern,verbosity):
    """Render a whole pattern (iterable of events) as one string.

    Each event is rendered by eventToString with the same verbosity, and the
    pieces are glued with a separator that depends on the verbosity: nothing
    for 'minimal', a dash for 'normal', and a newline followed by '-> ' for
    anything else.
    """
    if verbosity == 'minimal':
        separator = ""
    elif verbosity == 'normal':
        separator = "-"
    else:
        separator = '\n-> '

    pieces = []
    for event in pattern:
        pieces.append(eventToString(event,verbosity))
    return separator.join(pieces)

def containsProblem(pattern):
    """Tell whether the pattern holds at least one Problem event.

    The pattern is rendered in 'minimal' verbosity and the result is
    searched for the letter 'P' that stands for a Problem event.
    """
    minimalCodes = patternToString(pattern,'minimal')
    return minimalCodes.find('P') != -1

def getFirstProblem(pattern):
    """Return the first Problem event of the pattern.

    Args:
        pattern: indexable sequence of events accepted by patternToString.

    Returns:
        The first event whose 'minimal' code is 'P'.

    Raises:
        ValueError: if the pattern contains no Problem event.
    """
    # In 'minimal' verbosity every event is rendered as exactly one
    # character, so a position in the string is also an index in the pattern.
    indexOfFirstProblem = patternToString(pattern,'minimal').find('P')
    if indexOfFirstProblem == -1:
        # str.find reports "not found" as -1; indexing with it would silently
        # return the last event, whatever its type.
        raise ValueError("pattern contains no Problem event")
    return pattern[indexOfFirstProblem]

def getLastProblem(pattern):
    """Return the final event of the pattern.

    NOTE(review): despite the name, nothing here checks that this event is a
    Problem; it presumably relies on the pattern having been truncated after
    its last problem (as getBeforeLastProblem does) - confirm for other callers.
    """
    lastPosition = len(pattern) - 1
    return pattern[lastPosition]

def getBeforeLastProblem(pattern):
    """Return the pattern truncated right after its last Problem event.

    The last Problem event itself is kept. When the pattern holds no Problem
    event, the result is an empty slice.
    """
    minimalCodes = patternToString(pattern,'minimal')
    # One character per event, so the string position is the event index;
    # rfind gives -1 when there is no 'P', which turns the cut-off into 0.
    cutoff = minimalCodes.rfind('P') + 1
    return pattern[:cutoff]

def getBeforeFirstProblem(pattern):
    """Return the events that come strictly before the first Problem event.

    Args:
        pattern: sliceable sequence of events accepted by patternToString.

    Returns:
        The slice of the pattern preceding the first Problem event. When the
        pattern contains no Problem event, every event counts as "before" and
        a copy of the whole pattern is returned, so that
        getBeforeFirstProblem(p) + getAfterFirstProblem(p) always rebuilds p.
    """
    # In 'minimal' verbosity every event is rendered as exactly one
    # character, so a position in the string is also an index in the pattern.
    indexOfFirstProblem = patternToString(pattern,'minimal').find('P')
    if indexOfFirstProblem == -1:
        # str.find reports "not found" as -1; used as a slice bound it would
        # silently drop the last event.
        return pattern[:]
    return pattern[:indexOfFirstProblem]

def getAfterFirstProblem(pattern):
    """Return the events from the first Problem event (included) to the end.

    Args:
        pattern: sliceable sequence of events accepted by patternToString.

    Returns:
        The slice of the pattern starting at the first Problem event. When the
        pattern contains no Problem event the slice is empty, so that
        getBeforeFirstProblem(p) + getAfterFirstProblem(p) always rebuilds p.
    """
    # In 'minimal' verbosity every event is rendered as exactly one
    # character, so a position in the string is also an index in the pattern.
    indexOfFirstProblem = patternToString(pattern,'minimal').find('P')
    if indexOfFirstProblem == -1:
        # str.find reports "not found" as -1; used as a slice start it would
        # wrongly return the last event.
        return pattern[:0]
    return pattern[indexOfFirstProblem:]

# Flatten the per-student pattern tables into one table holding every pattern,
# keep only the patterns with at least one problem, and cut each of them
# right after its last problem.
rdd_patterns = (
    rdd_events_by_students_extract
    # student = (studentID, {problemID: pattern, ...})
    .flatMap(lambda student: list(student[1].items()))
    # entry = (problemID, pattern)
    .filter(lambda entry: containsProblem(entry[1]))
    # entry = (problemID, pattern)
    .map(lambda entry: (entry[0], getBeforeLastProblem(entry[1])))
)

rdd_patterns.persist()

# Show the first ten patterns in their most compact form.
for pattern in rdd_patterns.map(lambda entry: patternToString(entry[1], 'minimal')).take(10):
    print(pattern)

In [None]:
def getFeatures(problem, pattern):
    """Compute the feature dictionary describing one pattern for one problem.

    Args:
        problem: key into the module-level LECTURES_PER_PROBLEM mapping, whose
            value is the collection of video IDs attached to that problem.
        pattern: non-empty sequence of event dicts. Every event has an
            'EventType' and a 'TimeStamp'; Video events have a 'VideoID' and
            Problem events a 'Grade'.
            NOTE(review): the code reads 'Grade' on the last event, so the
            pattern is assumed to contain a Problem event and to end on one
            (rdd_patterns builds them that way) - confirm for other callers.

    Returns:
        dict mapping feature names to numbers, always inserted in the same
        order.
    """
    features = {}

    # Only Video events are considered here: a Forum event located before the
    # first problem is not a watched video and may not carry a 'VideoID'.
    videoBeforeFirstProblem = [
        event['VideoID']
        for event in getBeforeFirstProblem(pattern)
        if event['EventType'] == 'Video'
    ]
    distinctVideosBeforeFirstProblem = set(videoBeforeFirstProblem)

    # Values reused by several features below.
    firstProblem = getFirstProblem(pattern)
    firstEvent = pattern[0]
    lastEvent = pattern[-1]
    numberOfProblems = sum([event['EventType']=='Problem' for event in pattern])
    # NOTE(review): assumes 'TimeStamp' is expressed in seconds, which makes
    # the time features below a number of days - confirm.
    secondsPerDay = 3600*24

    watchedAll = all(
        videoID in distinctVideosBeforeFirstProblem
        for videoID in LECTURES_PER_PROBLEM[problem]
    )
    features['watchedAllVideosBeforeFirstProblem'] = 1. if watchedAll else 0.

    # A video was repeated when the list is longer than its set of distinct IDs.
    value = 1. if len(videoBeforeFirstProblem) != len(distinctVideosBeforeFirstProblem) else 0.
    features['repeatedVideoBeforeFirstProblem'] = value

    value = sum([event['EventType']=='Video' for event in pattern])
    features['totalNumberOfVideoWatched'] = value

    features['totalNumberOfProblemSubmissions'] = numberOfProblems

    value = 1. if [v for v in getAfterFirstProblem(pattern) if v['EventType']=='Video'] else 0.
    features['watchedSomeVideoAfterFirstProblem'] = value

    value = 1. if lastEvent['Grade'] > firstProblem['Grade'] else 0.
    features['increaseGradeFromFirstToLastProblem'] = value

    features['proportionOfProblemEvents'] = numberOfProblems/len(pattern)

    value = firstProblem['TimeStamp'] - firstEvent['TimeStamp']
    features['timeBetweenStartAndFirstProblem'] = value / secondsPerDay

    value = lastEvent['TimeStamp'] - firstEvent['TimeStamp']
    features['timeBetweenStartAndLastProblem'] = value / secondsPerDay

    value = lastEvent['TimeStamp'] - firstProblem['TimeStamp']
    features['timeBetweenFirstAndLastProblem'] = value / secondsPerDay

    features['firstProblemGrade'] = firstProblem['Grade']

    features['lastProblemGrade'] = lastEvent['Grade']

    # Grade gain divided by the room left for improvement.
    # NOTE(review): 10.00001 is presumably the maximum grade (10) plus a small
    # epsilon that avoids a division by zero on a perfect first grade - confirm.
    value = max(0,(lastEvent['Grade']-firstProblem['Grade']))/(10.00001-firstProblem['Grade'])
    features['percentageImprovedBetweenFirstAndLastProblem'] = value

    return(features)

# Attach to every pattern its dated text rendering and its feature dictionary.
rdd_pattern_features = rdd_patterns.map(
    # INPUT: entry = (problemID, pattern), pattern being the sequence of events
    # OUTPUT: (problemID, rendered pattern (str), features (dict))
    lambda entry: (
        entry[0],
        patternToString(entry[1], 'dates'),
        getFeatures(entry[0], entry[1]),
    )
)
rdd_pattern_features.persist()

# Names of the features, read from the first feature dictionary.
FEATURES = list(rdd_pattern_features.first()[2])

# Display one complete example.
for featureVector in rdd_pattern_features.take(1):
    print('problem', featureVector[0])
    print('pattern\n->', featureVector[1])
    pp.pprint(featureVector[2])
    print()