# Feature Engineering

In [1]:
from pyspark import SparkContext
import json
import utils
import pprint
pp = pprint.PrettyPrinter(indent=4)

sc = SparkContext("local[*]", "ADA")
sc.defaultParallelism

4

# Configuration

In [2]:
config = {
    "session":"progfun-002"
}

# Importing the learning patterns

In [3]:
rdd_patterns = sc.textFile('data/spark/preprocessed/').map(json.loads)
rdd_patterns.count()

161934

# Cleaning

In [4]:
def cleaning(item):
    studentId = item['studentId']
    problemId = item['problemId']
    pattern = item['pattern']
    
    patternStr = utils.patternToString(pattern,'minimal')
    
    if not 'P' in patternStr:
        return []
    
    lastProblemIndex = patternStr.rfind('P')
    
    return([{
        "studentId": studentId,
        "problemId": problemId,
        "pattern": pattern[:(lastProblemIndex+1)]
    }])

rdd_patterns_clean = rdd_patterns.flatMap(cleaning).persist()
print('count', rdd_patterns_clean.count())

for item in rdd_patterns_clean.take(5):
    print(item['studentId'], '\t', item['problemId'])
    print(utils.patternToString(item['pattern'], 'minimal'))
    print()

count 49344
fbcad37e5e8f783e53e89fdeee648a230e309f04 	 6
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVP

fbcad37e5e8f783e53e89fdeee648a230e309f04 	 7
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVP

fbcad37e5e8f783e53e89fdeee648a230e309f04 	 12
VVVVVVVP

fbcad37e5e8f783e53e89fdeee648a230e309f04 	 14
VVVVP

9606552f10e8c158e312a7119c86d971abf5e52b 	 6
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVPP



# Feature engineering

In [5]:
LECTURES_PER_PROBLEM = {
    7: [3,4,5,6,7,8,33], # Lecture 1
    6: [35,37,39,41,43,47,49], # Lecture 2
    12: [51,53,75], # Lecture 3
    14: [71,81,85,79,87,77], # Lecture 4
    17: [109,105,115,107,103,113,111], # Lecture 6
    20: [123,117,125,121,127,119] # Lecture 7
}

PROBLEM_PER_LECTURE = {}
for pb in LECTURES_PER_PROBLEM.keys():
    for lc in LECTURES_PER_PROBLEM[pb]:
        PROBLEM_PER_LECTURE[lc]=pb

def containsProblem(pattern):
    return 'P' in patternToString(pattern,'minimal')

def getFirstProblem(pattern):
    indexOfFirstProblem = utils.patternToString(pattern,'minimal').find('P')
    return pattern[indexOfFirstProblem]

def getLastProblem(pattern):
    return pattern[-1]

def getBeforeFirstProblem(pattern):
    indexOfFirstProblem = utils.patternToString(pattern,'minimal').find('P')
    return pattern[:indexOfFirstProblem]

def getAfterFirstProblem(pattern):
    indexOfFirstProblem = utils.patternToString(pattern,'minimal').find('P')
    return pattern[indexOfFirstProblem:]

def getFeatures(item):
    studentId = item['studentId']
    problemId = item['problemId']
    pattern = item['pattern']
    
    features = {}
    
    videoIdBeforeFirstProblem = [video['VideoID'] for video in getBeforeFirstProblem(pattern) if video['EventType']=='Video']
    value = len(set(videoIdBeforeFirstProblem)) == len(LECTURES_PER_PROBLEM[problemId])
    features['watchedAllVideosBeforeFirstProblem'] = value

    value = 1. if len(videoIdBeforeFirstProblem) != len(set(videoIdBeforeFirstProblem)) else 0.
    features['repeatedVideoBeforeFirstProblem'] = value

    value = sum([event['EventType']=='Video' for event in pattern])
    features['totalNumberOfVideoWatched'] = value

    value = sum([event['EventType']=='Problem' for event in pattern])
    features['totalNumberOfProblemSubmissions'] = value 
            
    value = 1. if [v for v in getAfterFirstProblem(pattern) if v['EventType']=='Video'] else 0.
    features['watchedSomeVideoAfterFirstProblem'] = value

    firstProblem = getFirstProblem(pattern)
    value = 1. if pattern[-1]['Grade'] > firstProblem['Grade'] else 0.
    features['increaseGradeFromFirstToLastProblem'] = value

    value = sum([event['EventType']=='Problem' for event in pattern])/len(pattern)
    features['proportionOfProblemEvents'] = value
    
    value = firstProblem['TimeStamp'] - pattern[0]['TimeStamp']
    features['timeBetweenStartAndFirstProblem'] = value / (3600*24)
    
    value = pattern[-1]['TimeStamp'] - pattern[0]['TimeStamp']
    features['timeBetweenStartAndLastProblem'] = value / (3600*24)
    
    value = pattern[-1]['TimeStamp'] - firstProblem['TimeStamp']
    features['timeBetweenFirstAndLastProblem'] = value / (3600*24)
    
    value = firstProblem['Grade']
    features['firstProblemGrade'] = value
    
    value = pattern[-1]['Grade']
    features['lastProblemGrade'] = value
    
    value = max(0,(pattern[-1]['Grade']-firstProblem['Grade']))/(10.00001-firstProblem['Grade'])
    features['percentageImprovedBetweenFirstAndLastProblem'] = value
        
    return({
        "studentId": studentId,
        "problemId": problemId,
        "features": features,
        "pattern": utils.patternToString(pattern,verbosity='minimal')
    })

rdd_features = rdd_patterns_clean.map(getFeatures)
for feature in rdd_features.take(5):
    pp.pprint(feature['features'])
    print(feature['pattern'])
    print()

{   'firstProblemGrade': 10.0,
    'increaseGradeFromFirstToLastProblem': 0.0,
    'lastProblemGrade': 10.0,
    'percentageImprovedBetweenFirstAndLastProblem': 0.0,
    'proportionOfProblemEvents': 0.006993006993006993,
    'repeatedVideoBeforeFirstProblem': 1.0,
    'timeBetweenFirstAndLastProblem': 0.0,
    'timeBetweenStartAndFirstProblem': 10.096863425925926,
    'timeBetweenStartAndLastProblem': 10.096863425925926,
    'totalNumberOfProblemSubmissions': 1,
    'totalNumberOfVideoWatched': 142,
    'watchedAllVideosBeforeFirstProblem': True,
    'watchedSomeVideoAfterFirstProblem': 0.0}
VVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVVP

{   'firstProblemGrade': 10.0,
    'increaseGradeFromFirstToLastProblem': 0.0,
    'lastProblemGrade': 10.0,
    'percentageImprovedBetweenFirstAndLastProblem': 0.0,
    'proportionOfProblemEvents': 0.010309278350515464,
    'repeatedVideoBeforeFirstProble

In [6]:
import json
import os
import shutil

directory='data/spark/features'
if os.path.exists(directory):
    shutil.rmtree(directory)

rdd_features.map(json.dumps).saveAsTextFile(directory)