# Feature Engineering

In [1]:
# Spark entry point, JSON parsing, the project-local `utils` helpers
# (patternToString is used throughout this notebook) and pretty printing.
from pyspark import SparkContext
import json
import utils
import pprint
# Shared pretty printer for ad-hoc inspection of nested records.
pp = pprint.PrettyPrinter(indent=4)

import pandas as pd 

# "local[*]" is a local Spark master (per the Spark docs: one worker thread per
# logical core); "ADA" is the application name.
sc = SparkContext("local[*]", "ADA")
# Last expression of the cell: displays the default number of partitions.
sc.defaultParallelism

8

# Importing the learning patterns

In [2]:
# Read every text line under data/spark/preprocessed/ and parse each line as
# one JSON document. getFeatures below reads the keys 'StudentID', 'ProblemID'
# and 'Pattern' from each parsed record.
rdd_patterns = sc.textFile('data/spark/preprocessed/').map(json.loads)
# Triggers the Spark job and displays the number of parsed records.
rdd_patterns.count()

41791

# Computing the features of each learning pattern

In [3]:
def containsProblem(pattern):
    """Tell whether the pattern holds at least one problem event.

    The pattern is rendered with utils.patternToString in 'minimal' verbosity
    and the result is searched for the letter 'P' (presumably the marker of a
    problem event -- confirm in utils).
    """
    minimalEncoding = utils.patternToString(pattern, 'minimal')
    hasProblem = 'P' in minimalEncoding
    return hasProblem

def getFirstProblem(pattern):
    """Return the first problem event of a pattern.

    The position is the offset of the first 'P' in the 'minimal' string
    rendering of the pattern, reused as an index into the event list.
    NOTE(review): this relies on patternToString emitting exactly one
    character per event -- confirm in utils.

    Raises:
        ValueError: if the rendering contains no 'P'.
    """
    indexOfFirstProblem = utils.patternToString(pattern,'minimal').find('P')
    if indexOfFirstProblem == -1:
        # str.find reports "not found" as -1; used as an index that would
        # silently return the LAST event instead of failing.
        raise ValueError('pattern contains no problem event')
    return pattern[indexOfFirstProblem]

def getLastProblem(pattern):
    """Return the last problem event of a pattern.

    The position is the offset of the last 'P' in the 'minimal' string
    rendering of the pattern, reused as an index into the event list.
    NOTE(review): this relies on patternToString emitting exactly one
    character per event -- confirm in utils.

    Raises:
        ValueError: if the rendering contains no 'P'.
    """
    indexOfLastProblem = utils.patternToString(pattern,'minimal').rfind('P')
    if indexOfLastProblem == -1:
        # str.rfind reports "not found" as -1; used as an index that would
        # return the last event whether or not it is a problem.
        raise ValueError('pattern contains no problem event')
    return pattern[indexOfLastProblem]

def getBeforeFirstProblem(pattern):
    """Return the events that precede the first problem event (exclusive).

    The split point is the offset of the first 'P' in the 'minimal' string
    rendering of the pattern.
    NOTE(review): this relies on patternToString emitting exactly one
    character per event -- confirm in utils.

    When the pattern has no problem event, a copy of the whole pattern is
    returned: every event is "before" a problem that never occurs.
    """
    indexOfFirstProblem = utils.patternToString(pattern,'minimal').find('P')
    if indexOfFirstProblem == -1:
        # str.find reports "not found" as -1; pattern[:-1] would silently drop
        # the last event.
        return pattern[:]
    return pattern[:indexOfFirstProblem]

def getAfterFirstProblem(pattern):
    """Return the events from the first problem event onwards.

    The returned slice INCLUDES the first problem event itself. The split
    point is the offset of the first 'P' in the 'minimal' string rendering of
    the pattern.
    NOTE(review): this relies on patternToString emitting exactly one
    character per event -- confirm in utils.

    When the pattern has no problem event, an empty slice is returned.
    """
    indexOfFirstProblem = utils.patternToString(pattern,'minimal').find('P')
    if indexOfFirstProblem == -1:
        # str.find reports "not found" as -1; pattern[-1:] would return the
        # last event as if it followed a problem.
        return pattern[:0]
    return pattern[indexOfFirstProblem:]

def getConcatenatedPattern(pattern):
    """Collapse redundant consecutive events of a pattern.

    The first event is always kept. Each later event is kept only when:
      * it is a 'Problem' event; or
      * it is a 'Video' event whose 'VideoID' differs from the 'VideoID' of
        the last kept event (-1 is used when that event has no 'VideoID'); or
      * it is a 'Forum' event with sub-type 'Post'; or
      * it is a 'Forum' event with sub-type 'Thread' and the last kept event
        is not a 'Forum' event.
    Every other event is dropped.

    Args:
        pattern: list of event dicts carrying at least 'EventType'.

    Returns:
        A new list with the kept events, in their original order. An empty
        pattern yields an empty list.
    """
    if not pattern:
        # Nothing to seed the result with; avoid IndexError on pattern[0].
        return []

    concatenated = [pattern[0]]
    for event in pattern[1:]:
        previous = concatenated[-1]
        eventType = event['EventType']

        if eventType == 'Problem':
            keep = True
        elif eventType == 'Video':
            # Comparisons are against the last KEPT event, so a run of the
            # same video collapses to its first occurrence.
            keep = event['VideoID'] != previous.get('VideoID',-1)
        elif eventType == 'Forum':
            subType = event['EventSubType']
            keep = subType == 'Post' or (
                subType == 'Thread' and previous['EventType'] != 'Forum'
            )
        else:
            keep = False

        if keep:
            concatenated.append(event)
    return concatenated

def getFeatures(item):
    """Build the feature record of one learning pattern.

    Args:
        item: dict with the keys 'StudentID', 'ProblemID' and 'Pattern'
            (a list of event dicts).

    Returns:
        A dict with 'StudentID', 'ProblemID', 'Features' (feature name ->
        value), 'Pattern' (the 'minimal' string rendering of the raw pattern)
        and 'PatternC' (the same rendering of the concatenated pattern).

    Counts and proportions are computed on the concatenated pattern; the
    first/last problem events and the "video after first problem" flag are
    taken from the raw pattern.
    """
    # Divisor applied to every time difference (3600*24; assumes timestamps
    # are expressed in seconds, giving days -- TODO confirm).
    timeUnit = 3600*24

    studentId = item['StudentID']
    problemId = item['ProblemID']
    events = item['Pattern']

    concatPattern = getConcatenatedPattern(events)
    firstProblem = getFirstProblem(events)
    lastProblem = getLastProblem(events)

    def countOf(eventType):
        # Number of events of the given type in the concatenated pattern.
        return sum(1 for event in concatPattern if event['EventType'] == eventType)

    # Videos seen before the first problem (with repetitions) and their distinct ids.
    videoIds = [
        event['VideoID']
        for event in getBeforeFirstProblem(concatPattern)
        if event['EventType'] == 'Video'
    ]
    distinctVideoIds = set(videoIds)

    # Video events from the first problem onwards, in the raw pattern.
    videosAfter = [
        event for event in getAfterFirstProblem(events)
        if event['EventType'] == 'Video'
    ]

    nVideo = countOf('Video')
    nProblem = countOf('Problem')
    nForum = countOf('Forum')
    nEvents = len(concatPattern)

    startTime = events[0]['TimeStamp']
    firstTime = firstProblem['TimeStamp']
    lastTime = lastProblem['TimeStamp']
    hardClose = firstProblem['HardCloseTime']

    firstGrade = firstProblem['Grade']
    lastGrade = lastProblem['Grade']

    rawString = utils.patternToString(events, verbosity='minimal')
    concatString = utils.patternToString(concatPattern, verbosity='minimal')

    # Keys are inserted in a fixed order; keep it stable for downstream output.
    features = {}
    features['numberOfVideosBeforeFirstProblem'] = len(distinctVideoIds)
    features['repeatedVideoBeforeFirstProblem'] = (
        1. if len(videoIds) != len(distinctVideoIds) else 0.
    )
    features['numberOfVideoEvent'] = nVideo
    features['numberOfProblemEvent'] = nProblem
    features['numberOfForumEvent'] = nForum
    features['watchedVideoAfterFirstProblem'] = 1. if videosAfter else 0.
    features['proportionOfProblem'] = nProblem/nEvents
    features['proportionOfVideo'] = nVideo/nEvents
    features['proportionOfForum'] = nForum/nEvents
    features['timeBetweenStartAndFirstProblem'] = (firstTime - startTime) / timeUnit
    features['timeBetweenStartAndLastProblem'] = (lastTime - startTime) / timeUnit
    features['timeBetweenFirstAndLastProblem'] = (lastTime - firstTime) / timeUnit
    features['timeFromStartToHardCloseTime'] = (hardClose - startTime) / timeUnit
    features['timeFromFirstToHardCloseTime'] = (hardClose - firstTime) / timeUnit
    features['timeFromLastToHardCloseTime'] = (hardClose - lastTime) / timeUnit
    # Mean gap between consecutive problem events; the max() keeps the
    # divisor at 1 when there is a single problem event.
    features['averageResubmitTime'] = (
        ((lastTime - firstTime) / max(1,nProblem-1)) / timeUnit
    )
    features['firstProblemGrade'] = firstGrade
    features['lastProblemGrade'] = lastGrade
    features['increaseGrade'] = 1. if lastGrade > firstGrade else 0.
    # Gain relative to the remaining margin; 10.00001 rather than 10 keeps the
    # denominator non-zero when the first grade is already 10.
    features['percentageIncreased'] = (
        max(0,(lastGrade-firstGrade))/(10.00001-firstGrade)
    )
    features['patternString'] = concatString

    return {
        "StudentID": studentId,
        "ProblemID": problemId,
        "Features": features,
        "Pattern": rawString,
        "PatternC": concatString,
    }

# Lazily derive one feature record per parsed pattern record.
rdd_features = rdd_patterns.map(getFeatures)

# Preview only: pull the 'Features' dict of the first 1020 records into a
# pandas DataFrame and display its last rows.
df = pd.DataFrame(rdd_features.map(lambda x: x['Features']).take(1020))
df.tail()

Unnamed: 0,averageResubmitTime,firstProblemGrade,increaseGrade,lastProblemGrade,numberOfForumEvent,numberOfProblemEvent,numberOfVideoEvent,numberOfVideosBeforeFirstProblem,patternString,percentageIncreased,...,proportionOfProblem,proportionOfVideo,repeatedVideoBeforeFirstProblem,timeBetweenFirstAndLastProblem,timeBetweenStartAndFirstProblem,timeBetweenStartAndLastProblem,timeFromFirstToHardCloseTime,timeFromLastToHardCloseTime,timeFromStartToHardCloseTime,watchedVideoAfterFirstProblem
1015,0.062859,2.62,0.0,2.58,1,2,4,4,VVVVFPP,0.0,...,0.285714,0.571429,0.0,0.062859,8.043218,8.106076,4.667049,4.60419,12.710266,0.0
1016,0.036603,9.28,1.0,10.0,3,3,6,6,VVVVVVFPFPFP,0.999986,...,0.25,0.5,0.0,0.073206,6.99338,7.066586,5.651806,5.5786,12.645185,0.0
1017,0.416603,7.82,1.0,10.0,3,3,4,3,VFVVVFPFPP,0.999995,...,0.3,0.4,1.0,0.833206,5.098113,5.931319,5.570289,4.737083,10.668403,0.0
1018,0.0,5.22,0.0,5.22,3,1,7,6,VVFVVVVFVFP,0.0,...,0.090909,0.636364,1.0,0.0,4.154907,4.154907,4.616204,4.616204,8.771111,0.0
1019,0.013322,2.0,1.0,7.58,2,3,6,5,VVVFVVVFPPP,0.697499,...,0.272727,0.545455,1.0,0.026644,12.018646,12.045289,4.673519,4.646875,16.692164,0.0


In [4]:
import json
import os
import shutil

# Output location of the feature records.
directory='data/spark/features'
# Remove any previous output first so the save below starts from a clean
# directory (NOTE(review): saveAsTextFile is expected to refuse an existing
# path -- confirm against the Spark version in use).
if os.path.exists(directory):
    shutil.rmtree(directory)

# Serialise every feature record as one JSON document per line.
rdd_features.map(json.dumps).saveAsTextFile(directory)