# Prepare data for Sankey Diagram with D3

In [1]:
import json
import utils
import pandas as pd
from collections import Counter, OrderedDict

from pyspark import SparkContext

# Start a Spark context with master "local[*]" and application name "ADA",
# then display the default parallelism it reports.
sc = SparkContext(master="local[*]", appName="ADA")
sc.defaultParallelism

8

In [2]:
# Read the preprocessed dataset as text and parse every line as a JSON record.
raw_lines = sc.textFile('data/spark/preprocessed/')
rdd = raw_lines.map(json.loads)

In [3]:
# Tally how many records belong to each problem id
problem_ids = rdd.map(lambda record: record['ProblemID'])
Counter(problem_ids.collect())

Counter({7: 9721, 21: 8551, 22: 4731, 23: 6128, 24: 7310, 25: 5350})

In [4]:
# Problem id selected for this run: used to filter the records in
# get_final_patterns and to name the exported JSON file.
PROBLEM_ID = 25

In [5]:
def process_data(item):
    """Condense a record's event list into a comma-separated pattern string.

    The pattern always starts with ``S``. Every ``Video``/``Load`` event
    contributes ``V<VideoID>`` and every ``Problem`` event contributes
    ``A<n>``, where ``n`` is the running number of problem events seen so
    far. A video token is skipped when it is identical to the token right
    before it, so loading the same video twice in a row does not produce a
    self-loop. All other events are ignored.

    Args:
        item: Mapping with ``StudentID``, ``ProblemID`` and ``Pattern`` keys.
            ``Pattern`` is an iterable of event mappings, each carrying
            ``EventType`` and, for video events, ``EventSubType`` and
            ``VideoID``.

    Returns:
        Dict with the record's ``StudentID`` and ``ProblemID`` and the
        condensed ``Pattern`` string (e.g. ``'S,V103,A1,A2'``).
    """
    tokens = ['S']
    attempts = 0

    for event in item['Pattern']:
        event_type = event['EventType']
        if event_type == 'Video' and event['EventSubType'] == 'Load':
            token = 'V' + str(event['VideoID'])
            # Compare with the whole previous token rather than a fixed-width
            # slice of the string, so repeats are detected for video ids of
            # any length.
            if tokens[-1] != token:
                tokens.append(token)
        elif event_type == 'Problem':
            attempts += 1
            tokens.append('A' + str(attempts))

    return {
        "StudentID": item['StudentID'],
        "ProblemID": item['ProblemID'],
        "Pattern": ','.join(tokens),
    }

In [6]:
def get_final_patterns(rdd, min_count=30, problem_id=None):
    """Count the condensed patterns of one problem and keep the frequent ones.

    Args:
        rdd: RDD of record dicts, each with at least the ``ProblemID`` key
            plus whatever ``process_data`` reads.
        min_count: A pattern is kept only if it occurs strictly more than
            this many times. Defaults to 30.
        problem_id: Problem to select. Defaults to the module-level
            ``PROBLEM_ID`` when left as ``None``.

    Returns:
        Dict mapping each retained pattern string to its occurrence count.
    """
    if problem_id is None:
        problem_id = PROBLEM_ID

    # Filter by problem id and condense each record's pattern
    rdd_processed = rdd.filter(lambda x: x['ProblemID'] == problem_id).map(process_data)
    # Count the occurrences of each pattern (collected on the driver)
    patterns_with_counts = Counter(rdd_processed.map(lambda x: x['Pattern']).collect())
    # Drop the patterns that do not occur more than min_count times
    return {k: v for k, v in patterns_with_counts.items() if v > min_count}

In [7]:
# Frequent patterns (pattern string -> count) for the selected PROBLEM_ID
final_patterns = get_final_patterns(rdd)

In [8]:
# Display the retained patterns and their counts
final_patterns

{'S,A1': 1319,
 'S,A1,A2': 338,
 'S,A1,A2,A3': 117,
 'S,A1,A2,A3,A4': 35,
 'S,V103,A1': 93,
 'S,V103,V105,A1': 46,
 'S,V103,V105,V107,A1': 36,
 'S,V103,V105,V107,V109,V111,A1': 37,
 'S,V103,V105,V107,V109,V111,V113,A1': 46,
 'S,V103,V105,V107,V109,V111,V113,V115,A1': 708,
 'S,V103,V105,V107,V109,V111,V113,V115,A1,A2': 221,
 'S,V103,V105,V107,V109,V111,V113,V115,A1,A2,A3': 46,
 'S,V103,V105,V107,V109,V111,V113,V115,V113,A1': 45,
 'S,V103,V105,V107,V109,V111,V113,V115,V113,V115,A1': 33,
 'S,V115,A1': 32}

In [9]:
def create_json_file(final_patterns):
    """Build the nodes/links structure of a Sankey diagram from pattern counts.

    Each pattern is a comma-separated sequence of event names. Every distinct
    event name becomes one node, and every pair of consecutive events becomes
    a link from the first to the second. The value of a link is the sum of
    the counts of all patterns containing that transition (once per
    occurrence in the pattern).

    Args:
        final_patterns: Mapping of pattern string (e.g. ``'S,V103,A1'``) to
            its occurrence count.

    Returns:
        ``{"sankey": {"nodes": [...], "links": [...]}}`` where ``nodes`` is a
        list of ``{"name": <event>}`` in order of first appearance and
        ``links`` is a list of ``{"source": i, "target": j, "value": n}``
        with ``i``/``j`` indexing into ``nodes``.
    """
    nodes = []
    links = []
    node_index = {}  # event name -> position in ``nodes``
    link_index = {}  # (source, target) -> link dict stored in ``links``

    def index_of(name):
        """Return the node index for ``name``, registering it if it is new."""
        idx = node_index.get(name)
        if idx is None:
            idx = len(nodes)
            node_index[name] = idx
            nodes.append({"name": name})
        return idx

    for pattern, count in final_patterns.items():
        events = pattern.split(',')

        # Walk over consecutive (start, end) pairs; the start node is
        # registered before the end node so indices follow first appearance.
        for event_start, event_end in zip(events, events[1:]):
            source = index_of(event_start)
            target = index_of(event_end)

            link = link_index.get((source, target))
            if link is None:
                link = {"source": source, "target": target, "value": count}
                link_index[(source, target)] = link
                links.append(link)
            else:
                link['value'] += count

    sankey = OrderedDict()
    sankey['nodes'] = nodes
    sankey['links'] = links
    return {"sankey": sankey}

In [10]:
# Export the Sankey structure of the selected problem as JSON
output_path = 'sankey_data/dataset_pb_' + str(PROBLEM_ID) + '.json'
with open(output_path, 'w') as outfile:
    sankey_data = create_json_file(final_patterns)
    json.dump(sankey_data, outfile)