In [1]:
import numpy as np
import json
import pandas as pd
from matplotlib import pyplot as plt
import os

In [2]:
from sensemakingspace.story import Story
from sensemakingspace.stories import Stories

In [3]:
# Load the extracted outlier-task provenance data.
# JSON text is UTF-8; pinning the encoding keeps decoding independent of the
# platform's locale default.
with open('trrack-data/outlier_extracted.json', 'r', encoding='utf-8') as f:
    out_data = json.load(f)

In [4]:
# Load the extracted cluster-task provenance data.
# JSON text is UTF-8; pinning the encoding keeps decoding independent of the
# platform's locale default.
with open('trrack-data/cluster_extracted.json', 'r', encoding='utf-8') as f:
    other_data = json.load(f)

Outlier Dict

In [5]:
# Outlier tasks: task key (as used under each user's 'tasks') -> dataset name.
# Insertion order is kept because later cells iterate over this mapping.
task_dict_out = dict([
    # easy
    ('23', 'out_easy_task_1'),
    ('24', 'out_easy_task_2'),
    ('25', 'out_easy_task_3'),
    ('26', 'out_easy_task_4'),
    ('27', 'out_easy_task_5'),
    # medium
    ('29', 'out_med_task_1'),
    ('30', 'out_med_task_2'),
    ('31', 'out_med_task_3'),
    ('32', 'out_med_task_4'),
    # hard (including the one training task)
    ('34', 'out_hard_training_1'),
    ('35', 'out_hard_task_1'),
    ('36', 'out_hard_task_2'),
    ('37', 'out_hard_task_3'),
    ('38', 'out_hard_task_4'),
])

Cluster Dict

In [6]:
# Cluster tasks: task key (as used under each user's 'tasks') -> dataset name.
# Insertion order is kept because later cells iterate over this mapping.
# Key '3' ('cluster_med_training_1') is currently disabled and therefore unmapped.
task_dict_cluster = dict([
    ('1', 'cluster_easy_task_1'),
    ('2', 'cluster_easy_task_2'),
    ('0', 'cluster_easy_training_1'),
    ('7', 'cluster_hard_task_1'),
    ('8', 'cluster_hard_task_2'),
    ('6', 'cluster_hard_training_1'),
    ('4', 'cluster_med_task_1'),
    ('5', 'cluster_med_task_2'),
])

In [7]:
def ground_truth_story(dataset):
    """Build the ground-truth ``Story`` for one dataset.

    The dataset name is split on underscores: the first part selects the task
    type (``'cluster'`` or ``'out'``), the second is the difficulty
    (``'med'`` is expanded to ``'medium'``), and the third tells whether this
    is a training dataset (``'training'``).

    The ground-truth selection is the list of row indices flagged in the last
    column of the dataset's CSV file:

    * cluster: ``datasets/{dataset}_ground.csv``, rows where the column is True
    * out:     ``datasets/{dataset}.csv``, rows where the column is ``'Out'``

    Parameters
    ----------
    dataset : str
        Dataset name, e.g. ``'out_easy_task_1'``.

    Returns
    -------
    Story
        A story with a single selection step (timestamp 0), accuracy 1.0 and
        ``is_gt`` set to True.

    Raises
    ------
    ValueError
        If the name has fewer than three underscore-separated parts or its
        task-type prefix is neither ``'cluster'`` nor ``'out'``.
    """
    name_parts = dataset.split('_')
    if len(name_parts) < 3:
        raise ValueError(
            f"dataset name {dataset!r} must look like "
            "'<type>_<difficulty>_<task|training>_<n>'"
        )
    task_type, difficulty, phase = name_parts[:3]

    # Validate before touching the file system so a bad name fails with a
    # clear message instead of an unbound `indices` further down.
    if task_type == 'cluster':
        csv_path = f'datasets/{dataset}_ground.csv'
        truth_marker = True
    elif task_type == 'out':
        csv_path = f'datasets/{dataset}.csv'
        truth_marker = 'Out'
    else:
        raise ValueError(
            f"unknown task type {task_type!r} in dataset name {dataset!r}; "
            "expected 'cluster' or 'out'"
        )

    df = pd.read_csv(csv_path)
    label = df.columns[-1]
    # Element-wise comparison against the marker value (True or 'Out').
    indices = df.index[df[label] == truth_marker].to_list()

    if difficulty == 'med':
        difficulty = 'medium'

    story = Story(
        f'{dataset}_ground_truth',
        dataset,
        {
            'accuracy': 1.,
            'dataset': dataset,
            'difficulty': difficulty,
            'training': phase == 'training',
            'supported': False,
            'autoCompleteUsed': False,
            'rankOfPredictionUsed': -1,
            'selectionSequence': [
                {
                    'timestamp': 0,
                    'selection': indices,
                    'turnedPrediction': None
                }
            ]
        }
    )
    story.is_gt = True
    return story

All Stories

In [8]:
# One Story per recorded (user, task) pair in the outlier data ...
outlier_stories = [
    Story(participant['id'], task_dict_out[task_key], task_record)
    for participant in out_data
    for task_key, task_record in participant['tasks'].items()
]
# ... followed by one ground-truth Story per mapped outlier dataset.
outlier_stories.extend(
    ground_truth_story(dataset_name) for dataset_name in task_dict_out.values()
)

# Wrap, project and export the whole outlier collection.
outlier_stories = Stories(outlier_stories)
outlier_stories.project(verbose=False, delete_duplicates=True)
outlier_stories.export_csv('pse-csv/trrack_stories_all-outlier.csv')

  "`pynndescent` has recently changed which distance metrics are supported, "
  self._set_arrayXarray(i, j, x)


In [9]:
# One Story per recorded (user, task) pair in the cluster data ...
cluster_stories = [
    Story(participant['id'], task_dict_cluster[task_key], task_record)
    for participant in other_data
    for task_key, task_record in participant['tasks'].items()
]
# ... followed by one ground-truth Story per mapped cluster dataset.
cluster_stories.extend(
    ground_truth_story(dataset_name) for dataset_name in task_dict_cluster.values()
)

# Wrap, project and export the whole cluster collection.
cluster_stories = Stories(cluster_stories)
cluster_stories.project(verbose=False, delete_duplicates=True)
cluster_stories.export_csv('pse-csv/trrack_stories_all-cluster.csv')

  "`pynndescent` has recently changed which distance metrics are supported, "
  self._set_arrayXarray(i, j, x)


In [10]:
# Outlier and cluster stories together in one collection, projected jointly
# and written to a single CSV.
all_stories = Stories(
    outlier_stories.stories + cluster_stories.stories
)
all_stories.project(verbose=False, delete_duplicates=True)
all_stories.export_csv('pse-csv/trrack_stories_all.csv')

  "`pynndescent` has recently changed which distance metrics are supported, "
  self._set_arrayXarray(i, j, x)


Single Sessions

In [11]:
# One projection + CSV per cluster task: every user session recorded for that
# task, with the task's ground-truth story appended last.
for task_num in task_dict_cluster.keys():
    single_task_stories = [
        Story(participant['id'], task_dict_cluster[task_num], participant['tasks'][task_num])
        for participant in other_data
        if task_num in participant['tasks']
    ]
    single_task_stories.append(ground_truth_story(task_dict_cluster[task_num]))
    single_task_stories = Stories(single_task_stories)

    single_task_stories.project(verbose=False, delete_duplicates=True)
    single_task_stories.export_csv(
        'pse-csv/trrack_stories_task-cluster-{}.csv'.format(task_num)
    )

In [12]:
# One projection + CSV per outlier task: every user session recorded for that
# task, with the task's ground-truth story appended last.
for task_num in task_dict_out.keys():
    single_task_stories = [
        Story(participant['id'], task_dict_out[task_num], participant['tasks'][task_num])
        for participant in out_data
        if task_num in participant['tasks']
    ]
    single_task_stories.append(ground_truth_story(task_dict_out[task_num]))
    single_task_stories = Stories(single_task_stories)

    single_task_stories.project(verbose=False, delete_duplicates=True)
    single_task_stories.export_csv(
        'pse-csv/trrack_stories_task-outlier-{}.csv'.format(task_num)
    )

Single User Session for single task

In [15]:
# Session to export. Alternative selections kept for reference:
#   task_num = '8',  user_id = '5d6927928a415c00194dfb6f', task_type = 'cluster'
#   task_num = '26', user_id = '5bbc3d95f1f9ba000141d855', task_type = 'outlier'
task_type = 'outlier'
task_num = '32'
user_id = '5d02ed8f7a3c0f0015cd3230'

# Any task type other than 'outlier' reads from the cluster data; the mapping
# lookup below only accepts the two known task types.
if task_type == 'outlier':
    datasubset = out_data
else:
    datasubset = other_data
task_dict = {'cluster': task_dict_cluster, 'outlier': task_dict_out}[task_type]

# The selected user's session for the task (if recorded), then the ground truth.
single_user_story = [
    Story(participant['id'], task_dict[task_num], participant['tasks'][task_num])
    for participant in datasubset
    if participant['id'] == user_id and task_num in participant['tasks']
]
single_user_story.append(ground_truth_story(task_dict[task_num]))

single_user_story = Stories(single_user_story)
single_user_story.project(verbose=False, delete_duplicates=True)
single_user_story.export_csv(
    'pse-csv/trrack_stories_task-{task_type}-{task_num}-{user_id}.csv'.format(
        task_type=task_type, task_num=task_num, user_id=user_id
    )
)

Perplexity value 30 is too high. Using perplexity 1.67 instead
