# Bug in path processing
## Breaking changes in upstream analysis
- Index out of range when processing CSV file
- Using notebook to work out where the problem is coming from

In [1]:
%matplotlib inline
#from pylab import rcParams
#rcParams['figure.figsize'] = 12, 8

%load_ext autoreload
%autoreload 2

%load_ext version_information
%version_information numpy, scipy, matplotlib, pandas

Software,Version
Python,2.7.13 64bit [GCC 4.2.1 Compatible Apple LLVM 8.1.0 (clang-802.0.42)]
IPython,5.4.1
OS,Darwin 16.7.0 x86_64 i386 64bit
numpy,1.13.1
scipy,0.19.1
matplotlib,2.0.2
pandas,0.20.3
Fri Aug 18 11:45:20 2017 AEST,Fri Aug 18 11:45:20 2017 AEST


In [2]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import cv2
import pandas as pd
import uuid

import os
import sys

sys.path.append("../src")

np.random.seed(0)

def view(image):
    plt.figure()
    plt.axis('off')
    plt.imshow(image, cmap = cm.Greys_r)

In [5]:
df = pd.read_json('t.json')
#df = df.sort_values('frame_nums', ascending=True)
df.head(n=10)

CPU times: user 10.9 s, sys: 1.08 s, total: 12 s
Wall time: 12 s


In [7]:
type(df['flattened_28x28_tag_matrices'][0])

list

In [8]:
tag_class_names = {0: 'CircleLine', 1: 'Leaf', 2: 'Note1', 3: 'Unknown', 4: 'DD', 5: 'Peace', 6: 'Question', 7: 'Pillars', 8: 'HH', 9: 'Ampersand', 10: 'PP', 11: 'Hash', 12: 'Power', 13: 'Ankh', 14: 'TT', 15: 'Trident', 16: 'Asterisk', 17: '4', 18: 'Lines3', 19: '1', 20: '0', 21: '3', 22: 'Plane', 23: '5', 24: 'CircleHalf', 25: '7', 26: 'Sun', 27: '8', 28: 'Omega', 29: 'ArrowHollow', 30: 'AA', 31: 'Note2', 32: 'Radioactive', 33: 'EE', 34: 'UU', 35: '6', 36: 'Plant', 37: 'GG', 38: 'XX', 39: 'ZZ', 40: 'Necklace', 41: 'Umbrella', 42: 'Triangle', 43: 'Dot', 44: 'a', 45: 'Heart', 46: 'e', 47: 'RR', 48: 'KK', 49: 'h', 50: 'Queen', 51: 'Tadpole', 52: 'n', 53: 'MM', 54: '2', 55: 'r', 56: 'ArrowLine', 57: 'y', 58: 'Scissors', 59: 'CircleCross'}

unknown_class_num = None
for tag_class in tag_class_names:
    if tag_class_names[tag_class] == 'Unknown':
        unknown_class_num = tag_class

UNKNOWN_CLASS = unknown_class_num
MIXED_CLASS = 'mixed'
GAP_CLASS = 'gap'
MAX_FRAME_GAP_BETWEEN_PATHS = 10
NUM_GROUP_CLASSIFICATIONS = 40
MIN_NUM_CLASSIFIED_GROUP = 10
NUM_GROUPS_IN_SECTION = 5
CLASS_CONF_THRESH = 0.6
NUM_UNKNOWNS_IN_PATH_THRESHOLD = 0

In [11]:
def increment_dict_key_value(class_dict, classification, num_increment=1):
    if classification in class_dict.keys():
        class_dict[classification] += num_increment
    else:
        class_dict[classification] = num_increment
    return class_dict

In [100]:
class BeeData:

    bee_id = 0

    def __init__(self, classification):

        self.start_frame_num_all_paths = []
        self.list_of_all_x_paths = []
        self.list_of_all_y_paths = []

        self.consensus_grouped_classifications = []
        self.num_classifications_in_group = 0

        self.class_counts_path = {UNKNOWN_CLASS: 0, GAP_CLASS: 0}
        self.add_classification(classification)

    def get_most_freq_class_pred(self, section_classifications):
        num_classifications = len(section_classifications)
        num_mixed = section_classifications.count(MIXED_CLASS)
        num_unknown = section_classifications.count(UNKNOWN_CLASS)
        num_mixed_unknown = num_mixed + num_unknown
        num_known = num_classifications - num_mixed_unknown

        if num_classifications < 2:
            return UNKNOWN_CLASS
        elif num_mixed > 1:
            return MIXED_CLASS
        elif num_mixed_unknown > 2:
            return UNKNOWN_CLASS
        elif num_known <= num_mixed_unknown:
            return UNKNOWN_CLASS
        else:
            section_count_dict = {MIXED_CLASS: 0, UNKNOWN_CLASS: 0}
            for classification in section_classifications:
                section_count_dict = increment_dict_key_value(section_count_dict, classification)

            del section_count_dict[MIXED_CLASS]
            del section_count_dict[UNKNOWN_CLASS]

            percent_section_count_dict = {c: float(section_count_dict[c]) / num_known for c in section_count_dict}
            most_freq_class_pred = max(section_count_dict, key=section_count_dict.get)

            if percent_section_count_dict[most_freq_class_pred] > 0.5:
                return most_freq_class_pred
            else:
                return MIXED_CLASS

    def add_classification(self, classification):
        self.num_classifications_in_group += 1
        self.class_counts_path = increment_dict_key_value(self.class_counts_path, classification)

        if self.num_classifications_in_group == NUM_GROUP_CLASSIFICATIONS:
            self.identify_freq_class_path_group()
            self.num_classifications_in_group = 0

    def identify_freq_class_path_group(self):
        num_unknown_gaps_classified = self.class_counts_path[UNKNOWN_CLASS] + self.class_counts_path[GAP_CLASS]
        num_remaining_classified = self.num_classifications_in_group - num_unknown_gaps_classified
        if self.num_classifications_in_group < MIN_NUM_CLASSIFIED_GROUP or num_remaining_classified < MIN_NUM_CLASSIFIED_GROUP:
            self.consensus_grouped_classifications.append(UNKNOWN_CLASS)
        else:
            del self.class_counts_path[UNKNOWN_CLASS]
            del self.class_counts_path[GAP_CLASS]
            percent_class_counts_path = {c: float(self.class_counts_path[c]) / num_remaining_classified for c in self.class_counts_path}
            most_freq_class_pred = max(percent_class_counts_path, key=percent_class_counts_path.get)
            if percent_class_counts_path[most_freq_class_pred] > CLASS_CONF_THRESH:
                self.consensus_grouped_classifications.append(most_freq_class_pred)
            else:
                self.consensus_grouped_classifications.append(MIXED_CLASS)

        self.current_group_tags = []
        self.class_counts_path = {UNKNOWN_CLASS: 0, GAP_CLASS: 0}

    def identify_uncertain_region(self, list_classifications):
        is_uncertain_region = False
        num_mixed = list_classifications.count(MIXED_CLASS)
        num_unknown = list_classifications.count(UNKNOWN_CLASS)
        if num_mixed > 1 or num_unknown > 3 or num_mixed + num_unknown > 3:
            is_uncertain_region = True
        return is_uncertain_region


    def get_index_prev_class(self, current_path_index, current_section_pred, classifications_list):
        while True:
            if current_section_pred != classifications_list[current_path_index]:
                return current_path_index
            else:
                current_path_index -= 1

    def merge_group_classifications_into_sections(self):
        num_grouped_classifications = len(self.consensus_grouped_classifications)
        num_concurrent_mixed = 0
        num_concurrent_unknown = 0
        is_unknown_path_section = False
        self.class_path_end_index = []
        self.classes_in_path = []
        prev_classification = None
        
        #print(num_grouped_classifications, 'a1')

        for i in range(0, num_grouped_classifications, NUM_GROUPS_IN_SECTION):
            current_section = self.consensus_grouped_classifications[i:i+NUM_GROUPS_IN_SECTION]
            current_section_pred = self.get_most_freq_class_pred(current_section)

            if current_section_pred == MIXED_CLASS:
                num_concurrent_mixed += 1
            elif current_section_pred == UNKNOWN_CLASS:
                num_concurrent_unknown += 1
            else:
                num_concurrent_mixed_unknown = num_concurrent_mixed + num_concurrent_unknown
                current_prev_pred_same = current_section_pred == prev_classification
                
                #print(num_concurrent_mixed_unknown, current_prev_pred_same, prev_classification)

                if current_prev_pred_same:
                    if num_concurrent_mixed > 2 or num_concurrent_mixed_unknown > 6:
                        is_unknown_path_section = True
                else:
                    if num_concurrent_mixed > 1 or num_concurrent_mixed_unknown > 3:
                        is_unknown_path_section = True

                if is_unknown_path_section and prev_classification is None:
                    prev_classification = UNKNOWN_CLASS

                if prev_classification is None:
                    prev_classification = current_section_pred
                    current_prev_pred_same = True

                if not current_prev_pred_same or is_unknown_path_section:
                    # covers case where there's gap or new class
                    prev_class_index = self.get_index_prev_class(i,  current_section_pred, self.consensus_grouped_classifications)
                    self.class_path_end_index.append(prev_class_index * NUM_GROUP_CLASSIFICATIONS)
                    if is_unknown_path_section:
                        self.classes_in_path.append(UNKNOWN_CLASS)
                    else:
                        self.classes_in_path.append(prev_classification)

                prev_classification = current_section_pred
                num_concurrent_mixed = 0
                num_concurrent_unknown = 0
                is_unknown_path_section = False
        
        #print(num_concurrent_mixed_unknown, current_prev_pred_same, prev_classification)
        
        num_concurrent_mixed_unknown = num_concurrent_mixed + num_concurrent_unknown
        if prev_classification is None:
            self.classes_in_path.append(UNKNOWN_CLASS)
            self.class_path_end_index.append(num_grouped_classifications * NUM_GROUP_CLASSIFICATIONS)
        elif num_concurrent_mixed_unknown == 0 or num_concurrent_mixed < 3 and num_concurrent_mixed_unknown < 7:
            #print('abc', self.classes_in_path, self.class_path_end_index)
            self.classes_in_path.append(prev_classification)
            self.class_path_end_index.append(num_grouped_classifications * NUM_GROUP_CLASSIFICATIONS)
            #print('abc', self.classes_in_path, self.class_path_end_index)
        else:
            final_section = self.consensus_grouped_classifications[num_grouped_classifications-(num_concurrent_mixed_unknown*NUM_GROUPS_IN_SECTION):num_grouped_classifications]
            current_section_pred = self.get_most_freq_class_pred(final_section)
            prev_class_index = self.get_index_prev_class(num_grouped_classifications-1,  current_section_pred, self.consensus_grouped_classifications)
            self.class_path_end_index.append(prev_class_index * NUM_GROUP_CLASSIFICATIONS)
            self.classes_in_path.append(prev_classification)

            self.class_path_end_index.append(num_grouped_classifications * NUM_GROUP_CLASSIFICATIONS)
            self.classes_in_path.append(UNKNOWN_CLASS)
            
        #print(self.class_path_end_index, self.classes_in_path)

    def gen_separate_tag_class_bees(self):
        num_coords = 0
        path_class_index = 0
        bee_paths_index_starts = 0
        bee_tags_in_path = []

        class_x_path_coords = None
        class_y_path_coords = None
        class_path_start_frame_num = None

        list_class_x_path_coords = []
        list_class_y_path_coords = []
        list_class_path_start_frame_nums = []
        
        #print(len(self.list_of_all_x_paths), len(self.list_of_all_y_paths), len(self.class_path_end_index))
       
        #print(sum( [len(x) for x in self.list_of_all_x_paths]))

        for paths_index in range(len(self.list_of_all_x_paths)):
            class_x_path_coords = self.list_of_all_x_paths[paths_index]
            class_y_path_coords = self.list_of_all_y_paths[paths_index]
            class_path_start_frame_num = self.start_frame_num_all_paths[paths_index]
            for coord_index in range(len(self.list_of_all_x_paths[paths_index])):
                if num_coords > self.class_path_end_index[path_class_index]:
                    #print(num_coords)
                    #print(len(self.list_of_all_x_paths))
                    list_class_x_path_coords.append(class_x_path_coords[:coord_index])
                    list_class_y_path_coords.append(class_y_path_coords[:coord_index])
                    list_class_path_start_frame_nums.append(class_path_start_frame_num)
                    bee_tag_data = {'bee_id': self.bee_id, 'tag_class': self.classes_in_path[path_class_index], 'x_paths': list_class_x_path_coords, 'y_paths': list_class_y_path_coords, 'start_frame_nums': list_class_path_start_frame_nums}
                    self.bee_id += 1
                    bee_tags_in_path.append(bee_tag_data)

                    class_path_start_frame_num += coord_index
                    class_x_path_coords = class_x_path_coords[coord_index:]
                    class_y_path_coords = class_y_path_coords[coord_index:]
                    list_class_x_path_coords = []
                    list_class_y_path_coords = []
                    list_class_path_start_frame_nums = []

                    path_class_index += 1

                num_coords += 1

            list_class_x_path_coords.append(class_x_path_coords)
            list_class_y_path_coords.append(class_y_path_coords)
            list_class_path_start_frame_nums.append(class_path_start_frame_num)

        bee_tag_data = {'bee_id': self.bee_id, 'tag_class': self.classes_in_path[path_class_index], 'x_paths': list_class_x_path_coords, 'y_paths': list_class_y_path_coords, 'start_frame_nums': list_class_path_start_frame_nums}
        self.bee_id += 1
        bee_tags_in_path.append(bee_tag_data)

        return bee_tags_in_path


In [104]:
class ProcessPaths:

    def __init__(self):
        pass

    def gen_gap_coords(self, x1, y1, x2, y2, difference_prev_frame):
        x_diff_per_frame = (x2 - x1) / float(difference_prev_frame)
        y_diff_per_frame = (y2 - y1) / float(difference_prev_frame)

        gap_coords = {'x': [], 'y': []}
        for gap in range(1, difference_prev_frame + 1):
            x_gap_coord = x2 - x_diff_per_frame * gap
            y_gap_coord = y2 - y_diff_per_frame * gap
            gap_coords['x'].append(x_gap_coord)
            gap_coords['y'].append(y_gap_coord)

        return gap_coords

    def get_class_num_frames_tracked(self, bee_tags_in_path):
        path_class_frames_tracked_list = []
        for bee in bee_tags_in_path:
            tag_class = bee['tag_class']
            num_frames_tracked = 0
            for path_index in range(len(bee['x_paths'])):
                num_frames_tracked += len(bee['x_paths'][path_index])
            path_class_frames_tracked_list.append({'tag_class': tag_class, 'num_frames_tracked': num_frames_tracked})

            return path_class_frames_tracked_list

    def process_paths(self, bee_df):
        x_list = bee_df['x'].tolist()
        y_list = bee_df['y'].tolist()
        frame_nums_list = bee_df['frame_nums'].tolist()
        classifications_list = bee_df['classifications'].tolist()

        bee_data = BeeData(classifications_list[0])
        path_frame_num_start = frame_nums_list[0]
        path_frame_num_end = frame_nums_list[0]
        x_path = [x_list[0]]
        y_path = [y_list[0]]
    
        print(len(x_list))
        for i in range(1, len(x_list)):
            difference_prev_frame = frame_nums_list[i] - path_frame_num_end

            if difference_prev_frame == 1:
                path_frame_num_end = frame_nums_list[i]
                x_path.append(x_list[i])
                y_path.append(y_list[i])
                bee_data.add_classification(classifications_list[i])

            elif difference_prev_frame < MAX_FRAME_GAP_BETWEEN_PATHS:
                path_frame_num_end = frame_nums_list[i]
                generated_coord_gaps = self.gen_gap_coords(x_list[i], y_list[i], x_list[i-1], y_list[i-1], difference_prev_frame)
                fill_path_classifications_gap = [GAP_CLASS] * len(generated_coord_gaps)
                fill_path_classifications_gap[-1] = classifications_list[i]
                x_path.extend(generated_coord_gaps['x'])
                y_path.extend(generated_coord_gaps['y'])

                for gap_classification in fill_path_classifications_gap:
                    bee_data.add_classification(gap_classification)
            else:
                bee_data.list_of_all_x_paths.append(x_path)
                bee_data.list_of_all_y_paths.append(y_path)
                bee_data.start_frame_num_all_paths.append(path_frame_num_start)
                x_path = [x_list[i]]
                y_path = [y_list[i]]
                path_frame_num_start = frame_nums_list[i]
                path_frame_num_end = frame_nums_list[i]

                bee_data.add_classification(classifications_list[i])

        if len(x_path) > 0:
            #print(len(x_path))
            bee_data.list_of_all_x_paths.append(x_path)
            bee_data.list_of_all_y_paths.append(y_path)
            bee_data.start_frame_num_all_paths.append(path_frame_num_start)
            bee_data.identify_freq_class_path_group()
            
        bee_data.merge_group_classifications_into_sections()
        bees_identified_by_tag = bee_data.gen_separate_tag_class_bees()

        return bees_identified_by_tag

In [105]:
grouped_bee_id = df.groupby('bee_id')
df_bee_id = grouped_bee_id.get_group(5)
pp = ProcessPaths()
bee_paths_list_broken_up_by_class = pp.process_paths(df_bee_id)
bee_path_classes_num_frames_tracked = pp.get_class_num_frames_tracked(bee_paths_list_broken_up_by_class)

442
(12, 'a1')
(0, False, None)
(0, True, 54)
(0, True, 54)
('abc', [], [])
('abc', [54], [480])
522


IndexError: list index out of range

In [35]:
grouped_bee_id = df.groupby('bee_id')
for bee_id, df_bee_id in grouped_bee_id:
    print(bee_id)
    pp = ProcessPaths()
    bee_paths_list_broken_up_by_class = pp.process_paths(df_bee_id)
    bee_path_classes_num_frames_tracked = pp.get_class_num_frames_tracked(bee_paths_list_broken_up_by_class)

0
1
2
3
4
5


IndexError: list index out of range