In [111]:
import csv
from collections import namedtuple

# Markdown texts that highlighted snippets are located in
# (get_indexes searches file_1 when a key's part is 1, file_2 otherwise).
file_1 = '../TIPs_middle_school_stimuli/content/womens_suffrage_1.md'
file_2 = '../TIPs_middle_school_stimuli/content/womens_suffrage_2.md'
# CSV of highlight records; its first row supplies the column names.
highlight_info = 'undergrad_student_participants/highlights.csv'

def get_lines(filename):
    """Read a text file and return its lines as a list.

    Each element keeps its trailing newline (if the line had one), so
    joining the list reproduces the file contents.

    Args:
        filename: path of the file to read (opened in text mode 'r').

    Returns:
        list of str, one entry per line in file order.
    """
    with open(filename, 'r') as handle:
        return handle.readlines()

# Read both texts once up front; get_indexes and write_files use these
# module-level line lists.
file_1_lines = get_lines(file_1)
file_2_lines = get_lines(file_2)

In [112]:
# Load every highlight record from the CSV into `data`.
# The first row is treated as the header: its column names become the fields
# of a namedtuple, so later cells can read x.Participant, x.Condition,
# x.Part and x.Highlight.
data = []

with open(highlight_info, 'r') as input_file:
    reader = csv.reader(input_file, delimiter=',', quotechar='"')
    row_class = None
    # An empty file yields no header; `data` then simply stays empty.
    header = next(reader, None)
    if header is not None:
        row_class = namedtuple('row_class', header)
        for row in reader:
            try:
                data.append(row_class(*row))
            except Exception:
                # Show the offending row (e.g. wrong number of columns), then
                # re-raise with a bare `raise` so the original traceback is
                # kept. Single-argument print(...) behaves the same on
                # Python 2 and Python 3.
                print(row)
                raise

In [113]:
def to_key(x):
    """Build the grouping key for one highlight record.

    Args:
        x: a record exposing ``Participant``, ``Condition`` and ``Part``
           attributes; ``Participant`` and ``Part`` must be convertible
           with ``int()``.

    Returns:
        tuple ``(participant, condition, part)`` with participant and part
        as ints and condition passed through unchanged.
    """
    participant = int(x.Participant)
    condition = x.Condition
    part = int(x.Part)
    return participant, condition, part

# Distinct (participant, condition, part) combinations present in the data.
keys = {to_key(record) for record in data}

In [114]:
# Group highlight snippets by (participant, condition, part).
# A single pass over `data` replaces the former rescan of the whole list for
# every key; each key's snippets stay in the order they appear in `data`.
data_dict = {}
for x in data:
    data_dict.setdefault(to_key(x), []).append(x.Highlight)

In [115]:
def get_indexes(key, snippets, lines_list=None):
    """Locate each snippet, in order, inside the text the key refers to.

    Snippets are matched sequentially: the search for snippet ``i + 1``
    resumes immediately after the end of the match for snippet ``i``, on the
    same line first and then on the following lines. Leading and trailing
    whitespace is stripped from every snippet before searching.

    Args:
        key: ``(participant, condition, part)`` tuple. ``key[2]`` selects the
            text when ``lines_list`` is not given (1 -> ``file_1_lines``,
            anything else -> ``file_2_lines``); the whole key is printed if a
            snippet cannot be found.
        snippets: list of highlighted strings, in document order.
        lines_list: optional list of lines to search instead of the
            module-level texts.

    Returns:
        list of ``(line_index, start, end)`` tuples, one per snippet, where
        ``start``/``end`` are character offsets within that line (``end``
        exclusive). An empty ``snippets`` list yields ``[]``.

    Raises:
        Exception: if some snippet is not found after the previous match.
            The key and the matches found so far are printed first.
    """
    if lines_list is None:
        lines_list = file_1_lines if key[2] == 1 else file_2_lines
    # Nothing to look for; previously this crashed on snippets[0].
    if not snippets:
        return []

    snippet_indexes = []
    snippet_index = 0
    snip = snippets[snippet_index].strip()
    for list_index, line in enumerate(lines_list):
        # `line` is progressively truncated to the unsearched remainder;
        # `offset` is how many characters of the original line were cut off.
        offset = 0
        find_snip = line.find(snip)
        while find_snip > -1:
            start = offset + find_snip
            end = start + len(snip)
            snippet_indexes.append((list_index, start, end))
            offset = end
            line = line[find_snip + len(snip):]
            # look for next snippet
            snippet_index += 1
            if snippet_index >= len(snippets):
                return snippet_indexes
            snip = snippets[snippet_index].strip()
            find_snip = line.find(snip)

    # Reaching this point means the lines ran out before every snippet was
    # matched (a complete match returns from inside the loop above).
    print(key)
    print(snippet_indexes)
    raise Exception("Couldn't find "+str(snip))

In [128]:
from collections import defaultdict

def build_indexes(data_dict):
    """Turn every participant's highlight spans into sorted per-line markers.

    Args:
        data_dict: mapping of ``(participant, condition, part)`` keys to the
            list of highlighted snippets for that key.

    Returns:
        dict with the keys 1 and 2 (the file/part number), each holding a
        ``defaultdict(list)`` that maps a line index to a sorted list of
        ``(position, (participant, condition, 'start' | 'end'))`` tuples.
        Sorting is plain tuple ordering, so at equal positions entries are
        ordered by participant, then condition, then 'end' before 'start'.
    """
    markers_by_file = {1: defaultdict(list), 2: defaultdict(list)}
    for key, snippets in data_dict.items():
        participant, condition, part = key[0], key[1], key[2]
        for line_no, begin, finish in get_indexes(key, snippets):
            markers_by_file[part][line_no].extend([
                (begin, (participant, condition, 'start')),
                (finish, (participant, condition, 'end')),
            ])
    for part in (1, 2):
        for markers in markers_by_file[part].values():
            markers.sort()
    return markers_by_file

# Build the per-file, per-line marker lists that write_files reads.
all_indexes = build_indexes(data_dict)

In [129]:
def write_files():
    """Write ``file_1.md`` and ``file_2.md`` with highlight tags spliced in.

    For each of the two texts, every line is copied to the output. Lines that
    have entries in the module-level ``all_indexes`` get a
    ``<highlight class="participant_<id> <condition>">`` tag inserted at each
    'start' position and a ``</highlight>`` tag at every other marker
    position, in the stored marker order. Reads the module-level
    ``file_1_lines``, ``file_2_lines`` and ``all_indexes``; returns nothing.
    """
    for file_index in (1, 2):
        source_lines = file_1_lines if file_index == 1 else file_2_lines
        out_name = 'file_'+str(file_index)+'.md'
        with open(out_name, 'w') as output_file:
            for line_index, line_text in enumerate(source_lines):
                # Lines without any markers are copied through untouched.
                if line_index not in all_indexes[file_index]:
                    output_file.write(line_text)
                    continue
                cursor = 0  # end of the text already written for this line
                for marker in all_indexes[file_index][line_index]:
                    position, who = marker[0], marker[1]
                    if who[2] == 'start':
                        tag = '<highlight class="participant_'+str(who[0])+' '+who[1]+'">'
                    else:
                        tag = '</highlight>'
                    output_file.write(line_text[cursor:position]+tag)
                    cursor = position
                # Remainder of the line after the last marker.
                output_file.write(line_text[cursor:])

# Emit file_1.md and file_2.md (paths are relative to the working directory).
write_files()

In [125]:
# Inspect the sorted (position, (participant, condition, 'start'|'end'))
# markers for line index 4 of file 1.
# NOTE(review): the output shown below carries execution count 125, which is
# older than the In [128] cell that builds all_indexes; it may be stale --
# re-run to confirm.
all_indexes[1][4]

[(6, (3, 'paper', 'end')),
 (6, (3, 'paper', 'start')),
 (6, (7, 'paper', 'end')),
 (6, (7, 'paper', 'start')),
 (49, (3, 'paper', 'end')),
 (49, (3, 'paper', 'start'))]

In [127]:
# Spot-check: (line_index, start, end) spans found for participant 3,
# condition 'paper', part 1.
key = (3, 'paper', 1)
get_indexes(key, data_dict[key])

[(4, 6, 41),
 (4, 49, 71),
 (6, 102, 115),
 (6, 147, 166),
 (6, 256, 295),
 (6, 301, 386),
 (8, 13, 44),
 (8, 65, 86),
 (8, 91, 123),
 (8, 147, 156),
 (8, 199, 245),
 (8, 251, 265),
 (8, 280, 350),
 (8, 363, 418),
 (8, 443, 484),
 (8, 519, 541),
 (10, 56, 104),
 (10, 110, 127),
 (10, 145, 149),
 (10, 151, 199),
 (10, 205, 245),
 (18, 52, 56),
 (18, 62, 87),
 (18, 136, 158),
 (18, 202, 260),
 (18, 332, 362),
 (18, 378, 396),
 (18, 440, 533),
 (20, 38, 86),
 (20, 227, 294),
 (20, 334, 389)]

In [130]:
from collections import Counter

# Number of highlight records per participant (IDs are the raw CSV strings).
Counter(record.Participant for record in data)

Counter({'1': 8, '2': 9, '3': 52, '4': 11, '6': 19, '7': 41, '8': 16})