In [2]:
from final_process.database_util import DatabaseConnection
from final_process.book_char_dataset import BasicBookCharDataset

In [3]:
# Create a DatabaseConnection with the identifier 'lcdata-wayback' and load
# the book/character dataset through it. Later cells read
# `dataset.book_lookup` and `dataset.char_lookup`.
db_conn = DatabaseConnection('lcdata-wayback')
dataset = BasicBookCharDataset.load_from_database(db_conn)

In [4]:
from final_process.key_translator import KeyTranslator

In [5]:
# Build the key translator from four JSON mapping files under static/:
# new->old and old->new book keys, then new->old and old->new character keys.
# Later cells call `kt.to_new_book_key`, `kt.to_old_book_key` and
# `kt.to_old_char_key`.
kt = KeyTranslator.load_from_json_files(
    'static/new_book_key_to_old_book_key_mapping.json',
    'static/old_book_key_to_new_book_key_mapping.json',
    'static/new_char_key_to_old_char_key_mapping.json',
    'static/old_char_key_to_new_char_key_mapping.json',
)

In [6]:
from final_process.common_util import read_jsonl

# Read the v04 JSONL records; the cells below treat these as the "old" data
# and index each record by 'book_title', 'source' and 'character_name'.
# NOTE(review): presumably read_jsonl returns a re-iterable sequence (e.g. a
# list), since `final_data` is iterated several times below — confirm.
final_data_filename = 'data/v04/masked_char_data_filtered_by_name_and_description.jsonl'
final_data = read_jsonl(final_data_filename)

In [7]:
# Distinct (book_title, source) pairs found in the old records.
old_book_keys = list({
    (row['book_title'], row['source'])
    for row in final_data
})
# (book_title, source) -> record. Several records can share a book key;
# the last one seen wins.
old_book_lookup = dict(
    ((row['book_title'], row['source']), row)
    for row in final_data
)

# Distinct (book_title, source, character_name) triples found in the old records.
old_char_keys = list({
    (row['book_title'], row['source'], row['character_name'])
    for row in final_data
})
# (book_title, source, character_name) -> record; the last record seen wins.
old_char_lookup = dict(
    ((row['book_title'], row['source'], row['character_name']), row)
    for row in final_data
)

In [8]:
# Forward check: every old book key, once translated with
# `kt.to_new_book_key`, must be a key of `dataset.book_lookup`.
# The loop variables remain bound as notebook globals after this cell runs.
for old_book_key in old_book_keys:
    new_book_key = kt.to_new_book_key(old_book_key)
    assert(new_book_key in dataset.book_lookup) 

# Reverse check: every key of `dataset.book_lookup`, once translated with
# `kt.to_old_book_key`, must be one of the old book keys.
# `old_book_keys` is a list, so each membership test is a linear scan.
for new_book_key in dataset.book_lookup.keys():
    old_book_key = kt.to_old_book_key(new_book_key)
    assert(old_book_key in old_book_keys)

In [9]:
# Every new character key that translates to an old character key must
# refer to a character present in the old data. New keys for which
# `kt.to_old_char_key` returns None have no old counterpart and are skipped.
for new_char_key in dataset.char_lookup.keys():
    old_char_key = kt.to_old_char_key(new_char_key)
    if old_char_key is None:
        continue
    # Validate the key computed in this iteration. Testing `old_book_key`
    # here would only re-check a value left bound by an unrelated loop and
    # would never look at the character key at all.
    # `old_char_lookup` is keyed by the same triples collected in
    # `old_char_keys`, and as a dict it gives constant-time membership.
    assert old_char_key in old_char_lookup, old_char_key

In [10]:
# Short aliases for the new dataset's lookups.
new_char_lookup = dataset.char_lookup
new_book_lookup = dataset.book_lookup

# Pair each new character that has an old counterpart with its old record.
# Entries are keyed by the old character key and carry both descriptions.
char_lookup = {}
for new_key, new_info in new_char_lookup.items():
    old_key = kt.to_old_char_key(new_key)
    if old_key is not None:
        old_info = old_char_lookup[old_key]
        char_lookup[old_key] = dict(
            book_title=old_info['book_title'],
            source=old_info['source'],
            character_name=old_info['character_name'],
            old_description=old_info['description'],
            new_description=new_info.description,
        )
char_infos = [*char_lookup.values()]

# Same pairing for books: entries are keyed by the old book key and carry
# both summaries.
book_lookup = {}
for new_key, new_info in new_book_lookup.items():
    old_key = kt.to_old_book_key(new_key)
    if old_key is not None:
        old_info = old_book_lookup[old_key]
        book_lookup[old_key] = dict(
            book_title=old_info['book_title'],
            source=old_info['source'],
            old_summary=old_info['summary'],
            new_summary=new_info.summary,
        )
book_infos = [*book_lookup.values()]

In [11]:
def pre_clean_description(description, char_key):
    """Apply the hand-written clean-ups to a description before diffing.

    Steps, in order:
      1. Strip every trailing ' Read an' fragment (repeated if it occurs
         more than once at the end).
      2. Replace 'M&amp;Ms; or peanut' with 'M&amp;Ms or peanut'.
      3. For the ('Walden', 'shmoop', 'Thoreau') character only, rotate the
         text so the first 245 characters move to the end.

    Args:
        description: the description text to clean.
        char_key: (book_title, source, character_name) tuple of the
            character the description belongs to.

    Returns:
        The cleaned description string.
    """
    dangling_tail = ' Read an'
    cleaned = description
    while cleaned.endswith(dangling_tail):
        cleaned = cleaned[:-len(dangling_tail)]

    cleaned = cleaned.replace('M&amp;Ms; or peanut', 'M&amp;Ms or peanut')

    if char_key == ('Walden', 'shmoop', 'Thoreau'):
        # Move the leading 245 characters behind the rest of the text.
        pivot = 245
        head, rest = cleaned[:pivot], cleaned[pivot:]
        cleaned = rest + head

    return cleaned

In [12]:
from final_process.text_diff_tool import TextDiffTool
from IPython.display import clear_output

# Interactive review of every paired character description.
# For each character the pre-cleaned new description is diffed against the
# old one. When more than one change survives the filter below, the pair is
# printed and the loop blocks on input() until the reviewer presses Enter.
# Every pair must also satisfy: repaired new description == old description.
count = 0
source_counter = dict.fromkeys(
    ('cliffnotes', 'sparknotes', 'shmoop', 'litcharts'),
    0,
)
for char_info in char_infos:
    char_key = tuple(
        char_info[field]
        for field in ('book_title', 'source', 'character_name')
    )
    text1 = pre_clean_description(char_info['new_description'], char_key)
    text2 = char_info['old_description']
    text_diff = TextDiffTool.diff_text(text1, text2)
    # NOTE(review): `repair_text` is neither defined nor imported in the
    # visible cells; it must already exist in the kernel for this to run.
    repaired_text1 = repair_text(text1, text_diff)
    forward_changes = text_diff.get_forward_changes()
    # Ignore changes whose first element is a single space and whose second
    # element is the empty string (presumably a dropped space — confirm).
    changes = [
        change
        for change in text_diff.get_changes()
        if not (change[0] == ' ' and change[1] == '')
    ]
    if len(changes) > 1:
        count += 1
        source_counter[char_info['source']] += 1

        clear_output(wait=True)
        print(
            char_info['book_title'],
            char_info['source'],
            char_info['character_name'],
            sep='\n',
        )
        print(text1, text2, changes, sep='\n\n')
        input()
    assert repaired_text1 == text2
print(source_counter)

{'cliffnotes': 0, 'sparknotes': 0, 'shmoop': 0, 'litcharts': 0}


In [22]:
from final_process.text_diff_tool import TextDiffTool
from IPython.display import clear_output

# Collect, per character, the forward changes that turn the pre-cleaned new
# description into the old description. Only non-empty change lists are
# stored, keyed by the stringified character key.
text_diffs = {}
for char_info in char_infos:
    char_key = tuple(
        char_info[field]
        for field in ('book_title', 'source', 'character_name')
    )
    text1 = pre_clean_description(char_info['new_description'], char_key)
    text2 = char_info['old_description']
    text_diff = TextDiffTool.diff_text(text1, text2)
    changes = text_diff.get_forward_changes()
    # Ignore changes whose span (change[0][1] - change[0][0]) is exactly 1
    # and whose replacement is empty; at most one other change is allowed.
    filtered_changes = [
        change
        for change in changes
        if change[0][1] - change[0][0] != 1 or change[1] != ''
    ]
    assert len(filtered_changes) < 2
    if len(changes) > 0:
        text_diffs[str(char_key)] = changes

In [24]:
from final_process.text_diff_tool import TextDiffTool
from IPython.display import clear_output

# Diff each book's new summary against its old summary and keep the
# non-empty forward-change lists, keyed by the stringified
# (book_title, source) pair of that book.
# NOTE: this rebinds `text_diffs`, replacing whatever was stored under that
# name before this cell ran.
text_diffs = {}
for book_info in book_infos:
    # Key each result by the book being processed. Using `char_key` here
    # would reuse a value left bound by a character loop, so every book
    # would overwrite the same, unrelated entry.
    book_key = (book_info['book_title'], book_info['source'])
    text1 = book_info['new_summary']
    text2 = book_info['old_summary']
    text_diff = TextDiffTool.diff_text(text1, text2)
    changes = text_diff.get_forward_changes()
    if len(changes) > 0:
        text_diffs[str(book_key)] = changes

In [20]:
from final_process.text_diff_tool import TextDiffTool
from IPython.display import clear_output

# For every old character record, diff the whitespace-split description
# against the whitespace-split masked description and store the forward
# changes under the stringified character key. Each diff is verified by
# replaying it on the description tokens and comparing with the masked tokens.
list_diffs = {}
for char_key, char_info in old_char_lookup.items():
    list1, list2 = (
        char_info[field].split()
        for field in ('description', 'masked_description')
    )
    text_diff = TextDiffTool.diff_text(list1, list2)
    changes = text_diff.get_forward_changes()
    restored_list1 = TextDiffTool.restore_list(list1, changes)
    assert list2 == restored_list1
    list_diffs[str(char_key)] = changes

In [21]:
import json
# Write the per-character masked-description diffs (`list_diffs`) as JSON.
# NOTE(review): the file name spells "desciption" (sic). It is left as is
# because other code may read this exact path — confirm before renaming.
with open('static/masked_desciption_changes.json', 'w') as out_f:
    json.dump(list_diffs, out_f)

In [28]:
# Load the final data file and index it by
# (book_title, source, character_name); the last record per key wins.
# NOTE: this rebinds `final_data`, which earlier held the v04 records, so
# re-running earlier cells after this one would see different data.
final_data = read_jsonl('final_data/data.jsonl')
final_char_lookup = dict(
    ((row['book_title'], row['source'], row['character_name']), row)
    for row in final_data
)

In [30]:
# The paired character keys and the final-data keys must be the same set:
# check containment in both directions.
for key in char_lookup:
    assert key in final_char_lookup

for key in final_char_lookup:
    assert key in char_lookup

# Every old record must match the final record with the same key on each of
# the compared fields, checked in the order listed.
for key, old_char_info in old_char_lookup.items():
    final_char_info = final_char_lookup[key]
    assert all(
        old_char_info[field] == final_char_info[field]
        for field in (
            'book_title',
            'source',
            'character_name',
            'description',
            'summary',
            'masked_description',
        )
    )