In [1]:
filename = 'data/v04/masked_char_data_filtered_by_name_and_description.jsonl'

In [2]:
import json
data = []
with open(filename) as in_f:
    for line in in_f.readlines():
        data.append(json.loads(line))

In [3]:
lit_keys = set()
char_keys = set()
for d in data:
    title = d['book_title']
    source = d['source']
    char = d['character_name']
    lit_keys.add((title, source))
    char_keys.add((title, source, char))
lit_keys = sorted(list(lit_keys))
char_keys = sorted(list(char_keys))

In [10]:
from data_processor.orig_data_center import OrigBookDataCenter

In [11]:
obdc = OrigBookDataCenter.build_from_database()

In [7]:
lit_infos = []
for key in lit_keys:
    d = obdc.literatures[key]
    title = key[0]
    source = key[1]
    url = d.book_url
    lit_infos.append((title, source, url))

char_infos = []
for key in char_keys:
    d = obdc.characters[key]
    title = key[0]
    source = key[1]
    char = key[2]
    url = d.description_url
    char_infos.append((title, source, char, url))

In [156]:
import csv
with open('temp_files/lit_infos.csv', 'w') as out_f:
    writer = csv.writer(out_f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for info in lit_infos:
        writer.writerow(info)

with open('temp_files/char_infos.csv', 'w') as out_f:
    writer = csv.writer(out_f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['book_title', 'source', 'character_name', 'url'])
    for info in char_infos:
        writer.writerow(info)

In [131]:
lit_infos_clean = []
with open('temp_files/list_literatures_info_clean.txt') as in_f:
    for line in in_f.readlines():
        title, source, url = line.strip().split('|')
        lit_infos_clean.append((title, source, url))

In [148]:
lit_lookup = {(title, source): url for title, source, url in lit_infos}
translation = {
    ('A Room of One’s Own', 'sparknotes'): ("A Room of One's Own", 'sparknotes'),
    ('A Storm of Swords', 'sparknotes'): ('Game of Thrones: A Storm of Swords', 'sparknotes'),
    ('Alice’s Adventures in Wonderland', 'sparknotes'): ('Alice in Wonderland', 'sparknotes'), 
    ('Angela’s Ashes', 'sparknotes'): ("Angela's Ashes", 'sparknotes'),
    ('Cat’s Eye', 'sparknotes'): ("Cat's Eye", 'sparknotes'),
    ('For Whom The Bell Tolls', 'sparknotes'): ('For Whom the Bell Tolls', 'sparknotes'),
    ('Harry Potter and the Sorcerer’s Stone', 'sparknotes'): ("Harry Potter and the Sorcerer's Stone", 'sparknotes'), 
    ('Midnight’s Children', 'sparknotes'): ("Midnight's Children", 'sparknotes'),
    ('Poisonwood Bible', 'sparknotes'): ('The Poisonwood Bible', 'sparknotes'),
    ('Sophie’s Choice', 'sparknotes'): ("Sophie's Choice", 'sparknotes'),
    ('The Bonesetter’s Daughter', 'sparknotes'): ("The Bonesetter's Daughter", 'sparknotes'),
    ('The Epic of Gilgamesh', 'sparknotes'): ('Gilgamesh', 'sparknotes'),
    ('The Oedipus Plays', 'sparknotes'): ('Antigone (The Oedipus Plays)', 'sparknotes'),
    ('The Pilgrim’s Progress', 'sparknotes'): ('Pilgrim’s Progress', 'sparknotes'),
    ('The Return of the King', 'sparknotes'): ('Lord of the Rings: The Return of the King', 'sparknotes'),
    ('The Two Towers', 'sparknotes'): ('Lord of the Rings: The Two Towers', 'sparknotes'),
    ('Through the Looking-Glass', 'sparknotes'): ('Through the Looking Glass', 'sparknotes'),
}

In [149]:
lit_infos_combined = []
for title, source, url in lit_infos_clean:
    try:
        orig_url = lit_lookup[(title, source)]
        lit_infos_combined.append({
            'old_book_title': title,
            'old_source': source,
            'new_book_title': title,
            'new_source': source,
            'orig_url': orig_url,
            'cached_url': url,
        })
        del lit_lookup[(title, source)]
    except Exception:
        old_title, old_source = translation[(title, source)]
        orig_url = lit_lookup[(old_title, old_source)]
        lit_infos_combined.append({
            'old_book_title': old_title,
            'old_source': old_source,
            'new_book_title': title,
            'new_source': source,
            'orig_url': orig_url,
            'cached_url': url,
        })
        del lit_lookup[(old_title, old_source)]
assert(len(lit_lookup) == 0)

In [150]:
with open('temp_files/full_lit_infos.jsonl', 'w') as out_f:
    for d in lit_infos_combined:
        line = json.dumps(d)
        out_f.write(line+'\n')

In [151]:
with open('temp_files/list_characters_cached.txt') as in_f:
    urls = in_f.readlines()

In [1]:
import json, csv
def read_jsonl(filename):
    data = []
    with open(filename) as in_f:
        for line in in_f.readlines():
            data.append(json.loads(line))
    return data

def write_jsonl(filename, data):
    with open(filename, 'w') as out_f:
        for d in data:
            out_f.write(json.dumps(d)+'\n')

def read_csv(filename):
    data = []
    with open(filename) as in_f:
        reader = csv.reader(in_f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        header = None
        for i, row in enumerate(reader):
            if i == 0:
                header = row
            else:
                data.append({
                    key: val for key, val in zip(header, row)
                })
    return data

In [238]:
lit_data = read_jsonl('temp_files/full_lit_infos.jsonl')
char_data = read_csv('temp_files/char_infos.csv')
char_cached_urls = []
with open('temp_files/list_characters_cached_clean.txt') as in_f:
    for line in in_f.readlines():
        char_cached_urls.append(line.strip())

In [239]:
lit_translation = {}
for d in lit_data:
    lit_translation[(d['old_book_title'], d['old_source'])] = (d['new_book_title'], d['new_source'])

char_orig_urls = set()
for d in char_data:
    char_orig_urls.add(d['url'])

In [240]:
import re

ex_urls = {
    'https://www.shmoop.com/study-guides/literature/animal-dreams/stitch-and-b%2A%2A%2A%2A-club': 'https://www.shmoop.com/study-guides/literature/animal-dreams/stitch-and-b****-club',
    'https://www.shmoop.com/study-guides/literature/freedom-franzen/jonathan': 'https://www.shmoop.com/study-guides/literature/freedom-franzen/Jonathan',
    'https://www.shmoop.com/study-guides/literature/speak-memory/vladimir-dmitrievich-nabokov': 'https://www.shmoop.com/study-guides/literature/speak-memory/Vladimir-dmitrievich-nabokov',
    'https://www.shmoop.com/study-guides/literature/wings-book/jamison': 'https://www.shmoop.com/study-guides/literature/wings-book/Jamison',
}

char_url_translation = {}
seen = set()
for url in char_cached_urls:
    sub_url = re.search(r'^http://web.archive.org/web/\d{14}/(.*)$', url).group(1)
    temp_url = sub_url
    if sub_url in ex_urls:
        temp_url = ex_urls[sub_url]
    temp_url = temp_url.replace('http:', 'https:')
    assert(temp_url in char_orig_urls)
    seen.add(temp_url)
    char_url_translation[temp_url] = url

In [241]:
full_char_data = []
for d in char_data:
    full_char_data.append({
        'old_book_title': d['book_title'],
        'old_source': d['source'],
        'old_character_name': d['character_name'],
        'orig_url': d['url'],
        'cached_url': char_url_translation[d['url']],
    })

with open('temp_files/partial_char_infos.jsonl', 'w') as out_f:
    for d in full_char_data:
        out_f.write(json.dumps(d)+'\n')

In [242]:
data = read_jsonl('temp_files/partial_char_infos.jsonl')
# with open('temp_files/list_characters_cached_clean.txt', 'w') as out_f:
#     for d in data:
#         out_f.write(d['cached_url']+'\n')

In [243]:
obdc_wayback = OrigBookDataCenter.build_from_wayback_database()

In [244]:
def get_char_keys(local_obdc, url):
    keys = []
    for key, info in local_obdc.characters.items():
        if info.description_url == url:
            keys.append(key)
    return keys

In [245]:
char_infos = read_jsonl('temp_files/partial_char_infos.jsonl')
lit_infos = read_jsonl('temp_files/full_lit_infos.jsonl')

In [246]:
lit_key_lookup = {}
for info in lit_infos:
    title_lookup[(info['old_book_title'], info['old_source'])] = (info['new_book_title'], info['new_source'])

In [331]:
full_char_infos = []
for info in char_infos:
    title = info['old_book_title']
    source = info['old_source']
    char_name = info['old_character_name']
    url = info['cached_url']
    keys = get_char_keys(obdc_wayback, url)

    new_title, new_source = title_lookup[(title, source)]
    temp_name = char_name
    if new_source == 'litcharts' and char_name == 'Viscount de Valvert':
        temp_name = 'Cyrano de Bergerac'
    elif new_source == 'litcharts' and char_name in [
        'Ezra D. Wannafeller',
        'Mrs. Higgins',
    ]:
        temp_name = 'Eliza Doolittle'
    elif new_source == 'litcharts' and char_name in [
        'Adèle Ratignolle',
        'Léonce Pontellier',
        'Mademoiselle Reisz',
    ]:
        temp_name = 'Edna Pontellier'
    elif new_source == 'litcharts' and char_name in [
        'Girl-Cousin',
    ]:
        temp_name = 'Nicholas'
    elif new_source == 'litcharts' and char_name in [
        'The Scholar',
    ]:
        temp_name = 'The Schoolmaster'
    elif new_source == 'litcharts' and char_name in [
        '\nBridget Allworthy % \n',
        '\nHarriet Fitzpatrick % \n',
        '\nJenny Jones % \n',
        '\nMr. Allworthy % \n',
        '\nMr. Fitzpatrick % \n',
        '\nMrs. Western % \n',
        '\nSophia Western % \n',
        '\nSquire Western % \n',
        '\nTom Jones % \n',
    ]:
        temp_name = char_name.replace('%', '')
    clean_name = ' '.join(temp_name.strip().split())
    candidate_keys = [
        (title, new_source, temp_name),
        # (title, new_source, clean_name),
        (new_title, new_source, temp_name),
        # (new_title, new_source, clean_name),
    ]
    for key in candidate_keys:
        if key in keys:
            full_char_infos.append({
                'old_book_title': info['old_book_title'],
                'old_source': info['old_source'],
                'old_character_name': info['old_character_name'],
                'new_book_title': key[0],
                'new_source': key[1],
                'new_character_name': key[2],
                'orig_url': info['orig_url'],
                'cached_url': info['cached_url'],
            })
            break
    else:
        print((new_title, source, char_name))
        print()
        print(keys)
        print(url)
        break
print('finished')

finished


In [332]:
write_jsonl('temp_files/full_char_infos.jsonl', full_char_infos)

In [405]:
obdc_wayback = OrigBookDataCenter.build_from_wayback_database()

In [406]:
lit_infos = read_jsonl('temp_files/full_lit_infos.jsonl')
char_infos = read_jsonl('temp_files/full_char_infos.jsonl')

In [407]:
data = []
key_lookup = {}
for info in char_infos:
    title = info['new_book_title']
    source= info['new_source']
    char_name = info['new_character_name']
    lit_key = (title, source)
    char_key = (title, source, char_name)
    key_lookup[char_key] = (info['old_book_title'], info['old_source'], info['old_character_name'])

    lit_data = obdc_wayback.literatures[lit_key]
    char_data = obdc_wayback.characters[char_key]
    data.append({
        'book_title': title,
        'source': source,
        'character_name': char_name,
        'summary': lit_data.summary_text,
        'description': char_data.description_text,
    })

In [408]:
write_jsonl('temp_files/raw_char_data_wayback.jsonl', data)

In [361]:
old_char_data = read_jsonl('data/v04/masked_char_data_filtered_by_name_and_description.jsonl')

old_data = {}
for d in old_char_data:
    key = (d['book_title'], d['source'], d['character_name'])
    old_data[key] = {
        'book_title': d['book_title'],
        'source': d['source'],
        'character_name': d['character_name'],
        'summary': d['summary'],
        'description': d['description'],
        'masked_description': d['masked_description'],
    }

old_char_data = read_jsonl('data/v04/truncated_char_data_filtered_by_name_and_description.jsonl')
for d in old_char_data:
    key = (d['book_title'], d['source'], d['character_name'])
    old_data[key]['coref_truncated_summary'] = d['coref_truncated_summary']

In [362]:
import difflib

def print_diff(s1, s2):
    prev_mode = None
    buf = ''
    si = -1
    for i, s in enumerate(difflib.ndiff(s1, s2)):
        if s[0] == ' ':
            if len(buf) > 0:
                print(f'{"Add" if prev_mode == "+" else "Delete"} "{buf}" (position {si} to {i-1})')
                buf = ''
                si = i
                prev_mode = None
        elif s[0] == '-':
            if prev_mode == '-':
                buf += s[-1]
                continue
            elif prev_mode == '+':
                print(f'Add "{buf}" (position {si} to {i-1})')
            buf = s[-1]
            si = i
            prev_mode = '-'
        elif s[0] == '+':
            if prev_mode == '+':
                buf += s[-1]
                continue
            elif prev_mode == '-':
                print(f'Delete "{buf}" (position {si} to {i-1})')
            buf = s[-1]
            si = i
            prev_mode = '+'
    if len(buf) > 0:
        print(f'{"Add" if prev_mode == "+" else "Delete"} "{buf}" (position {si} to {i-1})')

In [422]:
count = 0
lit_visited = set()
char_visited = set()
for d in data:
    key = (d['book_title'], d['source'], d['character_name'])
    old_key = key_lookup[key]
    old_d = old_data[old_key]

    lit_key = (d['book_title'], d['source'])
    summ = d['summary']
    old_summ = old_d['summary']

    # compare summary
    if not lit_key in lit_visited:
        lit_visited.add(lit_key)
        # if lit_key == ('Melville Stories', 'sparknotes'):
        #     print(summ)
        #     print('----')
        #     print(old_summ)

        if len(d['summary']) != len(old_d['summary']):
            lit_visited.add((d['book_title'], d['source']))
            print((d['book_title'], d['source']))
            print_diff(old_d['summary'], d['summary'])
            print()
    
    # compare description
    desc = d['description']
    old_desc = old_d['description']
    
    
    i = desc.find(' in-depth analysis of')
    if i > -1: desc = desc[:i]

    replacements = [
        (' Read an', ''),
        ('( )', '()'),
        ('$ ,', '$,'),
        ('– )', '–)'),
        ('slum, in', 'slum,in'),
        ('Enkidu looks', 'Enkidulooks'),
        ('Gilgamesh and', 'Gilgameshand'),
        ('Gilgamesh is', 'Gilgameshis'),
        ('ladinos and', 'ladinosand'),
        ('case. Julia', 'case.Julia'),
        ('Obasan (aunt)', 'Obasan ()'),
        ('(I.iii. )', '(I.iii.)'),
        ('Gilgameshis—hold', 'Gilgamesh is—hold'),
        ('Gilgameshand', 'Gilgamesh and'),
    ]
    for a, b in replacements:
        desc = desc.replace(a, b)

    if not key in char_visited and d['character_name'] != 'Unnamed narrator':
        if len(desc) != len(old_desc):
            char_visited.add(key)
            print(key)
            print_diff(old_desc, desc)
            print()
            print(repr(desc))
            print('----')
            print(repr(old_desc))
            print()
            break

('The Epic of Gilgamesh', 'sparknotes', 'Enkidu')
Delete "and" (position 217 to 219)
Delete "is " (position 221 to 223)
Add "nd" (position 225 to 226)
Delete "lmost" (position 227 to 231)
Delete "h" (position 233 to 233)
Add " almost his " (position 236 to 247)

'Companion and friend of Gilgamesh. Hairy-bodied and brawny, Enkidu was raised by animals. Even after he joins the civilized world, he retains many of his undomesticated characteristics. Enkidulooks much like Gilgamesh and is almost his physical equal. He aspires to be Gilgamesh’s rival but instead becomes his soul mate. The gods punish Gilgamesh and Enkidu by giving Enkidu a slow, painful, inglorious death for killing the demon Humbaba and the Bull of Heaven.'
----
'Companion and friend of Gilgamesh. Hairy-bodied and brawny, Enkidu was raised by animals. Even after he joins the civilized world, he retains many of his undomesticated characteristics. Enkidulooks much like Gilgameshand is almost hisphysical equal. He aspires to b

In [367]:
infos = read_jsonl('temp_files/full_char_infos.jsonl')
visited = set()
for info in infos:
    url = info['cached_url']
    if 'sparknotes' in url and url not in visited:
        visited.add(url)
        print(url)

http://web.archive.org/web/20201209063839/https://www.sparknotes.com/lit/1984/characters/
http://web.archive.org/web/20201024065851/https://www.sparknotes.com/lit/2001/characters/
http://web.archive.org/web/20201220160729/https://www.sparknotes.com/lit/bend-in-the-river/characters/
http://web.archive.org/web/20201220160844/https://www.sparknotes.com/lit/borderpassage/characters/
http://web.archive.org/web/20201024100222/https://www.sparknotes.com/lit/a-clash-of-kings/characters/
http://web.archive.org/web/20201220163601/https://www.sparknotes.com/short-stories/a-clean-well-lighted-place/characters/
http://web.archive.org/web/20201220161323/https://www.sparknotes.com/lit/clockworkorange/characters/
http://web.archive.org/web/20201020145709/https://www.sparknotes.com/lit/adaynopigs/characters/
http://web.archive.org/web/20200921094058/https://www.sparknotes.com/lit/deathinthefamily/characters/
http://web.archive.org/web/20201019221423/https://www.sparknotes.com/short-stories/a-death-in-t

In [100]:
data = []
data.extend(read_csv('temp_files/final/gutenberg_output.csv'))
data.extend(read_csv('temp_files/final/smashwords_output.csv'))
data.extend(read_csv('temp_files/final/relationships.csv'))

with open('temp_files/final/bookcorpus_title.txt') as in_f:
    for line in in_f:
        title = line.strip()
        if title is not None: data.append({
            'title': title,
        })
lit_infos = read_jsonl('temp_files/final/full_lit_infos.jsonl')
char_infos = read_jsonl('temp_files/final/full_char_infos.jsonl')


In [96]:
obdc_wayback = OrigBookDataCenter.build_from_wayback_database()

In [107]:
gbooks = {}
for d in data:
    title = d['title'].lower()
    title = ''.join(title.split())
    author = d['author'] if 'author' in d else None
    authors = gbooks.get(title, [])
    if author is not None:
        author = ' '.join(author.split(', ')[::-1])
        authors.append(author.lower())
    gbooks[title] = authors

lbook_titles = []
for info in char_infos:
    title = info['new_book_title'].lower()
    title = ''.join(title.split())
    lbook_titles.append(title)

lbooks = {}
for (title, _), info in obdc_wayback.literatures.items():
    title = title.lower()
    title = ''.join(title.split())
    if title not in lbook_titles: continue
    author = info.author
    authors = lbooks.get(title, [])
    if author is not None:
        authors.append(author.lower())
    lbooks[title] = authors

In [108]:
from IPython.display import clear_output
def main(rough_match=False, manual_check=False):
    count = 0
    not_found = []
    for title in lbook_titles:
        author = lbooks[title][0]
        gbook_title = gbooks.get(title, None)
        found = False
        if gbook_title is not None:
            count += 1
            found = True
        elif rough_match:
            print(f'original: {title}, {author}')
            print()
            candidates = []
            second_candidates = []
            others = []
            for key, authors in gbooks.items():
                if title in key:
                    if author in authors:
                        candidates.append((key, authors))
                        continue
                    
                    found = False
                    for token in author.split():
                        for name in authors:
                            if token in name:
                                found = True
                                second_candidates.append((key, authors))
                                break
                        if found: break
                    if found: continue

                    others.append((key, authors))
                                
            if len(candidates) > 0 or len(second_candidates) > 0:
                count += 1
                found = True
            else:
                if len(others) > 0:
                    print('Others')
                    for okey, oauthors in others:
                        print(okey, oauthors)
                    print()

                    if manual_check:
                        a = input()
                        if a == 'q': break
                        if a == 'a':
                            count += 1
                            found = True
            clear_output(wait=True)
        if not found: not_found.append((title, author))
    print(count)
    return not_found

In [109]:
not_found = main(rough_match=True)

3287


In [106]:
3283 / len(char_infos)

0.3456153279292557

In [93]:
import csv
with open('temp_files/not_found_titles.csv', 'w') as out_f:
    writer = csv.writer(out_f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    writer.writerow(['title', 'author'])
    for title, author in not_found:
        writer.writerow([title, author])

In [2]:
# helper functions
import json, csv
def read_jsonl(filename):
    data = []
    with open(filename) as in_f:
        for line in in_f.readlines():
            data.append(json.loads(line))
    return data

def write_jsonl(filename, data):
    with open(filename, 'w') as out_f:
        for d in data:
            out_f.write(json.dumps(d)+'\n')

def read_csv_as_object(filename):
    data = []
    with open(filename) as in_f:
        reader = csv.reader(in_f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        header = None
        for i, row in enumerate(reader):
            if i == 0:
                header = row
            else:
                data.append({
                    key: val for key, val in zip(header, row)
                })
    return data

In [3]:
# old sources
old_final_data = read_jsonl('data/v04/masked_char_data_filtered_by_name_and_description.jsonl')

In [5]:
lit_info_data = read_jsonl('temp_files/full_lit_infos.jsonl')
lit_old_to_new = {}
lit_new_to_old = {}
for d in lit_info_data:
    old_lit_key = (d['old_book_title'], d['old_source'])
    new_lit_key = (d['new_book_title'], d['new_source'])
    
    if old_lit_key in lit_old_to_new: raise KeyError
    lit_old_to_new[old_lit_key] = new_lit_key
    
    if new_lit_key in lit_new_to_old: raise KeyError
    lit_new_to_old[new_lit_key] = old_lit_key

char_info_data = read_jsonl('temp_files/full_char_infos.jsonl')
char_old_to_new = {}
char_new_to_old = {}
for d in char_info_data:
    old_char_key = (d['old_book_title'], d['old_source'], d['old_character_name'])
    new_char_key = (d['new_book_title'], d['new_source'], d['new_character_name'])
    
    if old_char_key in char_old_to_new: raise KeyError
    char_old_to_new[old_char_key] = new_char_key

    if new_char_key in char_new_to_old: raise KeyError
    char_new_to_old[new_char_key] = old_char_key

('Pygmalion', 'litcharts', 'Eliza Doolittle')


KeyError: 