In [4]:
from data_processor.orig_data_center import OrigBookDataCenter

In [71]:
# Build the book data center with `build_from_database`. Another cell in this
# notebook assigns the same name `obdc` using `build_from_wayback_database`,
# so whichever of the two cells ran last decides what later cells see.
obdc = OrigBookDataCenter.build_from_database()

In [59]:
# Build the book data center with `build_from_wayback_database`. This rebinds
# the same name `obdc` that the `build_from_database` cell assigns, so only
# run the cell whose data source is wanted by the cells that follow.
obdc = OrigBookDataCenter.build_from_wayback_database()

In [66]:
import json

def load_dataset_from_json(filename):
    """Load and return the value stored in a single JSON document.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded top-level JSON value (dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # JSON text is UTF-8; do not let the platform locale pick the codec.
    with open(filename, encoding='utf-8') as in_file:
        return json.load(in_file)

def load_dataset_from_jsonl(filename):
    """Load a JSON Lines file into a list with one decoded record per line.

    Blank (whitespace-only) lines are skipped, so a trailing newline or an
    empty separator line does not abort the load.

    Args:
        filename: Path of the ``.jsonl`` file to read.

    Returns:
        A list with the decoded JSON value of every non-blank line, in file
        order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    dataset = []
    # JSON text is UTF-8; do not let the platform locale pick the codec.
    with open(filename, encoding='utf-8') as in_file:
        # Iterate the handle directly instead of materialising every line.
        for line in in_file:
            if not line.strip():
                continue
            dataset.append(json.loads(line))
    return dataset

def export_dataset_as_jsonl(data, filename):
    """Write every item of ``data`` to ``filename`` as one JSON document per line.

    Args:
        data: Iterable of JSON-serialisable records.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w') as sink:
        for record in data:
            sink.write(json.dumps(record))
            sink.write('\n')

In [67]:
# Paths of the v04 JSON Lines datasets. `filename1` is the file that the
# listing-export cell loads with `load_dataset_from_jsonl`; `filename2` is not
# referenced by any of the visible cells.
filename1 = 'data/v04/masked_char_data_filtered_by_name_and_description.jsonl'
filename2 = 'data/v04/truncated_char_data_filtered_by_name_and_description.jsonl'

In [81]:
def _read_book_index(path):
    """Parse a ``title|source|url`` listing into a ``{url: (title, source)}`` dict."""
    index = {}
    with open(path) as in_f:
        for line in in_f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. an extra newline at end of file).
                continue
            title, source, url = line.split('|')
            index[url] = (title, source)
    return index


# Both inputs are produced by other cells of this notebook (the cells that
# write 'list_literatures_info.txt' and 'list_literatures_info_clean.txt'),
# so those cells must have been run before this one.
old_books = _read_book_index('list_literatures_info.txt')
new_books = _read_book_index('list_literatures_info_clean.txt')

# Map every (title, source) pair of the first listing to the pair that the
# `_clean` listing records for the same URL.
book_mapping = {}
for key, val in old_books.items():
    try:
        book_mapping[val] = new_books[key]
    except KeyError:
        # URL absent from the `_clean` listing: report it and keep going.
        print(key, val)

In [83]:
def _write_pipe_delimited(path, rows):
    """Write each tuple in ``rows`` to ``path`` as one ``|``-joined line."""
    # `with` closes (and flushes) the handle even if a join raises midway.
    with open(path, 'w') as out_f:
        for info in rows:
            out_f.write('|'.join(info) + '\n')


# Collect, for every dataset record, the remapped (title, source) pair together
# with the book's summary URL and the character's description URL.
data = load_dataset_from_jsonl(filename1)
literatures = set()
characters = set()
character_lists = set()
for d in data:
    book_title = d['book_title']
    source = d['source']
    char_name = d['character_name']

    lit_key = (book_title, source)
    char_key = (book_title, source, char_name)
    # Raises KeyError if the pair was not mapped by the book-mapping cell.
    new_book_title, new_source = book_mapping[lit_key]

    summ_url = obdc.literatures[lit_key].summary_url
    desc_url = obdc.characters[char_key].description_url
    # NOTE(review): sorting and '|'.join below raise TypeError if either URL is
    # None -- confirm summary_url / description_url are always populated.
    literatures.add((new_book_title, new_source, summ_url))
    characters.add((new_book_title, new_source, desc_url))
    # Disabled: collection of character-list URLs.
    # char_list_url = obdc.characters[char_key].character_list_url
    # if char_list_url is not None:
    #     character_lists.add(char_list_url)

# Sets de-duplicate; sorting makes the exported files deterministic.
literatures = sorted(literatures)
# character_lists = sorted(character_lists)
characters = sorted(characters)

_write_pipe_delimited('list_literatures.txt', literatures)
_write_pipe_delimited('list_characters.txt', characters)

# out_f = open('list_character_lists.txt', 'w')
# for url in character_lists:
#     out_f.write(url+'\n')
# out_f.close()

In [8]:
import os
import json
import requests
from tqdm.notebook import tqdm
import concurrent.futures
import time
from datetime import datetime
import random

In [None]:
def process_url(url, max_attempts=5, delay_range=(0.5, 1.5), timeout=30):
    """Look up ``url`` in the Wayback Machine availability API.

    Args:
        url: Address of the page to look up.
        max_attempts: Number of lookups to try before giving up.
        delay_range: ``(low, high)`` bounds, in seconds, of the random pause
            taken before every attempt.
        timeout: Seconds to wait for the HTTP response before treating the
            attempt as failed.

    Returns:
        ``(True, snapshot_url)`` when the snapshot reported as ``closest`` (for
        the requested timestamp ``202012``) is dated ``202009`` or later.
        ``(False, url)`` otherwise -- including when every attempt failed, so
        the caller can queue the URL for another round. The function does not
        raise for request or parsing errors.
    """
    # NOTE: a proxy credential used to be embedded in a comment here. Never
    # commit credentials; if a proxy is required, configure it through the
    # HTTP_PROXY / HTTPS_PROXY environment variables, which requests honours.
    for attempt in range(1, max_attempts + 1):
        # Random pause so worker threads do not hit the API in lockstep.
        # Unlike a counting loop, sleeping burns no CPU and releases the GIL.
        # (The `random` module is already seeded from OS entropy; seeding with
        # a datetime object raises TypeError on Python 3.11+.)
        time.sleep(random.uniform(*delay_range))
        try:
            response = requests.get(
                'http://web.archive.org/wayback/available',
                # Let requests percent-encode the target so a '&' or '?' inside
                # it is not mistaken for part of this query string.
                params={'url': url, 'timestamp': '202012'},
                timeout=timeout,
            ).json()
            snapshots = response['archived_snapshots']
            if 'closest' in snapshots:
                closest = snapshots['closest']
                # The first six characters are compared as a YYYYMM string.
                yyyy_mm = closest['timestamp'][:6]
                if yyyy_mm >= '202009':
                    return (True, closest['url'])
            # (A disabled call that submitted the URL to an external
            # `archiver` command used to live here.)
            return (False, url)
        except Exception as e:
            # Best effort: report and retry. The worker must not raise, or one
            # bad URL would abort the whole batch in the calling cell.
            print(f'[attempt {attempt}/{max_attempts}] {url}: {e}')
    # Out of attempts: report the URL as still missing instead of looping forever.
    return (False, url)

# NOTE: this cell relies on `missed_again_urls` and `found_urls` already
# existing in the kernel; neither is created in the visible cells. Each run
# retries the previous round's misses and refills `missed_again_urls`, so the
# cell is meant to be re-run until the miss list stops shrinking.
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    missed_urls = missed_again_urls
    missed_again_urls = []

    futures = [executor.submit(process_url, url=url) for url in missed_urls]

    # Context manager closes the progress bar even if a result raises.
    with tqdm(total=len(missed_urls)) as pbar:
        for future in concurrent.futures.as_completed(futures):
            found, url = future.result()
            if found:
                found_urls.append(url)
            else:
                missed_again_urls.append(url)
            pbar.update(1)

In [None]:
# Disabled sequential variant of the availability check, kept for reference:
# for i in tqdm(range(len(literatures))):
#     url = literatures[i]
#     response = requests.get(f'https://archive.org/wayback/available?url={url}')
#     response = json.loads(response.text)
#     snapshots = response['archived_snapshots']
#     if 'closest' in snapshots:
#         timestamp = snapshots['closest']['timestamp']
#         yyyy_mm = timestamp[:6]
#         if yyyy_mm >= '202009': continue
#     return_code = os.system(f'/home/huangme-pop/anaconda3/envs/lcdata/bin/archiver {url}')
#     print(return_code)

# Drop duplicate snapshot URLs. Sorting (rather than a bare list(set(...)))
# gives a stable order, so the file written from this list is reproducible.
found_urls = sorted(set(found_urls))

In [None]:
# for url in missed_again_urls:
#     print(url)

# Summary of the last lookup round: URLs still missing vs. snapshots found.
print(len(missed_again_urls))
print(len(found_urls))

# Persist the found snapshot URLs, one per line. `with` closes the file even
# if a write fails.
with open('list_character_lists_cached.txt', 'w') as out_f:
    for url in found_urls:
        out_f.write(url + '\n')

In [80]:
import re
def _strip_trailing_slash(url):
    """Return ``url`` without its final ``/`` (at most one is removed)."""
    return url[:-1] if url.endswith('/') else url


# Index every cached line by the original URL embedded after the first
# '/http://' or '/https://' it contains; the value is the full stripped line.
urls = {}
with open('list_literatures_cached.txt') as in_f:
    for line in in_f:
        line = line.strip()
        if not line:
            # Lines read from a file keep their newline, so test the stripped
            # text; a `len(line) == 0` check never fires for blank lines.
            continue
        match = re.search(r'/(https?://.*$)', line)
        if match is None:
            # No embedded URL: report the line instead of crashing on None.
            print(line)
            continue
        urls[_strip_trailing_slash(match.group(1))] = line

# Replace each listed URL with its cached counterpart. The input is opened
# first so a missing 'list_literatures.txt' does not truncate the output file.
with open('list_literatures.txt') as in_f:
    with open('list_literatures_info.txt', 'w') as out_f:
        for line in in_f:
            try:
                title, source, url = line.strip().split('|')
                cached = urls[_strip_trailing_slash(url)]
                out_f.write(f'{title}|{source}|{cached}\n')
            except (ValueError, KeyError):
                # Malformed record (wrong field count) or URL with no cached
                # entry: report the raw line and continue.
                print(line)


In [63]:
# Rewrite the info listing with its '|'-separated records sorted. Records are
# compared field by field, in the order the fields appear on the line.
with open('list_literatures_info.txt') as in_f:
    # Skip blank lines so no empty record is sorted to the top of the output.
    info_rows = [line.strip().split('|') for line in in_f if line.strip()]
info_rows.sort()

# Open the output only after the input was read successfully; `with` closes
# the handle even if a write fails.
with open('list_literatures_info_clean.txt', 'w') as out_f:
    for info in info_rows:
        out_f.write('|'.join(info) + '\n')