Make sure all the entries in Airtable are also in Mongo, and updated with latest info

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import airtable
import joblib
from tqdm import tqdm

In [3]:
table_name = 'M00_Register'

In [8]:
with open('.attoken', 'r', encoding='utf-8') as infile:
    attoken = infile.read().strip()

In [9]:
# Older base
#base_id = "appsHVqTWtD7QXeQb"
base_id = "appnZn2NYRxLtEvsT"
at = airtable.Airtable(base_id, attoken)
chron_result = at.get(table_name)
# chron_result has keys: 'record' and 'offset'

In [12]:
chron_result['records']

[OrderedDict([('id', 'rec00iOFJrJ37XzQe'),
              ('createdTime', '2022-09-28T21:54:22.000Z'),
              ('fields',
               OrderedDict([('entry_id', 'E0052'),
                            ('full_text',
                             'Austrian defeats. (GJ (W) 1849: Mar c.14. (P) Mar 14, in NRZ. N/t; n/s. (Tr) MECW 9:63-64. (++) Not in MEW. -♦ Context in 49:15.'),
                            ('num_chron_refs', 0),
                            ('written', '1849: Mar c.14.'),
                            ('reg_section', 'E'),
                            ('published', 'Mar 14, in NRZ. N/t; n/s.'),
                            ('mongo_id', 'ObjectId(630af2b05e934cb195acadb2)'),
                            ('ent_num', 52),
                            ('more_info', 'Not in MEW. -♦ Context in 49:15.'),
                            ('translation', 'MECW 9:63-64.'),
                            ('title', 'Austrian defeats.'),
                            ('lang_orig', ['German']),
    

In [11]:
chron_result.keys()

odict_keys(['records', 'offset'])

In [10]:
chron_result['offset']

'itrmiFtEds3gybqk6/rec2CH0LnS1dIPZpC'

In [39]:
all_entries = []
# Basically for entries with more than one event, extend the ID so that it's like 48_33.0, 48_33.1, etc.
subentry_counts = {}
for cur_record in tqdm(at.iterate(table_name)):
    cur_fields = cur_record['fields']
    cur_entry_data = {
        'at_id': cur_record['id'],
        'at_created': cur_record['createdTime']
    }
    #print(cur_record.keys())
    #print(cur_fields.keys())
    #numeric_id = cur_fields['id']
    entry_id = cur_fields['entry_id']
    entry_id_base = "_".join(entry_id.split("_")[0:2])
    #print(entry_id_base)
    #cyclo_id = cur_fields['ME_Cyclopedia_id']
    #print(numeric_id, entry_id, cyclo_id)
    cur_entry_data['entry_id'] = entry_id
    cur_entry_data['entry_id_base'] = entry_id_base
    # And now check the subentry dict
    if entry_id_base not in subentry_counts:
        # New entry id
        subentry_counts[entry_id_base] = 0
    else:
        subentry_counts[entry_id_base] = subentry_counts[entry_id_base] + 1
    entry_id_full = f"{entry_id_base}.{subentry_counts[entry_id_base]}"
    #print(entry_id_full)
    cur_entry_data['entry_id_full'] = entry_id_full
    all_entries.append(cur_entry_data)

7445it [00:39, 189.17it/s]


In [40]:
len(all_entries)

7445

In [42]:
joblib.dump(all_entries, "at_entries.pkl")

['at_entries.pkl']

In [43]:
# Now get rid of the .0 for entries with only one subentry

In [51]:
single_entries = [k for k,v in subentry_counts.items() if v == 1]

In [52]:
len(single_entries)

671

In [53]:
for cur_entry in single_entries:
    print(cur_entry)
    break

65_55


In [50]:
all_entries[:5]

[{'at_id': 'rec00WLmO0M3GMg8P',
  'at_created': '2020-12-17T01:44:33.000Z',
  'entry_id': '48_33_739',
  'entry_id_base': '48_33',
  'entry_id_full': '48_33.0'},
 {'at_id': 'rec00bY5ogUHtBRV9',
  'at_created': '2020-12-17T01:44:33.000Z',
  'entry_id': '65_55_2976',
  'entry_id_base': '65_55',
  'entry_id_full': '65_55.0'},
 {'at_id': 'rec00egTfaoHNVRi8',
  'at_created': '2020-12-23T13:34:59.000Z',
  'entry_id': '93_42_6218',
  'entry_id_base': '93_42',
  'entry_id_full': '93_42.0'},
 {'at_id': 'rec01gGWC4uAohMl8',
  'at_created': '2020-12-17T01:44:33.000Z',
  'entry_id': '56_4_2036',
  'entry_id_base': '56_4',
  'entry_id_full': '56_4.0'},
 {'at_id': 'rec02OW32X4PtAchT',
  'at_created': '2020-12-17T01:44:33.000Z',
  'entry_id': '37_13_74',
  'entry_id_base': '37_13',
  'entry_id_full': '37_13.0'}]

In [54]:
for entry_index in range(len(all_entries)):
    cur_entry = all_entries[entry_index]
    cur_id_base = cur_entry['entry_id_base']
    if cur_id_base in single_entries:
        # Only one entry, so remove the .0
        all_entries[entry_index]['entry_id_clean'] = cur_id_base
    else:
        all_entries[entry_index]['entry_id_clean'] = all_entries[entry_index]['entry_id_full']

In [55]:
all_entries[:5]

[{'at_id': 'rec00WLmO0M3GMg8P',
  'at_created': '2020-12-17T01:44:33.000Z',
  'entry_id': '48_33_739',
  'entry_id_base': '48_33',
  'entry_id_full': '48_33.0',
  'entry_id_clean': '48_33.0'},
 {'at_id': 'rec00bY5ogUHtBRV9',
  'at_created': '2020-12-17T01:44:33.000Z',
  'entry_id': '65_55_2976',
  'entry_id_base': '65_55',
  'entry_id_full': '65_55.0',
  'entry_id_clean': '65_55'},
 {'at_id': 'rec00egTfaoHNVRi8',
  'at_created': '2020-12-23T13:34:59.000Z',
  'entry_id': '93_42_6218',
  'entry_id_base': '93_42',
  'entry_id_full': '93_42.0',
  'entry_id_clean': '93_42.0'},
 {'at_id': 'rec01gGWC4uAohMl8',
  'at_created': '2020-12-17T01:44:33.000Z',
  'entry_id': '56_4_2036',
  'entry_id_base': '56_4',
  'entry_id_full': '56_4.0',
  'entry_id_clean': '56_4'},
 {'at_id': 'rec02OW32X4PtAchT',
  'at_created': '2020-12-17T01:44:33.000Z',
  'entry_id': '37_13_74',
  'entry_id_base': '37_13',
  'entry_id_full': '37_13.0',
  'entry_id_clean': '37_13.0'}]

## Part 2: Now add at_id, at_created, entry_id_base, and entry_id_clean to mongo

In [14]:
import os

import dotenv
from pymongo import MongoClient

client = MongoClient()

In [15]:
os.path.basename(os.getcwd())

'mongo'

In [16]:
if os.path.basename(os.getcwd()) != "marxdb":
    # Change into the dir that has the .env file
    os.chdir("..")

In [17]:
dotenv.load_dotenv(".env")

True

In [18]:
mongo_pw = os.getenv("MONGO_ADMIN_PW")
conn_str = f"mongodb+srv://admin:{mongo_pw}@cluster0.cg6nz.mongodb.net/test"

In [19]:
client = MongoClient(conn_str, connect=False)

In [20]:
db = client['marxdb']
coll = db['register']

In [84]:
for cur_entry in all_entries:
    print(cur_entry)
    break

{'at_id': 'rec00WLmO0M3GMg8P', 'at_created': '2020-12-17T01:44:33.000Z', 'entry_id': '48_33_739', 'entry_id_base': '48_33', 'entry_id_full': '48_33.0', 'entry_id_clean': '48_33.0'}


In [99]:
all_results = coll.find()

In [100]:
def get_at_result(entry_id):
    results = [entry for entry in all_entries if entry['entry_id'] == entry_id]
    if len(results) < 1:
        raise Exception(f"Entry {entry_id} not found in AT results")
    if len(results) > 1:
        raise Exception(f"Multiple AT results for {entry_id}")
    return results[0]

In [101]:
result_iter = tqdm(all_results)
for cur_result in result_iter:
    mongo_id = cur_result['_id']
    mongo_entry_id = cur_result['entry_id']
    result_iter.set_description(mongo_entry_id)
    #print(mongo_entry_id)
    at_record = get_at_result(mongo_entry_id)
    #print(at_record)
    new_fields = {
        'at_id': at_record['at_id'],
        'at_created': at_record['at_created'],
        'entry_id_base': at_record['entry_id_base'],
        'entry_id_clean': at_record['entry_id_clean']
    }
    update_one_result = coll.update_one({'_id': mongo_id}, {'$set': new_fields})
    #print(update_one_result.modified_count)

95_37_6455: : 7445it [24:31,  5.06it/s]
