# Intervis data processing

Download the named pages of the related Google spreadsheet and convert each of them into a JSON file:

In [1]:
import pandas as pd
import json

# Output file stem -> ``gid`` query value of the spreadsheet page it is built from.
file_gids = dict(
    links='186216843',
    references='1115773066',
    texts='0',
    glossary='1127543685',
    disclosure='575388282',
)

# CSV export URL of the published spreadsheet; the page is picked by appending ``&gid=...``.
spreadsheet_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTy2ONiejYXptt3uRLSeRqV1CJbbpi_68cz4Yeg9ZAdCC6tBhwK4DPgnLp6AwRK3EbYiMA2rLIVo0Z7/pub?output=csv'

for filename, gid in file_gids.items():
    # Read one page as CSV, discard rows that are completely empty and
    # replace the remaining missing cells with empty strings.
    page_url = '%s&gid=%s' % (spreadsheet_url, gid)
    page = pd.read_csv(page_url, sep=',')
    page = page.dropna(how='all')
    page = page.fillna('')

    # One dict per spreadsheet row, keyed by column header.
    rows = page.to_dict(orient='records')

    filepath = './%s.json' % filename
    with open(filepath, 'w') as outfile:
        json.dump(rows, outfile, sort_keys=False, indent=4)
    print('Wrote file %s.' % filepath)

print('Done.')


Wrote file ./links.json.
Wrote file ./references.json.
Wrote file ./texts.json.
Wrote file ./glossary.json.
Wrote file ./disclosure.json.
Done.


Convert the downloaded JSON files into language (locale) files:

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import json

# Spreadsheet column / glossary language name -> locale file written for it.
files = {
    'German': '../locales/de.js',
    'English': '../locales/en.js',
}


def add_tooltip_titles(html, glossary_entries):
    """Return ``html`` with a ``title`` attribute added to known tooltips.

    Every element with class ``tooltip`` whose ``ref`` attribute is a key of
    ``glossary_entries`` gets its ``title`` set to that entry's
    ``'description'``. Tooltips without a ``ref``, or with an unknown one,
    are left untouched.

    Parameters
    ----------
    html : str
        HTML fragment to process.
    glossary_entries : dict
        Mapping of glossary ID to a dict containing a ``'description'`` key.

    Returns
    -------
    str
        The fragment as re-serialised by BeautifulSoup.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tooltip in soup.find_all(class_="tooltip"):
        if tooltip.has_attr('ref') and tooltip['ref'] in glossary_entries:
            tooltip['title'] = glossary_entries[tooltip['ref']]['description']
    return str(soup)


# Merge google files
texts_data = pd.read_json('texts.json')
disclosure_data = pd.read_json('disclosure.json')

data = pd.concat([texts_data, disclosure_data], ignore_index=True)
data = data.set_index('ID')

# Process glossary data: one {ID: {column: value}} mapping per language
glossary = {}
glossary_data = pd.read_json('glossary.json')
for language in files:
    language_data = (
        glossary_data
        .loc[glossary_data.language == language]
        .set_index('ID')
        .drop('language', axis=1)
    )
    glossary[language] = language_data.to_dict('index')

# Process text data
for column in files:
    # Literal (non-regex) replacement of line breaks by <br>.
    with_breaks = data[column].str.replace('\n', '<br>', regex=False)

    # Add title tags. The whole column is mapped and assigned at once instead
    # of iterating with Series.iteritems() (removed in pandas 2.0) and writing
    # back through chained indexing (data[column][index] = ...), which is not
    # reliable under Copy-on-Write and overwrites every row sharing a
    # duplicated ID.
    column_glossary = glossary[column]
    data[column] = with_breaks.map(
        lambda html: add_tooltip_titles(html, column_glossary)
    )

# output language files
for (column, filepath) in files.items():
    entries = data[column].to_dict()
    output = 'export default ' + json.dumps(entries, indent=2)

    # The with-statement closes the file; no explicit close() is needed.
    with open(filepath, 'w') as file:
        file.write(output)

    print('Wrote file %s' % filepath)

# Done
print('Done.')


Wrote file ../locales/de.js
Wrote file ../locales/en.js
Done.




Add a structured version of the links file (the plain list becomes a dict with one list of links per type):

In [3]:
import pandas as pd
import json

type_key = 'Type'
output_file = './links_structured.json'

df = pd.read_json('./links.json')

# Group the flat list of link records by the value of their type column.
# tolist() converts the unique values to plain Python objects, so they stay
# valid JSON object keys even if the column holds numbers (numpy integer
# scalars are rejected by json.dump as keys).
data = {
    link_type: df.loc[df[type_key] == link_type].to_dict('records')
    for link_type in df[type_key].unique().tolist()
}

# The with-statement closes the file; no explicit close() is needed.
with open(output_file, 'w') as file:
    json.dump(data, file, indent=2)
    print('Wrote file %s' % output_file)

print('Done.')

Wrote file ./links_structured.json
Done.
