# Intervis data processing

## Process content from spreadsheet

Download each named sheet of the related Google spreadsheet as CSV and convert it to a JSON file:

In [1]:
import pandas as pd
import json

# Output file stem -> gid of the published sheet it is exported from.
file_gids = {
    'links': '186216843',
    'references': '1115773066',
    'texts': '0',
    'glossary': '1127543685',
    'disclosure': '575388282',
}

# CSV export endpoint of the published spreadsheet; "&gid=<id>" selects a sheet.
spreadsheet_url = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vTy2ONiejYXptt3uRLSeRqV1CJbbpi_68cz4Yeg9ZAdCC6tBhwK4DPgnLp6AwRK3EbYiMA2rLIVo0Z7/pub?output=csv'

for filename, gid in file_gids.items():
    sheet_url = '%s&gid=%s' % (spreadsheet_url, gid)
    sheet = pd.read_csv(sheet_url, delimiter=',')

    # Discard rows that are empty in every column, then blank the remaining gaps.
    sheet = sheet.dropna(how='all')
    sheet = sheet.fillna('')
    records = sheet.to_dict('records')

    # One JSON file per sheet, written next to the notebook.
    filepath = './%s.json' % filename
    with open(filepath, 'w') as outfile:
        json.dump(records, outfile, sort_keys=False, indent=4)
        print('Wrote file %s.' % filepath)

print('Done.')


Wrote file ./links.json.
Wrote file ./references.json.
Wrote file ./texts.json.
Wrote file ./glossary.json.
Wrote file ./disclosure.json.
Done.


Convert the downloaded Google spreadsheet data into language files (`../locales/de.js`, `../locales/en.js`):

In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import json

# Language name -> locale module to write. The keys double as the text column
# names in the merged table and as the values of the glossary `language` column.
files = {
    'German': '../locales/de.js',
    'English': '../locales/en.js',
}

# Merge google files: texts and disclosure rows share one table indexed by ID.
texts_data = pd.read_json('texts.json')
disclosure_data = pd.read_json('disclosure.json')

data = pd.concat([texts_data, disclosure_data], ignore_index=True)
data = data.set_index('ID')

# Process glossary data: one {ID: {column: value}} lookup per language.
glossary = {}
glossary_data = pd.read_json('glossary.json')
for language in files:
    language_data = glossary_data.loc[glossary_data.language == language].set_index('ID').drop('language', axis=1)
    glossary[language] = language_data.to_dict('index')


def add_glossary_markup(html, entries, language):
    """Return ``html`` with tooltip titles and link targets filled in.

    Args:
        html: Markup of a single text entry.
        entries: Glossary lookup for ``language``, keyed by the value of a
            tooltip's ``ref`` attribute.
        language: Language name, only used in the warning message.

    Returns:
        The serialised markup. Elements with class ``tooltip`` whose ``ref``
        is a known glossary ID get that entry's description as ``title``
        (a warning is printed for unknown refs), and every ``<a>`` gets
        ``target="_blank"``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    for tooltip in soup.find_all(class_="tooltip"):
        if not tooltip.has_attr('ref'):
            continue

        if tooltip['ref'] in entries:
            tooltip['title'] = entries[tooltip['ref']]['description']
        else:
            print('Warning: glossary entry not found (%s: %s)' % (language, tooltip['ref']))

    for link in soup.find_all('a'):
        link['target'] = '_blank'

    return str(soup)


# Process text data
for column in files:
    # add <br>; regex=False because '\n' is meant as a literal newline
    with_breaks = data[column].str.replace('\n', '<br>', regex=False)

    # add title tags. The whole column is rebuilt and assigned at once:
    # Series.iteritems() no longer exists in pandas 2.x, and writing through
    # data[column][index] is chained assignment, which is not guaranteed to
    # reach `data`.
    data[column] = [
        add_glossary_markup(html, glossary[column], column)
        for html in with_breaks
    ]

# output language files

for (column, filepath) in files.items():
    entries = data[column].to_dict()
    output = 'export default ' + json.dumps(entries, indent=2)

    # The with-statement closes the file; no explicit close() is needed.
    with open(filepath, 'w') as file:
        file.write(output)

    print('Wrote file %s' % filepath)

# Done
print('Done.')




Wrote file ../locales/de.js
Wrote file ../locales/en.js
Done.


Add structured version of links file (plain list to dict with list for each type):

In [3]:
import pandas as pd
import json

type_key = 'Type'
output_file = './links_structured.json'

# Turn the flat list of link records into {type: [records of that type]},
# with the types in order of first appearance.
df = pd.read_json('./links.json')
data = {
    link_type: df.loc[df[type_key] == link_type].to_dict('records')
    for link_type in df[type_key].unique()
}

with open(output_file, 'w') as file:
    json.dump(data, file, indent=2)
    print('Wrote file %s' % output_file)

print('Done.')

Wrote file ./links_structured.json
Done.


## Process images

Compress fallback images

In [4]:
import os, pathlib, shutil
from PIL import Image 

# settings
# Directory tree that is walked for raw fallback images.
fallback_image_dirname = './_raw/fallback'
# Root directory the processed copies are written under.
destination_dirname = '../assets/fallback'

# helper functions
def copy_file(src_fpath, dest_fpath):
    """Copy ``src_fpath`` to ``dest_fpath``, creating missing parent directories.

    Args:
        src_fpath: Path of the existing file to copy.
        dest_fpath: Path of the copy to create; may be a bare file name.
    """
    dest_dir = os.path.dirname(dest_fpath)
    # os.makedirs('') raises FileNotFoundError, so only create a directory
    # when the destination actually has a parent component.
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)
    shutil.copy(src_fpath, dest_fpath)

def compress_image(filepath):
    """Shrink the image at ``filepath`` to a third of its dimensions, in place.

    The file is overwritten with the downscaled image and saved with a dpi
    setting of 150x150.

    Args:
        filepath: Path of the image file to shrink; it is overwritten.
    """
    image_file = pathlib.Path(filepath)

    # The context manager releases the file handle even when resizing or
    # saving raises.
    with Image.open(image_file) as image:
        dpi = 150, 150
        # Clamp to 1px: dividing a 1px dimension by three rounds to 0, which
        # is not a valid thumbnail size.
        size = max(1, round(image.size[0] / 3)), max(1, round(image.size[1] / 3))

        image.thumbnail(size)
        image.save(image_file, dpi=dpi)
        
# image compression
for path, subdirs, files in os.walk(fallback_image_dirname):
    for name in files:
        # Skip macOS Finder metadata files.
        if name == '.DS_Store':
            continue

        # Define output path: keep the file's location relative to the source
        # directory and re-root it under the destination directory. Using the
        # source directory itself (instead of "the first two path parts")
        # keeps this correct for any directory depth and avoids the
        # multi-argument form of relative_to(), deprecated in Python 3.12.
        source_path = pathlib.PurePath(path, name)
        output_path = pathlib.PurePath(destination_dirname, source_path.relative_to(fallback_image_dirname))

        # Copy and compress image
        copy_file(source_path, output_path)
        compress_image(output_path)

print('Done.')

Done.


Compress disclosure figures and convert them to JPEG

In [5]:
import os, pathlib, shutil
from PIL import Image, ImageColor

# settings
# Directory tree that is walked for raw disclosure figures.
disclosure_image_dirname = './_raw/disclosure'
# Root directory the processed copies are written under.
destination_dirname = '../assets/disclosure'

# helper functions
def copy_file(src_fpath, dest_fpath):
    """Copy ``src_fpath`` to ``dest_fpath``, creating missing parent directories.

    Args:
        src_fpath: Path of the existing file to copy.
        dest_fpath: Path of the copy to create; may be a bare file name.
    """
    dest_dir = os.path.dirname(dest_fpath)
    # os.makedirs('') raises FileNotFoundError, so only create a directory
    # when the destination actually has a parent component.
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)
    shutil.copy(src_fpath, dest_fpath)

def compress_image(filepath):
    """Downscale the image at ``filepath`` in place to fit a 1200x1200 box.

    The file is overwritten and saved with a dpi setting of 150x150.

    Args:
        filepath: Path of the image file to shrink; it is overwritten.
    """
    image_file = pathlib.Path(filepath)

    # The context manager releases the file handle even when resizing or
    # saving raises.
    with Image.open(image_file) as image:
        dpi = 150, 150
        size = 1200, 1200

        image.thumbnail(size)
        image.save(image_file, dpi=dpi)

def convert_image(png_filepath, jpg_filepath, background_color):
    """Flatten an image onto a solid background and save it as a JPEG.

    Args:
        png_filepath: Path of the source image.
        jpg_filepath: Path the flattened image is written to.
        background_color: Colour string accepted by ``ImageColor.getrgb``
            (e.g. ``'#F9FAFF'``); it fills the transparent areas.
    """
    image_file = pathlib.Path(png_filepath)

    with Image.open(image_file) as source:
        # Give every input a real alpha band before it is used as its own
        # paste mask: paste() raises for mask modes such as plain RGB or
        # palette images, which would abort the run on a PNG without
        # transparency.
        image = source.convert('RGBA')

    # add background by pasting it to another image
    new_image = Image.new("RGBA", image.size, ImageColor.getrgb(background_color))
    new_image.paste(image, (0, 0), image)

    # JPEG has no alpha channel, so drop it before saving.
    new_image.convert('RGB').save(jpg_filepath, quality=95, optimize=True, progressive=True)
        
# image compression
for path, subdirs, files in os.walk(disclosure_image_dirname):
    for name in files:
        # Skip macOS Finder metadata files.
        if name == '.DS_Store':
            continue

        # Define output path: keep the file's location relative to the source
        # directory and re-root it under the destination directory. Using the
        # source directory itself (instead of "the first two path parts")
        # keeps this correct for any directory depth and avoids the
        # multi-argument form of relative_to(), deprecated in Python 3.12.
        source_path = pathlib.PurePath(path, name)
        png_output_path = pathlib.PurePath(destination_dirname, source_path.relative_to(disclosure_image_dirname))

        # Copy and compress image
        copy_file(source_path, png_output_path)
        compress_image(png_output_path)

        # Convert image to jpeg
        parentdir, filename = os.path.split(png_output_path)
        filestem, _ = os.path.splitext(filename)

        jpg_output_path = os.path.join(parentdir, filestem + '.jpg')
        convert_image(png_output_path, jpg_output_path, '#F9FAFF')

        # Remove the intermediate copy, unless it is the very file the
        # conversion just wrote (source already named *.jpg, or differing
        # only by case on a case-insensitive file system).
        if not os.path.samefile(png_output_path, jpg_output_path):
            os.remove(png_output_path)

print('Done.')

Done.


Copy grid images and convert selected ones to JPEG (compression is currently disabled)

In [6]:
import os, pathlib, shutil
from PIL import Image, ImageColor

# settings
# Directory tree that is walked for raw grid images.
grid_image_dirname = './_raw/grid'
# Root directory the processed copies are written under.
destination_dirname = '../assets/grid'

# helper functions
def copy_file(src_fpath, dest_fpath):
    """Copy ``src_fpath`` to ``dest_fpath``, creating missing parent directories.

    Args:
        src_fpath: Path of the existing file to copy.
        dest_fpath: Path of the copy to create; may be a bare file name.
    """
    dest_dir = os.path.dirname(dest_fpath)
    # os.makedirs('') raises FileNotFoundError, so only create a directory
    # when the destination actually has a parent component.
    if dest_dir:
        os.makedirs(dest_dir, exist_ok=True)
    shutil.copy(src_fpath, dest_fpath)

def compress_image(filepath):
    """Shrink the image at ``filepath`` to half of its dimensions, in place.

    The file is overwritten with the downscaled image and saved with a dpi
    setting of 150x150.

    Args:
        filepath: Path of the image file to shrink; it is overwritten.
    """
    image_file = pathlib.Path(filepath)

    # The context manager releases the file handle even when resizing or
    # saving raises.
    with Image.open(image_file) as image:
        dpi = 150, 150
        # Clamp to 1px: halving a 1px dimension rounds to 0, which is not a
        # valid thumbnail size.
        size = max(1, round(image.size[0] / 2)), max(1, round(image.size[1] / 2))

        image.thumbnail(size)
        image.save(image_file, dpi=dpi)

def convert_image(png_filepath, jpg_filepath, background_color):
    """Flatten an image onto a solid background and save it as a JPEG.

    Args:
        png_filepath: Path of the source image.
        jpg_filepath: Path the flattened image is written to.
        background_color: Colour string accepted by ``ImageColor.getrgb``
            (e.g. ``'#F9FAFF'``); it fills the transparent areas.
    """
    image_file = pathlib.Path(png_filepath)

    with Image.open(image_file) as source:
        # Give every input a real alpha band before it is used as its own
        # paste mask: paste() raises for mask modes such as plain RGB or
        # palette images, which would abort the run on a PNG without
        # transparency.
        image = source.convert('RGBA')

    # add background by pasting it to another image
    new_image = Image.new("RGBA", image.size, ImageColor.getrgb(background_color))
    new_image.paste(image, (0, 0), image)

    # JPEG has no alpha channel, so drop it before saving.
    new_image.convert('RGB').save(jpg_filepath, quality=95, optimize=True, progressive=True)

# image compression
# File names that are flattened to JPEG; every other grid image is copied
# unchanged. Defined once instead of being rebuilt for every file.
jpg_conversions = [
    'chapter3_frauenmitbehinderung.png',
    'chapter3_frauenmithijab.png',
    'chapter3_sintiundroma.png',
    'dark_gray.png'
]

for path, subdirs, files in os.walk(grid_image_dirname):
    for name in files:
        # Skip macOS Finder metadata files.
        if name == '.DS_Store':
            continue

        # Define output path and copy file. The location relative to the
        # source directory is re-rooted under the destination directory; using
        # the source directory itself (instead of "the first two path parts")
        # keeps this correct for any directory depth and avoids the
        # multi-argument form of relative_to(), deprecated in Python 3.12.
        source_path = pathlib.PurePath(path, name)
        png_output_path = pathlib.PurePath(destination_dirname, source_path.relative_to(grid_image_dirname))
        copy_file(source_path, png_output_path)

        # Compress image
        # compress_image(png_output_path)

        # Convert image to jpeg
        if name in jpg_conversions:
            parentdir, filename = os.path.split(png_output_path)
            filestem, _ = os.path.splitext(filename)
            jpg_output_path = os.path.join(parentdir, filestem + '.jpg')

            convert_image(png_output_path, jpg_output_path, '#F9FAFF')
            # The PNG copy is only an intermediate once the JPEG exists.
            os.remove(png_output_path)

print('Done.')

Done.
