In [101]:
from bs4 import BeautifulSoup
import os 
import glob
import time
import re
import numpy as np
from tqdm import tqdm
import pandas as pd

In [93]:
def get_metadata(f):
    """Read an XML file and return its ``<metadata>`` elements as one string.

    Parameters
    ----------
    f : str
        Path of the file to read.

    Returns
    -------
    str
        ``str()`` of the list returned by ``soup.find_all("metadata")``,
        i.e. the serialized elements wrapped in a list representation.
    """
    # A context manager closes the file handle deterministically; the
    # previous ``open(f).read()`` left one open handle per parsed file.
    with open(f) as handle:
        contents = handle.read()
    soup = BeautifulSoup(contents, 'lxml')
    return str(soup.find_all("metadata"))

def get_artist(metadata):
    """Return every ``<dc:creator>`` value found in ``metadata``, comma-joined.

    An empty string is returned when there is no creator element.
    """
    creators = re.findall('<dc:creator>(.*?)</dc:creator>', metadata)
    return ','.join(creators)

def get_type(metadata):
    """Collect the text of all ``<dc:type>`` elements as one comma-separated string."""
    matches = re.finditer('<dc:type>(.*?)</dc:type>', metadata)
    return ','.join(match.group(1) for match in matches)

def get_description(metadata):
    """Return the comma-joined contents of all ``<dc:description>`` elements.

    The pattern is compiled without ``re.DOTALL``, so ``.`` stops at a
    newline: an element whose text spans several lines is not captured.
    """
    description_re = re.compile('<dc:description>(.*?)</dc:description>')
    return ','.join(description_re.findall(metadata))

def get_title(metadata):
    """Return all ``<dc:title>`` values in ``metadata`` joined with commas."""
    open_tag, close_tag = '<dc:title>', '</dc:title>'
    pattern = ''.join([open_tag, '(.*?)', close_tag])
    titles = re.findall(pattern, metadata)
    return ','.join(titles)

def get_date(metadata):
    """Extract a start and end value from the ``<dc:date>`` element(s).

    All ``<dc:date>`` values are comma-joined and the result is split on
    ``-``; each part is stripped of surrounding whitespace.

    Parameters
    ----------
    metadata : str
        Text to search for ``<dc:date>...</dc:date>`` elements.

    Returns
    -------
    tuple of (str, str)
        ``(startdate, enddate)``. With at least one hyphen these are the
        first two parts; without a hyphen the single value is used for both.
        ``('', '')`` is returned when no date element is present.
    """
    date = ','.join(re.findall('<dc:date>(.*?)</dc:date>', metadata))
    dates = [part.strip() for part in date.split('-')]
    # ``str.split`` always yields at least one item (``''`` for empty input),
    # so the former trailing ``else`` branch could never run and was removed.
    startdate = dates[0]
    enddate = dates[1] if len(dates) > 1 else dates[0]
    return (startdate, enddate)

def get_material(metadata):
    """Return the materials listed in ``<dc:format>materiaal: ...`` elements.

    Only ``<dc:format>`` elements whose text starts with ``materiaal: `` are
    matched; the captured remainders are joined with commas.
    """
    material_re = re.compile('<dc:format>materiaal: ' + '(.*?)' + '</dc:format>')
    materials = material_re.findall(metadata)
    return ','.join(materials)

def get_id(metadata):
    """Return the text of the first ``<dc:identifier>`` element.

    Raises
    ------
    IndexError
        If ``metadata`` contains no ``<dc:identifier>`` element.
    """
    identifiers = re.findall('<dc:identifier>(.*?)</dc:identifier>', metadata)
    # Plain indexing is kept so a missing identifier surfaces as IndexError.
    return identifiers[0]

def get_url(metadata):
    """Return the URL-like ``<dc:format>`` values found in ``metadata``.

    Returns
    -------
    list of str
        One entry per ``<dc:format>https:...</dc:format>`` element, in
        document order. The literal ``https:`` prefix is part of the match
        anchor, not of the capture group, so each entry starts right after
        it (e.g. ``//host/path``).
    """
    url_re = re.compile('<dc:format>https:' + '(.*?)' + '</dc:format>')
    return url_re.findall(metadata)

def get_place(metadata):
    """Return every ``<dc:coverage>`` value in ``metadata`` as a comma-joined string."""
    places = [
        match.group(1)
        for match in re.finditer('<dc:coverage>(.*?)</dc:coverage>', metadata)
    ]
    return ','.join(places)

def get_dimension(metadata):
    """Return height, width and depth from ``<dc:format>`` elements.

    Looks for ``<dc:format>`` elements whose text starts with ``hoogte ``,
    ``breedte `` and ``diepte `` and takes the first match of each.

    Parameters
    ----------
    metadata : str
        Text to search.

    Returns
    -------
    str
        ``'height,width,depth'``; a dimension that is not present is left
        empty, so input without any dimension yields ``',,'``.
    """
    end = '</dc:format>'
    values = []
    # Each pattern is evaluated once (previously every regex ran twice:
    # once for the truth test and once more to fetch the first match).
    for start in ('<dc:format>hoogte ', '<dc:format>breedte ', '<dc:format>diepte '):
        found = re.findall(start + '(.*?)' + end, metadata)
        values.append(found[0] if found else '')
    return ','.join(values)

In [139]:
# Column-oriented accumulator: one (initially empty) list per output column.
data = {
    column: []
    for column in (
        'id', 'title', 'startYear', 'endYear', 'dimension', 'artist',
        'category', 'material', 'place', 'description', 'url', 'Thumbnailurl',
    )
}
def read_xml(directory='xml2', extension='xml', start=100000):
    """Parse XML files and append their fields to the module-level ``data`` dict.

    Parameters
    ----------
    directory : str, optional
        Folder that is searched for files (default ``'xml2'``).
    extension : str, optional
        File extension to match, without the dot (default ``'xml'``).
    start : int, optional
        Index of the first file of the glob result to process; files before
        it are skipped. The default ``100000`` reproduces the slice that was
        previously hard-coded.

    Returns
    -------
    None
        The function works by side effect: for every processed file one
        value is appended to each list in ``data``.

    Notes
    -----
    ``glob.glob`` returns paths in arbitrary order, so which files fall
    before ``start`` is not guaranteed to be the same on every system.
    """
    result = glob.glob('{}/*.{}'.format(directory, extension))
    for path in tqdm(result[start:]):
        metadata = get_metadata(path)
        # Parse dates and URLs once per file instead of once per column.
        start_year, end_year = get_date(metadata)
        urls = get_url(metadata)
        # Build the complete record before touching ``data``: if any getter
        # raises, no column has been extended yet, so all lists keep the
        # same length and ``pd.DataFrame(data)`` stays constructible.
        record = {
            'id': get_id(metadata),
            'title': get_title(metadata),
            'startYear': start_year,
            'endYear': end_year,
            'dimension': get_dimension(metadata),
            'artist': get_artist(metadata),
            'category': get_type(metadata),
            'material': get_material(metadata),
            'place': get_place(metadata),
            'description': get_description(metadata),
            # A record with fewer than two URLs gets '' instead of raising
            # IndexError half-way through the appends.
            'url': urls[0] if len(urls) > 0 else '',
            'Thumbnailurl': urls[1] if len(urls) > 1 else '',
        }
        for column, value in record.items():
            data[column].append(value)
    

In [1]:
# Fill the module-level `data` lists; read_xml() itself returns nothing.
read_xml()

In [2]:
# Turn the column lists collected in `data` into a DataFrame
# (one column per dict key).
df = pd.DataFrame(data)
# Optional CSV export, currently disabled:
# df.to_csv('rijks.csv')

In [None]:
### Translation:

In [None]:
def remove_punctuation(s):
    """Lower-case ``s`` and replace punctuation and line/tab breaks with spaces.

    Parameters
    ----------
    s : str
        Text to normalise.

    Returns
    -------
    str
        ``s`` with every punctuation character replaced by a single space,
        all other characters lower-cased, and newlines, carriage returns and
        tabs replaced by spaces. Runs of spaces are not collapsed.
    """
    # Raw string: the backslash is meant literally. In a normal string ``\,``
    # is an invalid escape sequence (a SyntaxWarning on recent Python).
    punc = set(r'''!()-[]{};:'"\,<>./?@#$%^&*_~=''')
    s = ''.join(' ' if ch in punc else ch.lower() for ch in s)
    # Replace each separator by a space; pieces that consist of exactly one
    # space are dropped, as before.
    for separator in ('\n', '\r', '\t'):
        s = ' '.join(piece for piece in s.split(separator) if piece != ' ')
    return s

In [None]:
# Phrase to phrase translation:
# NOTE(review): `translated(text, source, target)` is not defined in the
# visible part of this notebook; it must be defined or imported before this cell runs.
# The column is 'material' (singular), matching the key used when `df` is built.
for column in ['title', 'category', 'place', 'material']:
    # Translate the column currently being processed (previously 'title'
    # was read on every pass and the loop variable was shadowed).
    feat_list = df[column].tolist()
    english = []
    for phrase in tqdm(feat_list):
        english.append(translated(phrase, 'dutch', 'english'))
    name = column + '_en'
    # Store the list that was just built (previously an undefined name).
    df[name] = english

# word by word translation of description:
# NOTE(review): `translated(text, source, target)` is not defined in the
# visible part of this notebook; it must be defined or imported before this cell runs.
# Read from the DataFrame: `data['description']` is a plain list and has no `.tolist()`.
description = df['description'].tolist()

# Build the vocabulary of distinct, whitespace-separated words.
description_words = []
for text in description:
    # Missing descriptions (NaN floats after a CSV round-trip) have no words.
    if isinstance(text, str):
        description_words += [word.strip() for word in text.split(' ')]
description_words = list(set(description_words))
# len(description_words)

# Translate each distinct word once; numbers are kept as they are.
description_word_en_1 = []
for word in tqdm(description_words):
    if word.isnumeric():
        description_word_en_1.append(word)
    else:
        # NOTE(review): source language changed from 'french' to 'dutch' to
        # agree with the phrase translation above -- confirm this is intended.
        description_word_en_1.append(translated(word, 'dutch', 'english'))

description_word_dict = dict(zip(description_words, description_word_en_1))

# Re-assemble every description from the translated, purely alphabetic words.
description_english = []
for text in tqdm(description):
    if not isinstance(text, str):
        description_english.append('')
    else:
        translated_words = [
            description_word_dict[word].lower()
            for word in text.split(' ')
            if word.isalpha()
        ]
        description_english.append(' '.join(translated_words))
df['description_en'] = description_english