In [1]:
import os
import boto
import boto.s3.connection
from boto.s3.key import Key
from smart_open import s3_iter_bucket

In [2]:
from olive_importer import detect_journal_issues, canonical_path

In [3]:
# Read the object-store credentials from the environment so that no secret is
# stored in the notebook; a missing variable raises KeyError.
access_key = os.environ["SE_ACCESS_KEY"]
secret_key = os.environ["SE_SECRET_KEY"]
# Connect to the S3-compatible endpoint given by `host` below.
conn = boto.connect_s3(
        aws_access_key_id = access_key,
        aws_secret_access_key = secret_key,
        host = 'os.zhdk.cloud.switch.ch',
        # NOTE(review): OrdinaryCallingFormat presumably selects path-style bucket URLs; confirm against the boto docs
        calling_format = boto.s3.connection.OrdinaryCallingFormat(),
)

In [4]:
# Obtain a handle on the test bucket.
# NOTE(review): presumably create_bucket is used so this works whether or not
# the bucket already exists; confirm the endpoint tolerates re-creation.
b = conn.create_bucket('canonical-json-test')

In [6]:
# Materialise the bucket listing to see which keys it currently holds.
list(b.list())

[<Key: canonical-json-test,out/GDL/1900/01/02/a/GDL-1900-01-02-a-info.json>,
 <Key: canonical-json-test,out/GDL/1900/01/02/a/GDL-1900-01-02-a-p0001.json>,
 <Key: canonical-json-test,out/GDL/1900/01/02/a/GDL-1900-01-02-a-p0002.json>,
 <Key: canonical-json-test,out/GDL/1900/01/02/a/GDL-1900-01-02-a-p0003.json>,
 <Key: canonical-json-test,out/GDL/1900/01/02/a/GDL-1900-01-02-a-p0004.json>,
 <Key: canonical-json-test,out/GDL/1900/01/03/a/GDL-1900-01-03-a-info.json>,
 <Key: canonical-json-test,out/GDL/1900/01/03/a/GDL-1900-01-03-a-p0001.json>,
 <Key: canonical-json-test,out/GDL/1900/01/03/a/GDL-1900-01-03-a-p0002.json>,
 <Key: canonical-json-test,out/GDL/1900/01/03/a/GDL-1900-01-03-a-p0003.json>,
 <Key: canonical-json-test,out/GDL/1900/01/03/a/GDL-1900-01-03-a-p0004.json>,
 <Key: canonical-json-test,out/GDL/1900/01/04/a/GDL-1900-01-04-a-info.json>,
 <Key: canonical-json-test,out/GDL/1900/01/04/a/GDL-1900-01-04-a-p0001.json>,
 <Key: canonical-json-test,out/GDL/1900/01/04/a/GDL-1900-01-04-a-p0

In [9]:
# Detect the journal issues available under the local "out/" directory.
issues = detect_journal_issues("out/")

In [31]:
# Check the path recorded for the first detected issue.
issues[0].path

'out/GDL/1900/01/02'

In [36]:
# Collect the path of every file belonging to the detected issues.
# Each path is built from the directory that is actually listed, so the
# edition is no longer hard-coded to "a" and every collected path points at a
# file that exists. Entries are sorted because os.listdir returns them in
# arbitrary order.
files = [
    os.path.join(issue_dir, f)
    for issue_dir in (
        os.path.join("out", canonical_path(i, path_type="dir"))
        for i in issues
    )
    for f in sorted(os.listdir(issue_dir))
]

In [37]:
# Inspect the collected file paths.
files

['out/GDL/1900/01/02/a/GDL-1900-01-02-a-info.json',
 'out/GDL/1900/01/02/a/GDL-1900-01-02-a-p0001.json',
 'out/GDL/1900/01/02/a/GDL-1900-01-02-a-p0002.json',
 'out/GDL/1900/01/02/a/GDL-1900-01-02-a-p0003.json',
 'out/GDL/1900/01/02/a/GDL-1900-01-02-a-p0004.json',
 'out/GDL/1900/01/03/a/GDL-1900-01-03-a-info.json',
 'out/GDL/1900/01/03/a/GDL-1900-01-03-a-p0001.json',
 'out/GDL/1900/01/03/a/GDL-1900-01-03-a-p0002.json',
 'out/GDL/1900/01/03/a/GDL-1900-01-03-a-p0003.json',
 'out/GDL/1900/01/03/a/GDL-1900-01-03-a-p0004.json',
 'out/GDL/1900/01/04/a/GDL-1900-01-04-a-info.json',
 'out/GDL/1900/01/04/a/GDL-1900-01-04-a-p0001.json',
 'out/GDL/1900/01/04/a/GDL-1900-01-04-a-p0002.json',
 'out/GDL/1900/01/04/a/GDL-1900-01-04-a-p0003.json',
 'out/GDL/1900/01/04/a/GDL-1900-01-04-a-p0004.json',
 'out/GDL/1900/01/05/a/GDL-1900-01-05-a-info.json',
 'out/GDL/1900/01/05/a/GDL-1900-01-05-a-p0001.json',
 'out/GDL/1900/01/05/a/GDL-1900-01-05-a-p0002.json',
 'out/GDL/1900/01/05/a/GDL-1900-01-05-a-p0003.json

In [38]:
# Upload every collected file to the bucket; the object key is the file's
# original path, so the bucket mirrors the local directory layout.
for local_path in files:
    s3_key = Key(b)  # new, still unnamed key bound to the bucket
    s3_key.key = local_path  # name the key after the original filename (with path)
    s3_key.set_contents_from_filename(local_path)  # send the file's content
    s3_key.close()

In [8]:
# Iterate over the bucket entries under the "out/GDL/1900/" prefix and print
# each key's name and size. `content` is yielded alongside the key but is not
# used here.
for key, content in s3_iter_bucket(b, prefix="out/GDL/1900/"):
    print(key.name, key.size)

out/GDL/1900/01/02/a/GDL-1900-01-02-a-p0004.json 92143
out/GDL/1900/01/04/a/GDL-1900-01-04-a-p0002.json 832407
out/GDL/1900/01/05/a/GDL-1900-01-05-a-info.json 13768
out/GDL/1900/01/02/a/GDL-1900-01-02-a-info.json 14782
out/GDL/1900/01/03/a/GDL-1900-01-03-a-p0003.json 783433
out/GDL/1900/01/03/a/GDL-1900-01-03-a-p0001.json 679561
out/GDL/1900/01/03/a/GDL-1900-01-03-a-p0004.json 62885
out/GDL/1900/01/05/a/GDL-1900-01-05-a-p0004.json 15
out/GDL/1900/01/03/a/GDL-1900-01-03-a-p0002.json 720926
out/GDL/1900/01/02/a/GDL-1900-01-02-a-p0003.json 755319
out/GDL/1900/01/04/a/GDL-1900-01-04-a-p0004.json 72513
out/GDL/1900/01/04/a/GDL-1900-01-04-a-info.json 12858
out/GDL/1900/01/03/a/GDL-1900-01-03-a-info.json 13170
out/GDL/1900/01/05/a/GDL-1900-01-05-a-p0003.json 796982
out/GDL/1900/01/05/a/GDL-1900-01-05-a-p0001.json 684448
out/GDL/1900/01/05/a/GDL-1900-01-05-a-p0002.json 841273
out/GDL/1900/01/04/a/GDL-1900-01-04-a-p0001.json 677649
out/GDL/1900/01/02/a/GDL-1900-01-02-a-p0002.json 813049
out/GDL

## data acquisition

In [6]:
from olive_importer import detect_journal_issues

  return f(*args, **kwds)


In [93]:
from datetime import date

In [7]:
# Detect the journal issues available under the local "sample_data/" directory.
issues = detect_journal_issues("sample_data/")

In [8]:
# Inspect the detected issues.
issues

[IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 2), edition='a', path='sample_data/GDL/1900/01/02'),
 IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 3), edition='a', path='sample_data/GDL/1900/01/03'),
 IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 4), edition='a', path='sample_data/GDL/1900/01/04'),
 IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 5), edition='a', path='sample_data/GDL/1900/01/05')]

In [86]:
def parse_filter(filter_string):
    """Parse a ``key=value;key=value`` filter expression into a dictionary.

    Segments are separated by ``;`` and each segment is split on its first
    ``=`` only, so a value may itself contain ``=``. Keys and values are
    stripped of surrounding whitespace. Empty segments (an empty input string
    or a trailing ``;``) are ignored. When a key occurs more than once, the
    last occurrence wins.

    :param filter_string: the filter expression, e.g.
        ``"journal=GDL; date=1900/01/01-1910/01/03"``.
    :return: a dict mapping each filter name to its (string) value.
    :raises ValueError: if a non-empty segment contains no ``=``.
    """
    filters = {}

    for segment in filter_string.split(";"):
        # Tolerate "", "a=b;" and "a=b;;c=d" instead of failing on the empty piece.
        if not segment.strip():
            continue

        key, separator, value = segment.partition("=")
        if not separator:
            raise ValueError(
                "Invalid filter segment (expected key=value): %r" % segment
            )

        filters[key.strip()] = value.strip()

    return filters
    

In [113]:
def apply_filters(filter_dict, issues):
    """Return the issues that satisfy every criterion in ``filter_dict``.

    Supported criteria:

    * ``"journal"``: keep issues whose ``journal`` attribute equals the value.
    * ``"date"``: either a single day ``YYYY/MM/DD`` (exact match on the
      issue's ``date`` attribute) or an inclusive range
      ``YYYY/MM/DD-YYYY/MM/DD``.

    Criteria are combined: an issue must satisfy all the ones given. Other
    keys in ``filter_dict`` are ignored.

    :param filter_dict: dict of criteria, as produced by ``parse_filter``.
    :param issues: list of objects exposing ``journal`` and ``date`` attributes.
    :return: a new list with the matching issues; ``issues`` is never modified.
    :raises ValueError: if a date cannot be parsed or a range does not consist
        of exactly two dates.
    """

    def to_date(text):
        # "1900/01/02" -> date(1900, 1, 2); int() tolerates surrounding spaces.
        return date(*[int(x) for x in text.split("/")])

    print(filter_dict)

    # Always work on a copy so that the caller's list is never returned or
    # mutated, even when no criterion applies.
    filtered_issues = list(issues)

    if "journal" in filter_dict:
        filtered_issues = [
            i
            for i in filtered_issues
            if i.journal == filter_dict["journal"]
        ]

    if "date" in filter_dict:
        date_filter = filter_dict["date"]

        # date filter is a range (dates use "/" internally, so "-" is unambiguous)
        if "-" in date_filter:
            start, end = date_filter.split("-")
            start = to_date(start)
            end = to_date(end)
            print(start, end)
            filtered_issues = [
                i
                for i in filtered_issues
                if start <= i.date <= end
            ]

        # date filter is a single day: narrow the current selection (the
        # previous code appended matches to it and referenced an undefined name)
        else:
            filter_date = to_date(date_filter)
            filtered_issues = [
                i
                for i in filtered_issues
                if i.date == filter_date
            ]

    return filtered_issues

In [114]:
# Example filter: an inclusive date range written as YYYY/MM/DD-YYYY/MM/DD.
issue_filter = "date=1900/01/01-1910/01/03"

In [122]:
# Example filter: restrict the issues to a single journal.
issue_filter = "journal=JDG"

In [123]:
# Turn the filter string into a dict of criteria.
f = parse_filter(issue_filter)

In [124]:
# Inspect the parsed filter.
f

{'journal': 'JDG'}

In [125]:
# Apply the parsed filter to the detected issues.
apply_filters(f, issues)

{'journal': 'JDG'}


[]

In [44]:
# Check how the date of an issue is represented.
issues[0].date

datetime.date(1900, 1, 2)

In [49]:
# Sanity check: build a date from a "YYYY/MM/DD" string, the same expression
# apply_filters uses to parse date filters.
date(*[int(x) for x in "1950/01/01".split("/")])

[{'font': 'Times-Roman',
  'font_size': 10.0,
  'id': '001',
  'rgb_color': [0, 0, 0]},
 {'font': 'Helvetica-BoldOblique',
  'font_size': 13.0,
  'id': '002',
  'rgb_color': [0, 0, 0]},
 {'font': 'Times-Bold',
  'font_size': 15.0,
  'id': '003',
  'rgb_color': [0, 0, 0]},
 {'font': 'Times-Bold',
  'font_size': 11.0,
  'id': '004',
  'rgb_color': [0, 0, 0]},
 {'font': 'Helvetica-Bold',
  'font_size': 11.0,
  'id': '005',
  'rgb_color': [0, 0, 0]},
 {'font': 'Helvetica-Bold',
  'font_size': 7.0,
  'id': '006',
  'rgb_color': [0, 0, 0]},
 {'font': 'Times-Italic',
  'font_size': 10.0,
  'id': '007',
  'rgb_color': [0, 0, 0]},
 {'font': 'Times-Bold',
  'font_size': 121.99,
  'id': '008',
  'rgb_color': [0, 0, 0]},
 {'font': 'Times-Bold',
  'font_size': 121.964,
  'id': '009',
  'rgb_color': [0, 0, 0]},
 {'font': 'Times-Bold',
  'font_size': 122.009,
  'id': '010',
  'rgb_color': [0, 0, 0]},
 {'font': 'Times-Bold',
  'font_size': 36.0,
  'id': '011',
  'rgb_color': [0, 0, 0]},
 {'font': 'Tim

## Preparing input for passim (XML to JSON Lines)

In [1]:
import codecs
import jsonlines
from bs4 import BeautifulSoup

In [5]:
# Sample XML files used as input below.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook elsewhere.
sample_path = [
    "/Users/rromanello/Downloads/GDL-1998-01-01-60.xml",
    "/Users/rromanello/Downloads/GDL-1998-01-01-100.xml"
]

In [33]:
# Read the first sample file for interactive exploration. `sample_path` is a
# list of paths, so it cannot be handed to codecs.open directly (TypeError).
with codecs.open(sample_path[0], 'r', 'utf-8') as inp_file:
    xml = inp_file.read()

In [34]:
# Parse the XML text with BeautifulSoup, using the 'lxml' parser.
soup = BeautifulSoup(xml, 'lxml')

In [35]:
# Collect every <entity> element of the parsed document.
entities = soup.findAll('entity')

In [38]:
# Look at the first entity's <full_text> element, serialised as a string.
str(entities[0].find('full_text'))

'<full_text>CINEMAS &amp; Suite des cinémas de la page 23 LA Confidential, de Curtis Hanson, av. Kim Basin ger, Kevin Spacey ; (14 / 16 ans) me / je / di v. o. 17.00, 20.15, ve / sa v. o. 17.00, 20.15, 23.00 (Udo 1). Ghetto, de Thomas Imbach ; (12 /^ 4 ans)\n            me / je en dialecte 17.30, ve-di relâche (Udo 2). 187 (One Eight Seven) de Kevin Reynolds, av. Sa muel L Jackson, John Heard, Kelly Rowan ; (12 / 14 ans) me / je v. o. 20.15, ve-di relâche (Udo 2). Rien à perdre (Nothing to lose) de Steve Œdekerk av.\n            Martin Lawrence, Tim Robbins, Kelly Preston ; (10 / 12 ans) me / je relâche, ve / sa v. o. 17.30, 20.30 22.45, di- v. o. 17.30, 20.30 (Udo 2). 32 a, r. Centrale, 032 / 323 66 55 UDO 1-2 Maman, je m\'occupe des méchants (Home Alone 3) de Raja Gosnell,\n            av. Alex D. Linz, Olek Kru pa, Kevin Kilner ; (7 / 5 ans) me / je v. all. 15.00, p. fr. 17.15, v. o. 20.15, ve-di v. all. 15.00, p. fr. 17.15. Double Team, de Tsui Hark, av. jean-Claude Van Damme, Denn

In [41]:
# Check the type of the <full_text> element's text content.
type(entities[0].find('full_text').text)

str

In [33]:
def XML_to_passim_JSON(xml_files, out_dir):
    """Turn XML files into a list of documents carrying their full text.

    Every file is read as UTF-8 and parsed with BeautifulSoup ('lxml' parser).
    The ``<full_text>`` element of each ``<entity>`` is serialised (tags
    included, with the "xml" output formatter) and the serialisations of one
    file are joined with newlines into a single ``fulltext`` string.

    Entities without a ``<full_text>`` child are skipped, and a file that
    yields no ``<full_text>`` element at all produces no document.

    :param xml_files: iterable of paths to the XML files to convert.
    :param out_dir: currently unused; kept so existing callers keep working.
    :return: list of ``{"fulltext": <str>}`` dicts, one per file that
        contained at least one ``<full_text>`` element, in input order.
    """
    documents = []

    for path in xml_files:

        with codecs.open(path, 'r', 'utf-8') as inp_file:
            xml = inp_file.read()

        soup = BeautifulSoup(xml, 'lxml')

        # find() returns None for an <entity> that has no <full_text> child;
        # drop those instead of failing on None.encode(...).
        candidates = (e.find('full_text') for e in soup.find_all('entity'))
        full_text_elements = [el for el in candidates if el is not None]

        if not full_text_elements:
            continue

        # encode() yields UTF-8 bytes, hence the decode back to text.
        fulltext = "\n".join(
            el.encode(formatter="xml").decode('utf-8')
            for el in full_text_elements
        )
        documents.append(
            {
                "fulltext": fulltext
            }
        )
    return documents

In [36]:
# Convert the sample XML files into documents; the second argument (out_dir)
# is not used by the function, hence the empty string.
docs = XML_to_passim_JSON(sample_path, "")

In [37]:
# Write the documents to a JSON Lines file.
# NOTE(review): absolute, machine-specific output path; adjust before running elsewhere.
with jsonlines.open('/Users/rromanello/Downloads/test.jsonl', mode='w') as writer:
    writer.write_all(docs)