Tento Jupyter notebook nám umožňuje stiahnuť všetky relevantné zmluvy (vo formáte PDF), nad ktorými budeme následne robiť analýzu. Ako vstup berie tabuľku **CRZ_DB_clean.csv**, zmluvy uloží do priečinka `contracts`.

Pred samotnou analýzou zmlúv musíme vytvoriť tabuľku, v ktorej každý riadok obsahuje jeden dokument viažuci sa k relevantnej zmluve. Niektoré zmluvy totiž môžu mať viac dokumentov alebo príloh.

In [None]:
import os
import pandas as pd
import ast
import re

# Unroll the contract table so that every output row describes exactly one
# document: either an attachment of the contract itself or an attachment of
# one of its supplements. The result is written to
# 'CRZ_DB_with_supplements_unrolled.csv'.
DB_clean = pd.read_csv('CRZ_DB_clean.csv', delimiter='|')

# Contract-level columns that are copied unchanged into every output row.
header_export = ['Nazov', 'ID', 'Inner-ID', 'Objednavatel_ICO',
    'Objednavatel', 'Objednavatel_adresa', 'Dodavatel_ICO', 'Dodavatel',
    'Dodavatel_adresa', 'Datum_zverejnenia', 'Datum_podpisu',
    'Datum_platnosti', 'Datum_ucinnosti', 'Posledna_zmena', 'Cena_konecna',
    'Cena_podpisana', 'Rezort', 'Typ', 'Stav']

# Columns kept from the input: the exported ones plus the two columns holding
# the attachment list ('Prilohy') and the supplement list ('Dodatky').
header_import = header_export + ['Prilohy', 'Dodatky']

DB_clean = DB_clean.drop(DB_clean.columns.difference(header_import), axis=1)
number_of_contracts = DB_clean.shape[0]

row_list = []
for index, row in DB_clean.iterrows():
    print('Processing contract:',index+1,'out of',number_of_contracts)

    # Contract-level values shared by all documents of this contract.
    contract_fields = {label: row[label] for label in header_export}

    # 'Prilohy' holds a Python-literal list of attachments.
    attachments = ast.literal_eval(row['Prilohy'])

    supplements = []
    if not pd.isnull(row['Dodatky']):
        # A bare nan token is not a valid Python literal, so quote it before
        # handing the string to literal_eval.
        supplements = ast.literal_eval(row['Dodatky'].replace(' nan,',' "nan",'))

    for attachment_number, attachment in enumerate(attachments, start=1):
        # Build a NEW dict for every document. Appending one shared dict and
        # mutating it afterwards would make every row of the contract alias
        # the same object and end up with the values of the last document.
        row_list.append({
            **contract_fields,
            'Dodatok': 'FALSE',
            'Dodatok_nazov': '',
            'Dodatok_ID': '',
            'Dodatok_cislo': '',
            'Dodatok_datum_podpisu': '',
            'Dodatok_datum_ucinnosti': '',
            'Dodatok_datum_platnosti': '',
            'Dodatok_poznamka': '',
            'Dodatok_link': '',
            'Priloha_ID': attachment[0],
            'Priloha_nazov': attachment[1],
            'Priloha_link': attachment[2],
            'Priloha_velkost': attachment[3],
            'Priloha_cislo': attachment_number,
        })

    for supplement_number, supplement in enumerate(supplements, start=1):
        # Element 9 of a supplement is itself a Python-literal string holding
        # the list of that supplement's attachments.
        supplement_attachments = ast.literal_eval(supplement[9])

        for supplement_attachment_number, attachment in enumerate(supplement_attachments, start=1):
            # Supplement attachments are (link, name) pairs; no ID or size is
            # read for them, so those fields stay empty.
            row_list.append({
                **contract_fields,
                'Dodatok': 'TRUE',
                'Dodatok_nazov': supplement[0],
                'Dodatok_ID': supplement[1],
                'Dodatok_cislo': supplement_number,
                'Dodatok_datum_podpisu': supplement[5],
                'Dodatok_datum_ucinnosti': supplement[6],
                'Dodatok_datum_platnosti': supplement[7],
                'Dodatok_poznamka': supplement[8],
                'Dodatok_link': supplement[10],
                'Priloha_ID': '',
                'Priloha_nazov': attachment[1],
                'Priloha_link': attachment[0],
                'Priloha_velkost': '',
                'Priloha_cislo': supplement_attachment_number,
            })

DB = pd.DataFrame(row_list)
DB.to_csv('CRZ_DB_with_supplements_unrolled.csv', sep = '|')

Stiahnutie relevantných zmlúv.

In [None]:
import os
import urllib.request
import numpy as np
import pandas as pd
import ast
import re

# Collect the download link and target file name of every contract attachment
# and supplement attachment, then download them all into the 'contracts'
# folder.

# Make sure the target folder for the downloaded PDFs exists.
os.makedirs('contracts', exist_ok=True)

# Build the path with os.path.join so it works on every operating system.
# The trailing empty component keeps a trailing path separator, so a file name
# can still be appended by plain string concatenation.
working_dir = os.path.join(os.getcwd(), 'contracts', '')

DB_clean    = pd.read_csv('CRZ_DB_clean.csv', delimiter = '|')
number_of_contracts = DB_clean.shape[0]

download_link = []
download_name = []
download_size = []   # cumulative size (MB) up to and including each document

size     = 0         # running total of the contract attachment sizes in MB

for i in range(number_of_contracts):

    # NOTE(review): columns are addressed by position (2, 20, 21); presumably
    # these are the contract ID, 'Prilohy' and 'Dodatky' columns of
    # CRZ_DB_clean.csv -- confirm against the file layout.
    attachments = ast.literal_eval(DB_clean.iloc[i,20])
    contract_ID = str(DB_clean.iloc[i,2])

    for attachment in attachments:

        contract_attachment_ID        = str(attachment[0])
        contract_attachment_PDF       = attachment[2]
        contract_attachment_size      = int(attachment[3])

        size += contract_attachment_size/1000000

        download_name.append('CRZ_'+contract_ID+'_'+contract_attachment_ID+'_contract.pdf')
        download_link.append(contract_attachment_PDF)
        download_size.append(size)

    supplements = []
    if not pd.isnull(DB_clean.iloc[i,21]):
        # A bare nan token is not a valid Python literal, so quote it before
        # handing the string to literal_eval.
        supplements = ast.literal_eval(DB_clean.iloc[i,21].replace(' nan,',' "nan",'))

    for supplement in supplements:
        supplement_ID = str(supplement[1])

        supplement_attachments = ast.literal_eval(supplement[9])

        for supplement_attachment_number, attachment in enumerate(supplement_attachments, start=1):
            download_name.append('CRZ_'+contract_ID+'_'+supplement_ID+'_'+str(supplement_attachment_number)+'_supplement.pdf')
            download_link.append(attachment[0])
            # No size is read for supplement documents, so they add nothing to
            # the total. Record the current running total (not 0) so that the
            # "Rest size" progress value keeps decreasing monotonically.
            download_size.append(size)

number_of_attachments = len(download_link)

print('Documents to download :', number_of_attachments)
print('Total size            : %.2f MB' % (size),' (without supplemenents)')

print('Download started ...')

for document_number, (link, name, downloaded) in enumerate(zip(download_link, download_name, download_size), start=1):
    print('Downloading document: %d out of %d | Rest size: %.2f MB' % (document_number,number_of_attachments,size-downloaded))
    urllib.request.urlretrieve(link, working_dir+name)
