In [None]:
# Show the interpreter version, then install the packages listed in requirements.txt
# (only the last five lines of pip's output are kept).
!python --version
!pip install -r requirements.txt --disable-pip-version-check | tail -n 5

In [None]:
import os,json,sys,warnings,re,io,shutil

import numpy as np
import pandas as pd
import xmltodict
import urllib.request
import folium
import requests

from unidecode import unidecode
from pypdf import PdfReader
from contextlib import redirect_stderr
from geopy.distance import great_circle 
from joblib import Memory
from IPython.display import HTML,Image
from requests_ratelimiter import LimiterSession
from tqdm import tqdm
from lingua import Language, LanguageDetectorBuilder, IsoCode639_1

In [None]:
# Make sure every cache / output directory exists before anything is written to it.
for required_dir in ('__cache/indexes', '__documents', 'result-corpus', 'result-figures'):
    os.makedirs(required_dir, exist_ok=True)

In [None]:
# On-disk memoization used by the @memory.cache helpers; entries are stored under __cache.
memory = Memory(location='__cache', verbose=0)

# Enable .progress_apply on pandas objects, with bars written to stdout at a fixed width.
tqdm.pandas(file=sys.stdout, ncols=100)

# Language detector restricted to English and Czech.
lng_model = (
    LanguageDetectorBuilder
    .from_languages(Language.ENGLISH, Language.CZECH)
    .with_preloaded_language_models()
    .with_minimum_relative_distance(0.9)
    .build()
)

# Unthrottled session (used for the local Nominatim instance) and a session
# limited to one request per second (used for the public Nominatim server).
session = requests.Session()
session_limited = LimiterSession(per_second=1)
session_limited.headers.update({
    'User-Agent': 'research-data-bussiness',
    'email': 'name@org.com',
    'Accept-Language': 'en-US',
})

# Functions

## REST_API_GET

In [None]:
@memory.cache
def _rest_api_get_ok(url):
    """Fetch ``url`` through the rate-limited session; only successful replies are memoized.

    ``raise_for_status()`` turns 4xx/5xx replies into ``HTTPError``. Because the
    exception leaves the function before a value is returned, nothing is stored
    for that call, so a transient failure (e.g. 429 or 503) is requested again
    on the next run instead of being replayed from the cache.
    """
    response = session_limited.get(url=url)
    response.raise_for_status()
    return response


def REST_API_GET(url):
    """GET ``url`` with the rate-limited session, reusing memoized successful replies.

    Parameters
    ----------
    url : str
        Full request URL; it is also the cache key.

    Returns
    -------
    requests.Response
        The reply, whatever its status code. Replies with a 4xx/5xx status are
        returned to the caller but are not cached.

    Raises
    ------
    SystemExit
        When the request itself fails (after printing a short message).
    """
    try:
        return _rest_api_get_ok(url)
    except requests.exceptions.HTTPError as err:
        # The server answered, just not successfully: hand the reply back so
        # callers can still inspect ``status_code`` as before.
        return err.response
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt must stay an interrupt
        # rather than being reported as a connection problem.
        print('REST API connection error.')
        raise SystemExit

## REST_API_GET_LOCAL

In [None]:
@memory.cache
def _rest_api_get_local_ok(url):
    """Fetch ``url`` through the unthrottled session; only successful replies are memoized.

    ``raise_for_status()`` turns 4xx/5xx replies into ``HTTPError``. Because the
    exception leaves the function before a value is returned, nothing is stored
    for that call, so a failed request is sent again on the next run instead of
    being replayed from the cache.
    """
    response = session.get(url=url)
    response.raise_for_status()
    return response


def REST_API_GET_LOCAL(url):
    """GET ``url`` with the plain (not rate-limited) session, reusing memoized successful replies.

    Parameters
    ----------
    url : str
        Full request URL; it is also the cache key.

    Returns
    -------
    requests.Response
        The reply, whatever its status code. Replies with a 4xx/5xx status are
        returned to the caller but are not cached.

    Raises
    ------
    SystemExit
        When the request itself fails (after printing a short message).
    """
    try:
        return _rest_api_get_local_ok(url)
    except requests.exceptions.HTTPError as err:
        # The server answered, just not successfully: hand the reply back so
        # callers can still inspect ``status_code`` as before.
        return err.response
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt must stay an interrupt
        # rather than being reported as a connection problem.
        print('REST API connection error.')
        raise SystemExit

## GET_FILE_SOURCE

In [None]:
@memory.cache
def GET_FILE_SOURCE(path):
    """Extract the text of a PDF file (memoized on ``path``).

    Parameters
    ----------
    path : str
        File path; the file type is taken from the text after the last ``'.'``.

    Returns
    -------
    str or pd.NA
        The text of every page, each followed by a newline, or ``pd.NA`` when
        the file is not a ``pdf`` or could not be read.
    """
    if path.split('.')[-1] != 'pdf':
        return pd.NA

    # Silence warnings and stderr only while the file is being parsed. A global
    # ``warnings.filterwarnings('ignore')`` would mute every later warning in
    # the kernel - and only on runs where this call was not served from cache.
    with warnings.catch_warnings(), io.StringIO() as buf, redirect_stderr(buf):
        warnings.simplefilter('ignore')

        try:
            reader = PdfReader(path)
            return ''.join(page.extract_text() + '\n' for page in reader.pages)
        except Exception:
            # unreadable / malformed document: report "no text" to the caller
            return pd.NA

## GET_LNG

In [None]:
@memory.cache
def GET_LNG(text):
    """Detect the language of ``text`` with the module-level ``lng_model`` (memoized).

    Parameters
    ----------
    text : str
        Text to classify.

    Returns
    -------
    str or pd.NA
        Name of the detected language's ISO 639-1 code (the pipeline compares
        it against ``'EN'``), or ``pd.NA`` when no language was detected or the
        detector raised.
    """
    try:
        detected = lng_model.detect_language_of(text)
        # The detector reports "undecided" as None; handle that explicitly
        # instead of relying on an AttributeError being swallowed.
        if detected is None:
            return pd.NA
        return detected.iso_code_639_1.name
    except Exception:
        return pd.NA

## geocode_addr

In [None]:
def geocode_addr(addr,raise_error_data=False,raise_error_request=False,try_nominatim_local=False):
    """Geocode a free-text address with Nominatim and describe its best match.

    Parameters
    ----------
    addr : str
        Address query.
    raise_error_data : bool
        Abort with ``SystemExit`` when a 200 reply cannot be turned into a result.
    raise_error_request : bool
        Abort with ``SystemExit`` when the service answers with a non-200 status.
    try_nominatim_local : bool
        Ask the instance at ``localhost:8080`` first; the public server is used
        when that attempt fails or yields no records.

    Returns
    -------
    tuple or pd.NA
        ``((lat, lon), (corner_1, corner_2), place, country)`` for the first
        record of the reply. Coordinates are strings rounded to 5 decimals,
        ``place`` is the record's road (or square when there is no road) and
        ``country`` its country code, both transliterated and lower-cased (or
        ``pd.NA`` when absent). ``pd.NA`` is returned when there is no usable
        record.
    """
    from urllib.parse import quote

    # Percent-encode characters that would otherwise be read as URL syntax
    # ('&', '#', '?', '+', '%', ...). Spaces are declared safe so that a plain
    # alphanumeric address yields exactly the same URL string as an unescaped
    # f-string would - the URL is also the cache key of the REST helpers.
    query = quote(str(addr), safe=' ')

    response = None

    if try_nominatim_local:
        try:
            local_response = REST_API_GET_LOCAL(
                url=f'http://localhost:8080/search?q={query}&addressdetails=1&format=json')
            if len(json.loads(local_response.text)) > 0:
                response = local_response
        except (Exception, SystemExit):
            # REST_API_GET_LOCAL signals connection problems with SystemExit;
            # any failure here simply means "fall back to the public server".
            pass

    if response is None:
        response = REST_API_GET(
            url=f'https://nominatim.openstreetmap.org/search?q={query}&addressdetails=1&format=json')

    if response.status_code != 200:
        # This branch used to test raise_error_data, which left
        # raise_error_request without any effect.
        if raise_error_request:
            print('REST API request error.')
            raise SystemExit
        return pd.NA

    try:
        records = json.loads(response.text)

        if len(records) == 0:
            return pd.NA

        # most relevant record
        best = records[0]
        address = best['address']

        lat = str(round(float(best['lat']),5))
        lon = str(round(float(best['lon']),5))

        # Nominatim documents boundingbox as [min lat, max lat, min lon, max lon],
        # so the pairs below are the (min lat, min lon) and (max lat, max lon) corners.
        edges = [str(round(float(edge), 5)) for edge in best['boundingbox']]
        bbox = (edges[0],edges[2]),(edges[1],edges[3])

        country = address.get('country_code',pd.NA)

        # try in this order road/square
        road = address.get('road',pd.NA)
        place = road if pd.notna(road) else address.get('square',pd.NA)

        if pd.notna(country): country = unidecode(country).lower()
        if pd.notna(place): place = unidecode(place).lower()

        return (lat,lon),(bbox),place,country

    except Exception:
        if raise_error_data:
            print('REST API data error.')
            raise SystemExit
        return pd.NA

## download_file

In [None]:
def download_file(url,local_path,use_cached=True,show=False):
    """Download ``url`` to ``local_path`` unless a non-empty copy is already there.

    Parameters
    ----------
    url : str
        Source URL.
    local_path : str
        Destination file.
    use_cached : bool
        When true, an existing non-empty ``local_path`` is reused without any
        network access.
    show : bool
        Print a one-line status message.

    Returns
    -------
    str or pd.NA
        ``local_path`` when the file is available, ``pd.NA`` when the download
        failed.
    """
    if use_cached and os.path.exists(local_path) and os.path.getsize(local_path) > 0:
        if show: print(f'Stored {local_path}')
        return local_path

    # Download next to the target and rename only once the transfer is
    # complete, so a failed or interrupted download can neither truncate a
    # good existing copy nor leave a partial file under the final name.
    tmp_path = f'{local_path}.part'

    try:
        with urllib.request.urlopen(url) as response, open(tmp_path, 'wb') as file:
            # stream in chunks instead of holding the whole body in memory
            shutil.copyfileobj(response, file)
        os.replace(tmp_path, local_path)
    except Exception:
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass
        if show: print(f'Error {local_path}')
        return pd.NA

    if show: print(f'Downloaded {local_path}')
    return local_path

## parse_text

In [None]:
def parse_text(text):
    """Reduce raw text to a lower-case, transliterated, space-separated token string.

    The text is lower-cased and passed through ``unidecode``; the tokens kept
    are the runs of letters, digits and ``@ , . -`` that begin and end at a
    word boundary. They are joined with single spaces.
    """
    transliterated = unidecode(text.lower())
    tokens = re.findall(r'\b[a-zA-Z0-9@,.-]+\b', transliterated)
    return ' '.join(tokens)

## parse_archive_files

In [None]:
def parse_archive_files(archive_files):
    """Collect the ``odkaz`` values of one attachment entry or a list of them.

    Parameters
    ----------
    archive_files : dict or list of dict
        A single attachment record, or a list of such records.

    Returns
    -------
    tuple or pd.NA
        One element per record (``pd.NA`` where a record has no ``odkaz``).
        ``pd.NA`` is returned for an empty list and for any other input, e.g.
        a missing value. Previously an empty list raised ``AttributeError``
        before the length check was reached, and a missing value raised on
        ``.get``.
    """
    if isinstance(archive_files, list):
        files = tuple(
            item.get('odkaz', pd.NA) if isinstance(item, dict) else pd.NA
            for item in archive_files
        )
        return files if len(files) > 0 else pd.NA

    if isinstance(archive_files, dict):
        return (archive_files.get('odkaz', pd.NA),)

    return pd.NA

## remove_keywords

In [None]:
def remove_keywords(text_what,text_from):
    """Return ``text_from`` without the words that also occur in ``text_what``.

    Both arguments are split on whitespace; words are compared exactly
    (case-sensitive) and the surviving words are joined with single spaces.
    """
    unwanted = set(text_what.split())

    kept = []
    for word in text_from.split():
        if word not in unwanted:
            kept.append(word)

    return ' '.join(kept)

## check_int_addr

In [None]:
def check_int_addr(addr):
    """Return True when ``addr`` contains none of the Czech accented letters below."""
    czech_letters = "áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ"
    for char in addr:
        if char in czech_letters:
            return False
    return True

## clear_addr

In [None]:
# Lower-case three-letter country codes (presumably ISO 3166-1 alpha-3, going by the variable name).
iso_a3_codes = [
    'afg', 'alb', 'dza', 'and', 'ago', 'arg', 'arm', 'aus', 'aut', 'aze',
    'bhs', 'bhr', 'bgd', 'brb', 'blr', 'bel', 'blz', 'ben', 'btn', 'bol',
    'bih', 'bwa', 'bra', 'brn', 'bgr', 'bfa', 'bdi', 'cpv', 'khm', 'cmr',
    'can', 'caf', 'tcd', 'chl', 'chn', 'col', 'com', 'cog', 'cod', 'cri',
    'civ', 'hrv', 'cub', 'cyp', 'cze', 'dnk', 'dji', 'dma', 'dom', 'ecu',
    'egy', 'slv', 'gnq', 'eri', 'est', 'swz', 'eth', 'fji', 'fin', 'fra',
    'gab', 'gmb', 'geo', 'deu', 'gha', 'grc', 'grd', 'gtm', 'gin', 'gnb',
    'guy', 'hti', 'hnd', 'hun', 'isl', 'ind', 'idn', 'irn', 'irq', 'irl',
    'isr', 'ita', 'jam', 'jpn', 'jor', 'kaz', 'ken', 'kir', 'prk', 'kor',
    'kwt', 'kgz', 'lao', 'lva', 'lbn', 'lso', 'lbr', 'lby', 'lie', 'ltu',
    'lux', 'mdg', 'mwi', 'mys', 'mdv', 'mli', 'mlt', 'mhl', 'mrt', 'mus',
    'mex', 'fsm', 'mda', 'mco', 'mng', 'mne', 'mar', 'moz', 'mmr', 'nam',
    'nru', 'npl', 'nld', 'nzl', 'nic', 'ner', 'nga', 'mkd', 'nor', 'omn',
    'pak', 'plw', 'pan', 'png', 'pry', 'per', 'phl', 'pol', 'prt', 'qat',
    'rou', 'rus', 'rwa', 'kna', 'lca', 'vct', 'wsm', 'smr', 'stp', 'sau',
    'sen', 'srb', 'syc', 'sle', 'sgp', 'svk', 'svn', 'slb', 'som', 'zaf',
    'ssd', 'esp', 'lka', 'sdn', 'sur', 'swe', 'che', 'syr', 'twn', 'tjk',
    'tza', 'tha', 'tls', 'tgo', 'ton', 'tto', 'tun', 'tur', 'tkm', 'tuv',
    'uga', 'ukr', 'are', 'gbr', 'usa', 'ury', 'uzb', 'vut', 'vat', 'ven',
    'vnm', 'yem', 'zmb', 'zwe'
]

# Lower-case two-letter country codes (presumably ISO 3166-1 alpha-2).
# NOTE(review): several entries are also ordinary words ('in', 'is', 'no', 'to', 'at', ...);
# remove_iso_code strips them whenever they happen to be the last word of an address.
iso_a2_codes = [
    'af', 'al', 'dz', 'as', 'ad', 'ao', 'ar', 'am', 'aw', 'au', 
    'at', 'az', 'bs', 'bh', 'bd', 'bb', 'by', 'be', 'bz', 'bj', 
    'bt', 'bo', 'ba', 'bw', 'br', 'bn', 'bg', 'bf', 'bi', 'kh', 
    'cm', 'ca', 'cv', 'ky', 'cf', 'td', 'cl', 'cn', 'co', 'km', 
    'cg', 'cd', 'ck', 'cr', 'ci', 'hr', 'cu', 'cy', 'cz', 'dk', 
    'dj', 'dm', 'do', 'ec', 'eg', 'sv', 'gq', 'er', 'ee', 'sz', 
    'et', 'fj', 'fi', 'fr', 'ga', 'gm', 'ge', 'de', 'gh', 'gr', 
    'gd', 'gt', 'gn', 'gw', 'gy', 'ht', 'hn', 'hu', 'is', 'in', 
    'id', 'ir', 'iq', 'ie', 'il', 'it', 'jm', 'jp', 'jo', 'kz', 
    'ke', 'ki', 'kp', 'kr', 'kw', 'kg', 'la', 'lv', 'lb', 'ls', 
    'lr', 'ly', 'li', 'lt', 'lu', 'mg', 'mw', 'my', 'mv', 'ml', 
    'mt', 'mh', 'mq', 'mr', 'mu', 'mx', 'fm', 'md', 'mc', 'mn', 
    'me', 'ma', 'mz', 'mm', 'na', 'nr', 'np', 'nl', 'nz', 'ni', 
    'ne', 'ng', 'mk', 'no', 'om', 'pk', 'pw', 'pa', 'pg', 'py', 
    'pe', 'ph', 'pl', 'pt', 'qa', 'ro', 'ru', 'rw', 'kn', 'lc', 
    'vc', 'ws', 'sm', 'st', 'sa', 'sn', 'rs', 'sc', 'sl', 'sg', 
    'sk', 'si', 'sb', 'so', 'za', 'ss', 'es', 'lk', 'sd', 'sr', 
    'se', 'ch', 'sy', 'tw', 'tj', 'tz', 'th', 'tl', 'tg', 'to', 
    'tt', 'tn', 'tr', 'tm', 'tv', 'ug', 'ua', 'ae', 'gb', 'us', 
    'uy', 'uz', 'vu', 'va', 've', 'vn', 'ye', 'zm', 'zw'
]

# Combined lookup list that clear_addr hands to remove_iso_code.
iso_codes = iso_a3_codes+iso_a2_codes

def remove_iso_code(string, iso_codes):
    """Drop the last word of ``string`` when that word is one of ``iso_codes``.

    Parameters
    ----------
    string : str
        Text to inspect; words are separated by whitespace.
    iso_codes : container of str
        Codes to look for (exact match).

    Returns
    -------
    str
        The remaining words joined with single spaces when a code was removed,
        otherwise ``string`` unchanged. Empty or whitespace-only input is
        returned unchanged (it used to raise ``IndexError``).
    """
    words = string.split()
    if words and words[-1] in iso_codes:
        return ' '.join(words[:-1])
    return string

def remove_duplicate_words(string):
    """Keep only the first occurrence of every whitespace-separated word, in original order."""
    seen = set()
    unique_words = []
    for word in string.split():
        if word not in seen:
            seen.add(word)
            unique_words.append(word)
    return ' '.join(unique_words)

def clear_addr(addr):
    """Normalise a postal address into a short lower-case query string.

    Steps, in order: lower-case and transliterate; drop repeated words; drop a
    trailing country code; drop every word containing a digit; turn ``-`` and
    ``/`` into spaces; delete ``,`` and ``#``; drop a fixed list of filler
    words and every word containing one of ``@ ( ) . : ; '``.

    Parameters
    ----------
    addr : str
        Raw address.

    Returns
    -------
    str
        The cleaned address with words joined by single spaces (possibly empty).
    """
    addr = unidecode(addr.lower())
    addr = remove_duplicate_words(addr)

    # Nothing left to inspect for empty / whitespace-only input; calling
    # remove_iso_code on it used to end in an IndexError.
    if addr:
        addr = remove_iso_code(addr, iso_codes)

    # Removes every word that contains a digit. This also covers pure numbers,
    # so the separate pass for 1-4 digit numbers was redundant.
    addr = re.sub(r'\b\w*\d\w*\b', '', addr)

    for char in ('-', '/'):
        addr = addr.replace(char, ' ')

    for char in (',', '#'):
        addr = addr.replace(char, '')

    words_to_remove = ['mesto','kraj','psc','id', 'ds','box','suite','room','floor']
    words_with_chars_to_remove = ['@', '(', ')', '.',':', ';', '\'']

    words = [
        word for word in addr.split()
        if word not in words_to_remove
        and all(char not in word for char in words_with_chars_to_remove)
    ]

    return ' '.join(words)

## clear_unused_files

In [None]:
def clear_unused_files(files_used, directory='__documents'):
    """Delete every entry of ``directory`` that is not listed in ``files_used``.

    WARNING: destructive. Anything in ``directory`` that is not listed is
    removed, so an empty ``files_used`` empties the directory.

    Parameters
    ----------
    files_used : iterable
        Paths to keep, written as ``f'{directory}/<name>'``. Entries that are
        not paths (e.g. missing values) are ignored.
    directory : str
        Folder to clean; defaults to the notebook's document folder.
    """
    # Compare normalised paths so that an equivalent spelling of a used file
    # (duplicate or trailing separators, './' segments) is not mistaken for an
    # unused one and deleted.
    keep = {
        os.path.normpath(path)
        for path in files_used
        if isinstance(path, (str, os.PathLike))
    }

    for name in os.listdir(directory):
        path = os.path.join(directory, name)

        if os.path.normpath(path) in keep:
            continue

        # links are unlinked, never followed into the directory they point to
        if os.path.isfile(path) or os.path.islink(path):
            os.remove(path)
        else:
            shutil.rmtree(path)
    return

## frame_info

In [None]:
def frame_info(frame,mem=False,dtype=False,sample=False,n=5,width=50,columns=50):
    """Print optional diagnostics for ``frame`` and display a truncated view of it.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to inspect.
    mem : bool
        Print the deep memory usage per column and in total, divided by
        1024**3 (i.e. in GiB) and rounded to one decimal.
    dtype : bool
        Print the column dtypes.
    sample : bool
        Display ``min(n, len(frame))`` randomly sampled rows instead of the frame.
    n : int
        Row limit of the displayed view (also the sample size).
    width : int
        Maximum displayed column width.
    columns : int
        Maximum number of displayed columns.
    """
    size = len(frame)

    if mem:
        # the deep scan walks every object value, so run it only once
        usage = frame.memory_usage(deep=True)
        print(round(usage/(1024**3),1).to_string())
        print('Total ',round(usage.sum()/(1024**3),1))

    if dtype:
        print()
        print(frame.dtypes.to_string())

    if sample:
        frame = frame.sample(min(n,size))

    with pd.option_context('display.min_rows', n, 'display.max_rows', n,
                           'display.max_columns', columns,'display.max_colwidth', width):
        display(frame)

    return

# Main

In [None]:
# URL of the root index at data.smlouvy.gov.cz; it is re-downloaded on every run (use_cached=False).
# NOTE(review): download_file returns pd.NA on failure and the result is only displayed here,
# so after a failed refresh the next cell reads whatever main_index.xml is already on disk
# (or fails if there is none).
main_index = 'https://data.smlouvy.gov.cz/'
download_file(url=main_index, local_path='__cache/indexes/main_index.xml', use_cached=False)

In [None]:
# Read the downloaded root index and list the dump files it references.
with open('__cache/indexes/main_index.xml', 'r', encoding='utf-8') as file:
    index_xml = file.read()

dumps = pd.DataFrame(xmltodict.parse(index_xml)['index']['dump'])

# Keep only the entries whose 'den' field is missing and only their link, renamed to 'url'.
# Selecting and renaming in one expression yields an independent frame, so the column
# assignment below does not act on a slice (which can raise SettingWithCopyWarning).
main_index = dumps.loc[dumps.den.isna(), ['odkaz']].rename(columns={'odkaz': 'url'})

# the date is the text between 'dump_' and the next '.' of the link
main_index['date'] = main_index.url.apply(lambda x: x.split('dump_')[1].split('.')[0])
main_index = main_index.sort_values(by ='date').reset_index(drop=True)

main_index

In [None]:
# Local file name of every dump: 'month_index' + whatever follows the last 'dump' in its URL.
# The f-string is delimited with double quotes because reusing the same quote character
# inside a replacement field is a SyntaxError before Python 3.12.
main_index['file'] = main_index.url.apply(lambda x: f"month_index{x.split('dump')[-1]}")

# Fetch each dump into the index cache; files that already exist and are non-empty are reused.
main_index.progress_apply(
    lambda r: download_file(url=r.url, local_path=f'__cache/indexes/{r.file}',use_cached=True), axis=1)

main_index

In [None]:
# Parse every downloaded dump into one row per (contract version, attached document).
frames = []

for item in tqdm(main_index.file.tolist(),file=sys.stdout,ncols=100):

    with open(f'__cache/indexes/{item}', 'r', encoding='utf-8') as f:
        item_data = f.read()

    # skip potential empty indexes
    # (Exception rather than a bare except, so an interrupt stops the loop instead of skipping a file)
    try:
        item_data = xmltodict.parse(item_data)['dump']['zaznam']
    except Exception:
        continue

    # xmltodict yields a single dict, not a list, when the dump holds exactly one record
    if isinstance(item_data, dict):
        item_data = [item_data]

    item_data = pd.DataFrame(item_data)

    item_parsed = pd.DataFrame()

    item_parsed['regis_id'] = item_data.identifikator.apply(lambda x: x.get('idSmlouvy',pd.NA))
    item_parsed['contr_id'] = item_data.identifikator.apply(lambda x: x.get('idVerze',pd.NA))
    item_parsed['date'] = item_data.casZverejneni
    item_parsed['valid'] = item_data.platnyZaznam

    # 'smluvniStrana' is either one record or a list of records; only the first one is used
    item_parsed['party'] = item_data.smlouva.apply(lambda x: x.get('smluvniStrana',pd.NA))
    item_parsed['party_name'] = item_parsed.party.apply(
        lambda x: x.get('nazev',pd.NA) if isinstance(x, dict) else x[0].get('nazev',pd.NA))
    item_parsed['party_addr'] = item_parsed.party.apply(
        lambda x: x.get('adresa',pd.NA) if isinstance(x, dict) else x[0].get('adresa',pd.NA))
    item_parsed['party_postbox'] = item_parsed.party.apply(
        lambda x: x.get('datovaSchranka','no_postbox') if isinstance(x, dict) else x[0].get('datovaSchranka','no_postbox'))
    item_parsed.pop('party')

    # one row per attachment link, numbered from 1 within each contract version
    item_parsed['docs_url'] = item_data.prilohy.apply(lambda x: x.get('priloha',pd.NA) if pd.notna(x) else {})
    item_parsed['docs_url'] = item_parsed.docs_url.apply(lambda x: parse_archive_files(x))
    item_parsed = item_parsed.explode('docs_url',ignore_index = True).rename(columns = {'docs_url':'doc_url'})
    item_parsed['doc_id'] = item_parsed.groupby('contr_id').cumcount()+1
    item_parsed.insert(2, 'doc_id', item_parsed.pop('doc_id'))

    frames.append(item_parsed)

# Concatenate once: growing the frame inside the loop re-copies all earlier rows on every pass.
data = pd.concat(frames,ignore_index=True) if frames else pd.DataFrame()

frame_info(data,mem=True,dtype=True)

In [None]:
# Keep the leading YYYY-MM-DD part of the publication timestamp, parse it, and order the rows by it.
data['date'] = pd.to_datetime(data.date.str[:10],format='%Y-%m-%d')
data = data.sort_values(by='date')

frame_info(data)

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna()

frame_info(data,mem=True)

In [None]:
# clear data
# check_int_addr is True when the address holds none of the Czech accented letters
no_czech_letters = data.party_addr.progress_apply(check_int_addr)

# rows to discard: valid == '0', a party_postbox other than the 'no_postbox' placeholder,
# or an address containing Czech accented letters
drop_mask = (data.valid == '0') | (data.party_postbox != 'no_postbox') | (no_czech_letters == False)
data = data.drop(index=drop_mask[drop_mask].index, columns=['party_postbox','valid'])

frame_info(data,mem=True)

In [None]:
# clear addr
# Step 1: normalise every party address with clear_addr.
data['party_addrclear'] = data.party_addr.progress_apply(clear_addr)
# Step 2: remove from the cleaned address every word that also occurs in the party name.
# NOTE(review): party_addrclear has been lower-cased and transliterated by clear_addr, while
# party_name is passed as-is and remove_keywords compares words exactly - capitalised or
# accented name words therefore never match. Confirm whether the name should be normalised
# the same way first (doing so changes the geocoding queries and hence the resulting corpus).
data['party_addrclear'] = data.progress_apply(
    lambda r: remove_keywords(text_what=r.party_name,text_from=r.party_addrclear),axis=1)

frame_info(data,mem=True)

In [None]:
# geolocate party
def _geocode_row(r):
    """Geocode one row and always hand back four values.

    geocode_addr returns a single pd.NA when nothing was found. With
    result_type='expand' pandas decides from the first row's result whether to
    expand at all, so a failed first row would break the four-column
    assignment below. Padding failures to four pd.NA keeps every row the same shape.
    """
    located = geocode_addr(r.party_addrclear,try_nominatim_local=False)
    return located if isinstance(located, tuple) else (pd.NA, pd.NA, pd.NA, pd.NA)

data[['party_coords','party_bbox','party_place','party_country']]  = data.progress_apply(
    _geocode_row, axis=1,result_type='expand')

# discard rows without a street/square match and rows located in the Czech Republic ('cz')
todrop = data.loc[data.party_place.isna() | (data.party_country == 'cz')]
display(len(todrop))
data = data.drop(todrop.index)

frame_info(data,mem=True)

In [None]:
# drop uncertain locations
# great-circle distance in metres between the two stored bounding-box corners
data['party_bbsize'] = data.party_bbox.apply(lambda corners: great_circle(corners[0],corners[1]).m)

# discard matches whose box spans more than 10000 m or less than 10 m
drop_mask = (data.party_bbsize > 10000) | (data.party_bbsize < 10)
display(int(drop_mask.sum()))
data = data.drop(index=drop_mask[drop_mask].index)

frame_info(data,n=5)

In [None]:
# check location for party place (street, square)
def _place_in_address(r):
    """True when the geocoded place name occurs in the lower-cased, transliterated raw address."""
    normalised_addr = unidecode(r.party_addr.lower())
    return r.party_place in normalised_addr

data['valid_loc'] = data.apply(_place_in_address,axis=1)

drop_mask = data.valid_loc == False
display(int(drop_mask.sum()))
data = data.drop(index=drop_mask[drop_mask].index)

frame_info(data,mem=True)

In [None]:
def _doc_local_path(r):
    """Target file of a document: __documents/<contr_id>_<doc_id>.<lower-cased text after the last '.' of the URL>."""
    extension = r.doc_url.split('.')[-1].lower()
    return f"__documents/{r.contr_id}_{r.doc_id}.{extension}"

# fetch every attachment (existing non-empty files are reused); failures come back as pd.NA
data['doc_path'] = data.progress_apply(
    lambda r: download_file(url=r.doc_url,local_path=_doc_local_path(r),show=False,use_cached=True),
    axis=1)

drop_mask = data.doc_path.isna()
display(int(drop_mask.sum()))
data = data.drop(index=drop_mask[drop_mask].index)

frame_info(data,mem=True)

In [None]:
# extract the raw text of every downloaded document (pd.NA when nothing could be extracted)
data['doc_raw'] = data.doc_path.progress_apply(GET_FILE_SOURCE)

drop_mask = data.doc_raw.isna()
display(int(drop_mask.sum()))
data = data.drop(index=drop_mask[drop_mask].index)

frame_info(data,mem=True)

In [None]:
# normalise the raw text into the token string used for the corpus
data['doc_text'] = data.doc_raw.progress_apply(parse_text)

drop_mask = data.doc_text.isna()
display(int(drop_mask.sum()))
data = data.drop(index=drop_mask[drop_mask].index)

frame_info(data,mem=True)

In [None]:
# detect the language of every document and keep only those labelled 'EN'
data['doc_lng'] = data.doc_text.progress_apply(GET_LNG)

drop_mask = (data.doc_lng.isna()) | (data.doc_lng != 'EN')
display(int(drop_mask.sum()))
data = data.drop(index=drop_mask[drop_mask].index)

frame_info(data,mem=True)

In [None]:
# doc cnt
# number of remaining rows with a doc_id per contract version, placed as the fourth column
doc_counts = data.groupby('contr_id').doc_id.transform('count')
data = data.drop(columns='doc_cnt', errors='ignore')
data.insert(3, 'doc_cnt', doc_counts)

frame_info(data,mem=True)

In [None]:
# clear not used documents
# WARNING: deletes everything in __documents that is not referenced by a remaining row of `data`.
files_used = data.doc_path.to_list()
clear_unused_files(files_used)

In [None]:
# sanity check: show the rows that still hold a missing value in any column
data.loc[data.isna().any(axis=1)]

# Reports

In [None]:
# number of rows (one per kept document) in the final data set
len(data)

In [None]:
# number of distinct party country codes
data.party_country.nunique()

In [None]:
# the distinct party country codes themselves
data.party_country.unique()

In [None]:
# number of distinct (lat, lon) pairs among the parties
data.party_coords.nunique()

In [None]:
# the five most frequent doc_id values and how many rows carry each
data.doc_id.value_counts()[:5]

In [None]:
# share of rows whose doc_id is greater than 1, rounded to two decimals
round(len(data.loc[data.doc_id > 1])/len(data),2)

In [None]:
# number of documents whose normalised text contains 'sanction'
len(data.loc[data.doc_text.str.contains('sanction')])

In [None]:
# number of documents whose normalised text contains 'export'
len(data.loc[data.doc_text.str.contains('export')])

In [None]:
map_fl = folium.Map(zoom_start=13, width=600, height=600)

# one marker per distinct coordinate pair (the first row found for each location)
unique_locs = data.drop_duplicates(subset='party_coords')

for item in unique_locs.itertuples():
    folium.Marker(location=[float(item.party_coords[0]),float(item.party_coords[1])],
                 popup=f'Party: {item.party_name} <br><br> Document: {item.doc_path} <br><br> Date: {item.date}',
                lazy=True,
                 ).add_to(map_fl)

map_fl.save('result-figures/map-parties-locations.html')

# uncomment here for updated data
# Read the saved page back inside a context manager so the handle is closed, and as UTF-8
# (the encoding folium writes) rather than the platform default.
with open('result-figures/map-parties-locations.html', encoding='utf-8') as map_file:
    display(HTML(map_file.read()));
print()
#Image('result-figures/map-parties-locations-part-static.png')

# Samples

In [None]:
# Selection of the sample shown in the cells below.
# geo-political example: Safran Helicopter, contract id 18923211, document id 2
# walk-through example: Meerstetter Engineering, contract id 32046292, document id 2
contract_id = '32046292'
document_id = 2

In [None]:
# every document row that belongs to the selected contract
contract = data[data.contr_id == contract_id]
contract

In [None]:
# document
# the single row for the selected contract/document pair; .item() raises unless exactly one row matches
document = data.loc[(data.contr_id == contract_id) & (data.doc_id == document_id)]
# first 1000 characters of its normalised text
document.doc_text.item()[:1000]

In [None]:
# Show the record without its two long text columns. drop() returns a new frame, so `document`
# (a selection taken from `data`) is left untouched: the cell can be re-run, the preceding cell
# keeps working afterwards, and no slice is modified in place.
document.drop(columns=['doc_text','doc_raw']).to_dict(orient='records')

# Export

In [None]:
# columns written to the corpus file, in output order
export_columns = [
    'contr_id', 'doc_id', 'doc_cnt', 'date',
    'party_name', 'party_coords', 'party_country',
    'doc_path', 'doc_text',
]

export = data[export_columns]
export.to_csv('result-corpus/corpus.csv', encoding='utf-8', index=False, errors='replace')

In [None]:
# Read the exported corpus back as a check of what was written.
# NOTE(review): read_csv re-infers the column types, so e.g. party_coords (a tuple in `data`)
# comes back as its string representation.
stored = pd.read_csv('result-corpus/corpus.csv')

frame_info(stored)