#### Description

In [10]:
print('Author:  Leo Pauly (cnlp@leeds.ac.uk) & Nick Wilson (n.wilson@lubs.leeds.ac.uk)')
print('Description: Autmatic database update')

Author:  Leo Pauly (cnlp@leeds.ac.uk) & Nick Wilson (n.wilson@lubs.leeds.ac.uk)
Description: Autmatic database update


#### Usage intructions & Other info

1. Give absolute file paths always
2. Assumed that new entry directory names always start with UKLTD_R_ or UKLTD_W_
3. File dir_list.txt contains the processed directory list
4. Prefefably install python using anacodna (all in one installation): https://www.anaconda.com/products/individual#windows
5. Run this if 'pyreadstat' module is missing':  !pip install pyreadstat 
6. Need only change base directory in most of the case
7. Databases are stored in the directory (Create one if missing) : UKLTD_Database
8. Select user (user variable options: 'leo','nick')
9. Run this if 'patool' module is missing':  !pip install patool pyunpack
10. fastparquet

#### Imports

In [11]:
import os
import sys
import pandas as pd
import numpy as np
import pyreadstat
from zipfile import ZipFile
import pyunpack
import multiprocessing
import dask.dataframe as dd
import time
from dask.diagnostics import ProgressBar
from dask.distributed import Client


print('Info:')
print('Python version:',sys.version)
num_processes = multiprocessing.cpu_count()
print('No: of logical processors:',num_processes)

AC01_header_dtypes={'AC01': 'O', 'REGNUM': 'O','ACCDAT_start': 'O','ACCDAT': 'O', 'number_of_weeks':'float64', 'months':'float64', 'currency': 'O', 'consolidated': 'O', 'acctype': 'float64', 'Turnover': 'float64', 'Export': 'float64', 'Cost_of_Sales': 'float64', 'Gross_Profit': 'float64', 'Wages_Salaries': 'float64', 'Directors_Emoluments': 'float64', 'Operating_Profits': 'float64', 'Depreciation': 'float64', 'Audit_Fees': 'float64', 'Interest_Payments': 'float64', 'Pre_Tax_Profit': 'float64', 'taxation1': 'float64', 'Profit_After_Tax': 'float64', 'Dividends_Payable': 'float64', 'Retained_Profits': 'float64', 'Tangible_Assets': 'float64', 'Intangible_Assets': 'float64', 'Total_Fixed_Assets': 'float64', 'Total_Current_Assets': 'float64', 'Trade_Debtors': 'float64', 'Stock': 'float64', 'Cash': 'float64', 'Other_Current_Assets': 'float64', 'Increase_In_Cash': 'float64', 'Mis_Current_Assets': 'float64', 'Total_Assets': 'float64', 'Total_Current_Liabilities': 'float64', 'Trade_Creditors': 'float64', 'Bank_Overdraft': 'float64', 'Other_Short_Term_Fin': 'float64', 'Mis_Current_Liabilities': 'float64', 'Other_Long_Term_Fin': 'float64', 'Total_Long_Term_Liabilities': 'float64', 'Bank_Overdraft_LTL': 'float64', 'Total_Liabilities': 'float64', 'Net_Assets': 'float64', 'Working_Capital': 'float64', 'Paid_up_equity': 'float64', 'PL_Account_Weserve': 'float64', 'Sundry_Weserves': 'float64', 'Revaluation_Weserve': 'float64', 'Shareholder_Funds': 'float64', 'NetWorth': 'float64', 'NetCashflowfromOperations': 'float64', 'NetCashflowbeforeFinancing': 'float64', 'NetCashflowfromFinancing': 'float64', 'Contingent_Liability': 'float64', 'Capital_Employed': 'float64', 'No_Employees': 'float64', 'status': 'float64', 'UPLOAD': 'O'}
AC01_header_names=['AC01', 'REGNUM', 'ACCDAT_start', 'ACCDAT', 'number_of_weeks', 'months', 'currency', 'consolidated', 'acctype', 'Turnover', 'Export', 'Cost_of_Sales', 'Gross_Profit', 'Wages_Salaries', 'Directors_Emoluments', 'Operating_Profits', 'Depreciation', 'Audit_Fees', 'Interest_Payments', 'Pre_Tax_Profit', 'taxation1', 'Profit_After_Tax', 'Dividends_Payable', 'Retained_Profits', 'Tangible_Assets', 'Intangible_Assets', 'Total_Fixed_Assets', 'Total_Current_Assets', 'Trade_Debtors', 'Stock', 'Cash', 'Other_Current_Assets', 'Increase_In_Cash', 'Mis_Current_Assets', 'Total_Assets', 'Total_Current_Liabilities', 'Trade_Creditors', 'Bank_Overdraft', 'Other_Short_Term_Fin', 'Mis_Current_Liabilities', 'Other_Long_Term_Fin', 'Total_Long_Term_Liabilities', 'Bank_Overdraft_LTL', 'Total_Liabilities', 'Net_Assets', 'Working_Capital', 'Paid_up_equity', 'PL_Account_Weserve', 'Sundry_Weserves', 'Revaluation_Weserve', 'Shareholder_Funds', 'NetWorth', 'NetCashflowfromOperations', 'NetCashflowbeforeFinancing', 'NetCashflowfromFinancing', 'Contingent_Liability', 'Capital_Employed', 'No_Employees', 'status', 'UPLOAD']

Info:
Python version: 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]
No: of logical processors: 8


In [12]:
## Selectting user and adding filepaths
user='leo'
if(user=='leo'):
    base_dir='C:/Users/cnlp/Research Fellowship/'
    dir_list_file=base_dir+'/UKLTD_Scripts/dir_list.txt'
    database_file=base_dir+'UKLTD_Database/AC01.hdf5'
    database_file_folder=base_dir+'UKLTD_Database/AC01/'

#### Checking new entries

In [14]:
## Checking for new download
with open(dir_list_file, 'a+') as fd:
    fd.seek(0)
    dir_list_old=fd.read().split('\n')

print('List of processed directores:',*dir_list_old,sep='\n')
dir_list_new=[dir_name for dir_name in os.listdir(base_dir) if (dir_name.startswith("UKLTD_W") or dir_name.startswith("UKLTD_R"))]
entry_dir_list=[entry_dir for entry_dir in dir_list_new if entry_dir not in dir_list_old ]
print('\nNew entries detected:',*entry_dir_list,sep='\n')

List of processed directores:


New entries detected:
UKLTD_W_20190616


#### Creating/Loading database 

In [15]:
## If databse is in .csv format 
if (os.path.isfile(database_file)):
    print('Database found.','Reading database! \n') 
    start = time.process_time()
    with ProgressBar():
        df_database = dd.read_hdf(database_file,key='AC01_database') 
    print('Time taken to read database {}:'.format(database_file.split('/')[-1]),time.process_time() - start,'s')
    database_missing=False
else:
    print('Database not found.','Creating database! \n')
    database_missing=True
    

Database not found. Creating database! 



#### Updating database

Rules:

1. Check for field:REGNUM
2. If REGNUM doesn't exist: add as new row in database
3. If they exist, check for the rest of the fields
4. If they are not identical: add as new row in database


In [16]:
def update_database(entry_dir):
    
    ## Reading new entry file and unzipping
    global df_database
    global database_missing
    entry_file="AC01_"+(entry_dir.split('_')[-2])+"_"+(entry_dir.split('_')[-1])+".txt"
    entry_file_zip="AC01_"+(entry_dir.split('_')[-2])+"_"+(entry_dir.split('_')[-1])+".rar"
    pyunpack.Archive(base_dir+'{}/{}'.format(entry_dir,entry_file_zip)).extractall(base_dir+'{}/'.format(entry_dir))
    print('Entry file unzipped as:',entry_file)
    
    start = time.process_time()
    df_entry_file=dd.read_csv('C:/Users/cnlp/Research Fellowship/{}/{}'.format(entry_dir,entry_file),sep='|',names=AC01_header_names,dtype=AC01_header_dtypes)
    print('Time taken to read:',time.process_time() - start,'s')
    df_entry_file['UPLOAD']=entry_file.split('.')[0]
    
     ## Adding new entry_rows to main database after checking
    start = time.process_time()
    if(database_missing):
        df_database=df_entry_file
        database_missing=False
    else:
        print('Processing')
        with ProgressBar():
            df_database=df_database.merge(df_entry_file)
    print('Time taken to process:',time.process_time() - start,'s')
    
    return True

In [17]:
## Reading from new directory and adding to database
for entry_dir in entry_dir_list:
    print('\nReading from entry dir:',entry_dir)
    print('Update Success:',update_database(entry_dir))


Reading from entry dir: UKLTD_W_20190616
Entry file unzipped as: AC01_W_20190616.txt
Time taken to read: 0.03125 s
Time taken to process: 0.0 s
Update Success: True


In [18]:
## Writing updated databse to file
start = time.process_time()
with ProgressBar():
    df_database.to_parquet(database_file_folder)
print('Time taken to write:',time.process_time() - start,'s')

[########################################] | 100% Completed |  0.7s
Time taken to write: 0.640625 s


In [19]:
## Writing updated databse to file
start = time.process_time()
with ProgressBar():
    df_database.to_hdf(database_file,key='AC01_database')
print('Time taken to write:',time.process_time() - start,'s')

[########################################] | 100% Completed |  1.2s
Time taken to write: 1.203125 s


In [None]:
## Update processed directory list
print('Processed list updated')
with open(dir_list_file, 'w') as fd:
    fd.write('\n'.join(dir_list_old+entry_dir_list))