#### Description

In [1]:
# Banner: who maintains this notebook and what it is for.
print(
    'Author:  Leo Pauly (cnlp@leeds.ac.uk) & Nick Wilson (n.wilson@lubs.leeds.ac.uk)',
    'Description: Autmatic database update',
    sep='\n',
)

Author:  Leo Pauly (cnlp@leeds.ac.uk) & Nick Wilson (n.wilson@lubs.leeds.ac.uk)
Description: Autmatic database update


#### Usage instructions & other info

1. Always give absolute file paths
2. New entry directory names are assumed to always start with UKLTD_R_ or UKLTD_W_
3. The file dir_list.txt contains the list of already-processed directories
4. Preferably install Python using Anaconda (all-in-one installation): https://www.anaconda.com/products/individual#windows
5. Run this if the 'pyreadstat' module is missing:  !pip install pyreadstat
6. In most cases only the base directory needs to be changed
7. Databases are stored in the directory UKLTD_Database (create it if missing)
8. Select the user (user variable options: 'leo', 'nick')
9. Run this if the 'patool' module is missing:  !pip install patool pyunpack

#### Imports

In [2]:
import os
import sys
import pandas as pd
import numpy as np
import pyreadstat
from zipfile import ZipFile
import pyunpack
import multiprocessing
import dask.dataframe as dd
import time
from dask.diagnostics import ProgressBar
from dask.distributed import Client


# Report the interpreter version and the logical CPU count for this session.
num_processes = multiprocessing.cpu_count()
print('Info:', 'Python version: {}'.format(sys.version), sep='\n')
print('No: of logical processors: {}'.format(num_processes))

# Column layout of an AC01 entry file, in file order.
# Only the KEYS of this mapping are consumed by the notebook: they become
# AC01_header_names, which is passed as names= when the entry file is read.
# The dtype VALUES recorded here are not passed to read_csv (the separate
# AC01_header_dtypes mapping is used for that), so treat them as reference
# information only.
# NOTE(review): REGNUM is listed as 'float64' here but as 'object' in
# AC01_header_dtypes - the latter is what actually takes effect.
# NOTE(review): the '*_Weserve(s)' names look like typos of 'Reserve(s)'; they
# are column names written to the database, so confirm before renaming.
AC01_header={'AC01': 'string', 'REGNUM':'float64' , 'ACCDAT_start': 'string', 'ACCDAT': 'string', 'number_of_weeks': 'string', 'months': 'string',
       'currency': 'string', 'consolidated': 'string', 'acctype': 'float64', 'Turnover': 'float64', 'Export': 'float64',
       'Cost_of_Sales': 'float64', 'Gross_Profit': 'float64', 'Wages_Salaries': 'float64',
       'Directors_Emoluments': 'float64', 'Operating_Profits': 'float64', 'Depreciation': 'float64',
       'Audit_Fees': 'float64', 'Interest_Payments': 'float64', 'Pre_Tax_Profit': 'float64', 'taxation1': 'float64',
       'Profit_After_Tax': 'float64', 'Dividends_Payable': 'float64', 'Retained_Profits': 'float64',
       'Tangible_Assets': 'float64', 'Intangible_Assets': 'float64', 'Total_Fixed_Assets': 'float64',
       'Total_Current_Assets': 'float64', 'Trade_Debtors': 'float64', 'Stock': 'float64', 'Cash': 'float64',
       'Other_Current_Assets': 'float64', 'Increase_In_Cash': 'float64', 'Mis_Current_Assets': 'float64',
       'Total_Assets': 'float64', 'Total_Current_Liabilities': 'float64', 'Trade_Creditors': 'float64',
       'Bank_Overdraft': 'float64', 'Other_Short_Term_Fin': 'float64', 'Mis_Current_Liabilities': 'float64',
       'Other_Long_Term_Fin': 'float64', 'Total_Long_Term_Liabilities': 'float64',
       'Bank_Overdraft_LTL': 'float64', 'Total_Liabilities': 'float64', 'Net_Assets': 'float64',
       'Working_Capital': 'float64', 'Paid_up_equity': 'float64', 'PL_Account_Weserve': 'float64',
       'Sundry_Weserves': 'float64', 'Revaluation_Weserve': 'float64', 'Shareholder_Funds': 'float64',
       'NetWorth': 'float64', 'NetCashflowfromOperations': 'float64', 'NetCashflowbeforeFinancing': 'float64',
       'NetCashflowfromFinancing': 'float64', 'Contingent_Liability': 'float64', 'Capital_Employed': 'float64',
       'No_Employees': 'float64', 'status': 'float64', 'UPLOAD':'string'}

# Column names in file order, taken from the keys of AC01_header.
AC01_header_names=[column_name for column_name in AC01_header]

# Explicit dtype overrides handed to read_csv for the entry files. Every listed
# column is read as float64 except REGNUM, which is kept as a Python object.
_dtype_override_columns=(
    'Bank_Overdraft',
    'Bank_Overdraft_LTL',
    'Capital_Employed',
    'Mis_Current_Assets',
    'NetCashflowbeforeFinancing',
    'NetWorth',
    'Net_Assets',
    'REGNUM',
    'Tangible_Assets',
    'acctype',
    'status',
)
AC01_header_dtypes={
    column_name: ('object' if column_name=='REGNUM' else 'float64')
    for column_name in _dtype_override_columns
}

Info:
Python version: 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]
No: of logical processors: 8


In [3]:
## Selecting user and adding filepaths
# Base directory per user. Each value must end with '/' because paths are
# built from it by plain string concatenation, and the database file name is
# later recovered with split('/').
user='leo'
user_base_dirs={'leo':'C:/Users/cnlp/Research Fellowship/'}
if user not in user_base_dirs:
    # Fail here with a clear message instead of a NameError in a later cell.
    raise ValueError("Unknown user {!r}; add a base directory for it to user_base_dirs".format(user))
base_dir=user_base_dirs[user]
# base_dir already ends with '/', so the relative parts must not start with one.
dir_list_file=base_dir+'UKLTD_Scripts/dir_list.txt'
database_file=base_dir+'UKLTD_Database/AC01.csv'

In [4]:
## Monitoring
#client = Client()

#### Checking new entries

In [6]:
## Checking for new downloads
# 'a+' creates dir_list.txt when it does not exist yet; seek(0) rewinds to the
# start so any existing contents can be read.
with open(dir_list_file, 'a+') as fd:
    fd.seek(0)
    # Skip blank lines: an empty (or newline-terminated) file would otherwise
    # produce '' entries that get carried along as "processed directories".
    dir_list_old=[line.strip() for line in fd.read().splitlines() if line.strip()]

print('List of processed directores:',*dir_list_old,sep='\n')

# Candidate entries are sub-directories of base_dir with one of the known prefixes.
entry_prefixes=("UKLTD_W","UKLTD_R")
dir_list_new=[
    dir_name for dir_name in os.listdir(base_dir)
    if dir_name.startswith(entry_prefixes) and os.path.isdir(os.path.join(base_dir,dir_name))
]

# Anything not already recorded in dir_list.txt is a new entry to process.
already_processed=set(dir_list_old)
entry_dir_list=[entry_dir for entry_dir in dir_list_new if entry_dir not in already_processed]
print('\nNew entries detected:',*entry_dir_list,sep='\n')

List of processed directores:


New entries detected:
UKLTD_W_20190602


#### Creating/Loading database 

In [7]:
## If database is in .csv format
# first_time records whether an existing database was loaded (False) or has to
# be created from the first entry file (True). It is assigned on BOTH branches
# so update_database() never reads an undefined name.
if os.path.isfile(database_file):
    print('Database found.','Reading database! \n')
    start = time.process_time()
    # Reuse the dtype overrides applied to entry files so the stored database
    # and new entries agree on column types (e.g. REGNUM stays 'object').
    df_database = dd.read_csv(database_file,assume_missing=True,dtype=AC01_header_dtypes)
    first_time=False
    print('Time taken to read database {}:'.format(database_file.split('/')[-1]),time.process_time() - start,'s')
else:
    print('Database not found.','Creating database! \n')
    first_time=True
    # Placeholder until the first entry file is loaded by update_database().
    df_database=None
    

Database not found. Creating database! 



NameError: name 'df_database' is not defined

#### Updating database

Rules:

1. Look up each incoming record by its REGNUM field
2. If the REGNUM does not exist in the database: add the record as a new row
3. If the REGNUM exists: compare the remaining fields
4. If the remaining fields are not identical: add the record as a new row


In [None]:
def update_database(entry_dir):
    """Extract one entry directory's AC01 archive and add its rows to the database.

    The archive ``AC01_<type>_<date>.rar`` inside ``base_dir/entry_dir`` is
    unpacked, the resulting pipe-delimited ``.txt`` file is read lazily with
    dask, and every row is tagged with the upload name in the ``UPLOAD`` column.

    The module-level ``df_database`` is then updated:

    * on the very first load (``first_time`` is true) the entry becomes the
      database and ``first_time`` is cleared, so further entries in the same
      run are appended instead of overwriting it;
    * otherwise the entry is appended and rows that are identical in every
      column except ``UPLOAD`` are dropped, so only new or changed records
      add a row.

    Parameters
    ----------
    entry_dir : str
        Name of the entry directory under ``base_dir``; its last two
        underscore-separated parts are used to build the file names.

    Returns
    -------
    bool
        Always ``True``; failures surface as exceptions.
    """
    global df_database, first_time

    # e.g. entry_dir 'UKLTD_W_20190602' -> file stem 'AC01_W_20190602'
    name_parts=entry_dir.split('_')
    entry_stem="AC01_"+name_parts[-2]+"_"+name_parts[-1]
    entry_file=entry_stem+".txt"
    entry_file_zip=entry_stem+".rar"

    # Build every path from base_dir so the function follows the selected user.
    entry_path=base_dir+'{}/'.format(entry_dir)
    pyunpack.Archive(entry_path+entry_file_zip).extractall(entry_path)
    print('Entry file unzipped as:',entry_file)

    start = time.process_time()
    df_entry_file=dd.read_csv(entry_path+entry_file,sep='|',names=AC01_header_names,dtype=AC01_header_dtypes)
    print('Time taken to read:',time.process_time() - start,'s')
    df_entry_file['UPLOAD']=entry_file.split('.')[0]

    start = time.process_time()
    # globals().get tolerates first_time never having been assigned.
    if globals().get('first_time', False):
        df_database=df_entry_file
        # Later entries in the same run must extend, not replace, the database.
        first_time=False
    else:
        # UPLOAD differs between uploads by construction, so it is excluded
        # from the duplicate check; a plain merge() would be an inner join and
        # discard every record that is not already present.
        # NOTE(review): confirm which duplicate's UPLOAD value should be kept.
        compare_columns=[column for column in AC01_header_names if column!='UPLOAD']
        df_database=dd.concat([df_database,df_entry_file]).drop_duplicates(subset=compare_columns)
    print('Time taken to process:',time.process_time() - start,'s')

    return True

In [None]:
## Reading from new directory
# Feed each newly detected entry directory through update_database in turn.
for entry_dir in entry_dir_list:
    print('\nReading from entry dir:',entry_dir)
    update_ok=update_database(entry_dir)
    print('Update Success:',update_ok)

In [None]:
## Writing updated database to file
if not entry_dir_list:
    # Nothing was added, so there is nothing to rewrite.
    print('No new entries; database {} left unchanged'.format(database_file.split('/')[-1]))
else:
    # df_database may still lazily read from database_file, so write to a
    # temporary file first and swap it in only after the write has finished.
    database_tmp_file=database_file+'.tmp'
    # index=False keeps the partition index out of the CSV; 'assume_missing' is
    # a read_csv option and is not accepted when writing.
    df_database.to_csv(database_tmp_file,single_file=True,index=False)
    os.replace(database_tmp_file,database_file)
    print('Database {} updated'.format(database_file.split('/')[-1]))

In [None]:
## Update processed directory list
# Drop blank names (an empty dir_list.txt reads back as ['']) so the file
# contains only real directory names, one per line.
processed_dirs=[dir_name for dir_name in dir_list_old+entry_dir_list if dir_name]
with open(dir_list_file, 'w') as fd:
    fd.write('\n'.join(processed_dirs))
# Report success only after the file has actually been written.
print('Processed list updated')