#### Description

In [1]:
print('Author:  Leo Pauly (cnlp@leeds.ac.uk) & Nick Wilson (n.wilson@lubs.leeds.ac.uk)')
print('Description: Autmatic database update: Conversion')

Author:  Leo Pauly (cnlp@leeds.ac.uk) & Nick Wilson (n.wilson@lubs.leeds.ac.uk)
Description: Autmatic database update: Conversion


#### Usage intructions & Other info

1. Change/Add User (user variable options: 'leo','nick')
2. Change file_formate variable (options: '.sav','.csv')
3. Prefefably install python using anacodna (all in one installation): https://www.anaconda.com/products/individual#windows
4. Run this if 'pyarrow' module is missing':  !pip install pyarrow 
5. Converted database are stored in the directory : UKLTD_Database 

#### Imports

In [2]:
import os
import sys
import pandas as pd
import numpy as np
import pyreadstat
from zipfile import ZipFile
import pyunpack
import multiprocessing
import dask.dataframe as dd
import time
import dask
from dask.diagnostics import ProgressBar
from dask.distributed import Client
dask.config.set(scheduler='threads')
import gc

print('Python version:',sys.version)
num_processes = multiprocessing.cpu_count()
print('No: of logical CPU cores available:',num_processes)

Python version: 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]
No: of logical CPU cores available: 8


In [3]:
## Selectting file format
file_format='sav' #(options: csv,sav)

## Selectting user
user='leo' #(user variable options: 'leo','nick')

## Adding filepaths
if(user=='leo'):
    base_dir='C:/Users/cnlp/Research Fellowship/'
elif(user=='nick'):
    base_dir='/Volumes/Pegasus32 R6/CreditSafe 2019 Zipped/'

os.makedirs(base_dir+'UKLTD_Database', exist_ok=True)
dir_list_file=base_dir+'/UKLTD_Scripts/dir_list.txt'
database_file_folder=base_dir+'UKLTD_Database/TD01/'
conv_database_file_folder=base_dir+'UKLTD_Database/TD01_convert/'

os.makedirs(conv_database_file_folder, exist_ok=True)
if (file_format=='csv'): 
    conv_database_file=conv_database_file_folder+'TD01.csv'
elif (file_format=='sav'): 
    conv_database_file=conv_database_file_folder+'TD01.sav'
else:
    print('File format not supported')

#### Loading database 

In [4]:
## If databse is in .csv format 
if (os.path.exists(database_file_folder)):
    print('Database found.','Reading database! \n') 
    start = time.process_time()
    with ProgressBar():
        df_database = dd.read_parquet(database_file_folder) 
    print('Time taken to read database {}:'.format(database_file_folder),time.process_time() - start,'s')
    database_missing=False
else:
    print('Database missing.  \n')
    exit()
    
print('No: of partitions in the database:',df_database.npartitions)

Database found. Reading database! 

Time taken to read database C:/Users/cnlp/Research Fellowship/UKLTD_Database/TD01/: 0.078125 s
No: of partitions in the database: 4


#### Converting and writing into disk

In [5]:
## Writing converted databse to file
import os
if os.path.exists(conv_database_file):
    os.remove(conv_database_file)
if (file_format=='csv'):
    start = time.process_time()
    with ProgressBar():
        df_database.to_csv(conv_database_file,single_file=True)
    print('Time taken to write:',time.process_time() - start,'s')
elif (file_format=='sav'):
    start = time.process_time()
    with ProgressBar():
        pyreadstat.write_sav(df_database.compute(),conv_database_file,compress=False)
    print('Saved file path:',conv_database_file)
    print('Time taken to write:',time.process_time() - start,'s')

[########################################] | 100% Completed |  0.7s
Saved file path: C:/Users/cnlp/Research Fellowship/UKLTD_Database/TD01_convert/TD01.sav
Time taken to write: 8.046875 s
