#### Description

In [1]:
print('Author: Leo Pauly (cnlp@leeds.ac.uk) & Nick Wilson (n.wilson@lubs.leeds.ac.uk)')
print('Description: Autmatic database update')

Author: Leo Pauly (cnlp@leeds.ac.uk) & Nick Wilson (n.wilson@lubs.leeds.ac.uk)
Description: Autmatic database update


#### Imports

In [1]:
import os
import sys
import pandas as pd
import numpy as np
import pyreadstat
from zipfile import ZipFile
import pyunpack
import multiprocessing
import dask.dataframe as dd
import time
import dask
from dask.diagnostics import ProgressBar
from dask.distributed import Client
dask.config.set(scheduler='threads')

print('Python version:',sys.version)
num_processes = multiprocessing.cpu_count()
print('No: of logical CPU cores available:',num_processes)

Python version: 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]
No: of logical CPU cores available: 8


#### Reading sav files

In [2]:
df_database=pd.read_spss("../UKLTD_Database/MR01_Nk.sav")

In [3]:
header_list=df_database.columns.values

In [4]:
header_list_types = dict.fromkeys(header_list, "str")

In [5]:
print(header_list_types)

{'MR01': 'str', 'REGNUM': 'str', 'NUM': 'str', 'REGDATE': 'str', 'CREDATE': 'str', 'SATDATE': 'str', 'CHDATE': 'str', 'MEM': 'str', 'CHCODE': 'str', 'UPLOAD': 'str'}


In [6]:
{'SH03': 'str', 'PNR': 'float64', 'STITLE': 'str', 'SFORE': 'str', 'SSUR': 'str', 'SHTP': 'str', 'ADD1': 'str', 'ADD2': 'str', 'ADD3': 'str', 'ADD4': 'str', 'ADD5': 'str', 'PCODE': 'str', 'UPLOAD': 'str'}

{'SH03': 'str',
 'PNR': 'float64',
 'STITLE': 'str',
 'SFORE': 'str',
 'SSUR': 'str',
 'SHTP': 'str',
 'ADD1': 'str',
 'ADD2': 'str',
 'ADD3': 'str',
 'ADD4': 'str',
 'ADD5': 'str',
 'PCODE': 'str',
 'UPLOAD': 'str'}

#### Manually - I

In [7]:
checker={'CD03': 'str',
'ADDID': 'float64',
'ADD1': 'str',
'ADD2': 'str',
'ADD3': 'str',
'ADD4': 'str',
'ADD5': 'str',
'PCODE': 'str',
'CROF': 'str',
'PBOX': 'str',
'UPLOAD': 'str',}

In [8]:
print(checker)

{'CD03': 'str', 'ADDID': 'float64', 'ADD1': 'str', 'ADD2': 'str', 'ADD3': 'str', 'ADD4': 'str', 'ADD5': 'str', 'PCODE': 'str', 'CROF': 'str', 'PBOX': 'str', 'UPLOAD': 'str'}


#### Manually - II

In [33]:
def joinStrings(stringList):
    single_string=""
    for e in stringList:
        single_string = single_string + e + ','
    return single_string

In [37]:
checker=['\'AC05\': \'str\'','\'REGNUM\': \'str\'']

for i in range(1,157):
    checker.append('\'INSU{0:03}\': \'float64\''.format(i))

checker.append('\'UPLOAD\': \'str\'') 

checker_new=joinStrings(checker)
print(checker_new)

'AC04': 'str','REGNUM': 'str','FINA001': 'float64','FINA002': 'float64','FINA003': 'float64','FINA004': 'float64','FINA005': 'float64','FINA006': 'float64','FINA007': 'float64','FINA008': 'float64','FINA009': 'float64','FINA010': 'float64','FINA011': 'float64','FINA012': 'float64','FINA013': 'float64','FINA014': 'float64','FINA015': 'float64','FINA016': 'float64','FINA017': 'float64','FINA018': 'float64','FINA019': 'float64','FINA020': 'float64','FINA021': 'float64','FINA022': 'float64','FINA023': 'float64','FINA024': 'float64','FINA025': 'float64','FINA026': 'float64','FINA027': 'float64','FINA028': 'float64','FINA029': 'float64','FINA030': 'float64','FINA031': 'float64','FINA032': 'float64','FINA033': 'float64','FINA034': 'float64','FINA035': 'float64','FINA036': 'float64','FINA037': 'float64','FINA038': 'float64','FINA039': 'float64','FINA040': 'float64','FINA041': 'float64','FINA042': 'float64','FINA043': 'float64','FINA044': 'float64','FINA045': 'float64','FINA046': 'float64','FIN

In [35]:
checker_new

"'AC04': 'str','REGNUM': 'str','FINA000': 'float64','FINA001': 'float64','FINA002': 'float64','FINA003': 'float64','FINA004': 'float64','FINA005': 'float64','FINA006': 'float64','FINA007': 'float64','FINA008': 'float64','FINA009': 'float64','FINA010': 'float64','FINA011': 'float64','FINA012': 'float64','FINA013': 'float64','FINA014': 'float64','FINA015': 'float64','FINA016': 'float64','FINA017': 'float64','FINA018': 'float64','FINA019': 'float64','FINA020': 'float64','FINA021': 'float64','FINA022': 'float64','FINA023': 'float64','FINA024': 'float64','FINA025': 'float64','FINA026': 'float64','FINA027': 'float64','FINA028': 'float64','FINA029': 'float64','FINA030': 'float64','FINA031': 'float64','FINA032': 'float64','FINA033': 'float64','FINA034': 'float64','FINA035': 'float64','FINA036': 'float64','FINA037': 'float64','FINA038': 'float64','FINA039': 'float64','FINA040': 'float64','FINA041': 'float64','FINA042': 'float64','FINA043': 'float64','FINA044': 'float64','FINA045': 'float64','FI

#### Encoding Testing

In [9]:
import csv
df_entry_file=pd.read_csv('testing.txt',encoding='iso-8859-1',sep='|',quoting=csv.QUOTE_NONE)


In [10]:
df_entry_file

Unnamed: 0,ï»¿hello,hi,me
0,1,leo,leo
1,2,"""me","hi"""


In [11]:
 pyreadstat.write_sav(df_entry_file,'testing.sav',compress=False)