# DBF file to MySQL for Distcust File

## 1. Load DBF to Pandas DataFrame

In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
print(f'pandas version: {pd.__version__}')

pd.set_option('max_columns', 500)
# pd.set_option('max_rows', 10)

pandas version: 1.1.3


In [2]:
"""
Load content of a DBF file into a Pandas data frame.

The iter() is required because Pandas doesn't detect that the DBF
object is iterable.
"""

from dbfread import DBF

In [3]:
setPath = input('Please enter path for year file (default as G:/relia/custbase/common/): ') or 'G:/relia/custbase/common/'
# yearSelected = input('Please insert year (default as this year) : ') or str(datetime.now().year)
print(f'Path : {setPath}')
# print(f'Year : {yearSelected}')
fullPath = setPath + 'distcust.dbf'
print(f'\nFull path : {fullPath}')

Please enter path for year file (default as G:/relia/custbase/common/): 
Path : G:/relia/custbase/common/

Full path : G:/relia/custbase/common/distcust.dbf


In [4]:
dbf = DBF(fullPath)
df = pd.DataFrame(iter(dbf))

In [5]:
total_records = df.shape[0]
df.tail()

Unnamed: 0,MONTH,YEAR,DISTRICT,NOCUS,D,REMARK,_NullFlags
3200,10,2020,15,235280,,,b'\x00'
3201,10,2020,16,164325,,,b'\x00'
3202,10,2020,17,188965,,,b'\x00'
3203,10,2020,18,247330,,,b'\x00'
3204,10,2020,99,4033298,,,b'\x00'


## - Import module for connecting to MySQL

In [6]:
from sqlalchemy import create_engine # conda install sqlalchemy
import pymysql # conda install pymysql

In [7]:
print(f'pymysql version: {pymysql.__version__}')

pymysql version: 0.10.1


### - create a connection to MySQL

In [8]:
uid = 'reliability'
pwd = 'pcd_db'
host = '10.99.1.36'
# uid = 'root'
# pwd = 'admin'
# host = 'localhost'
# uid = 'root'
# pwd = ''
# host = 'localhost'
port = 3306
db = 'statistics_database'
con_string = f'mysql+pymysql://{uid}:{pwd}@{host}:{port}/{db}'
print(f'connection string = {con_string}')
con = create_engine(con_string)

connection string = mysql+pymysql://reliability:pcd_db@10.99.1.36:3306/statistics_database


In [9]:
# con.table_names()

## 2. Cleand data in distcust table

In [10]:
# import Javascript for working with browser
# from IPython.display import Javascript
# Javascript("alert();")

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3205 entries, 0 to 3204
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MONTH       3205 non-null   int64  
 1   YEAR        3205 non-null   int64  
 2   DISTRICT    3205 non-null   int64  
 3   NOCUS       3205 non-null   int64  
 4   D           732 non-null    float64
 5   REMARK      3205 non-null   object 
 6   _NullFlags  3205 non-null   object 
dtypes: float64(1), int64(4), object(2)
memory usage: 175.4+ KB


### 2.1 Remove _NullFlags column

In [12]:
df.drop('_NullFlags', axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3205 entries, 0 to 3204
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MONTH     3205 non-null   int64  
 1   YEAR      3205 non-null   int64  
 2   DISTRICT  3205 non-null   int64  
 3   NOCUS     3205 non-null   int64  
 4   D         732 non-null    float64
 5   REMARK    3205 non-null   object 
dtypes: float64(1), int64(4), object(1)
memory usage: 150.4+ KB


### 2.1 Remove year <= 2006, 20060 and 1008

In [13]:
df = df[~((df['YEAR'] <= 2006) | (df['YEAR'] == 20060) | (df['YEAR'] == 1008))]

### 2.2 Replace all '' with NaN

In [14]:
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

In [15]:
# df.tail()

## 4. Export and check to MySQL

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3173 entries, 31 to 3204
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MONTH     3173 non-null   int64  
 1   YEAR      3173 non-null   int64  
 2   DISTRICT  3173 non-null   int64  
 3   NOCUS     3173 non-null   int64  
 4   D         700 non-null    float64
 5   REMARK    38 non-null     object 
dtypes: float64(1), int64(4), object(1)
memory usage: 173.5+ KB


In [17]:
# Replace field names
df.columns = pd.read_sql('select * from discust limit 1', con).columns[1:]
df.columns

Index(['month', 'year', 'district', 'nocus', 'd', 'remark'], dtype='object')

In [18]:
def insertToDB(year, month, tableName):
    try:
        print()
        print(f'Insert data to "{tableName}" table for {year}/{month}')
        df[(df['year'] == year) & (df['month'] == month)].to_sql(tableName, con, if_exists='append', index=False)
        print('Success^^')
    except:
        print("May be data error then they can't be imported to MySQL !!!")
        raise

In [19]:
def checkNumberRecords(year, month, tableName):
    rawDataRecords = df[(df['year'] == year) & (df['month'] == month)]['district'].count()
    print(f'Raw {year}/{month} data records : {rawDataRecords}')
    sql = f'select count(*) from {tableName} where year={year} and month={month}'
    dbDataRecords = con.execute(sql).fetchall()[0][0]
    print()
    print('Check number of records')
    print(f'{tableName} table {year}/{month} data records : {dbDataRecords}')
    if rawDataRecords == dbDataRecords:
        print(f'Successfully append to {tableName} ^^')
    else:
        print(f'Unsuccessfullt append to {tableName}, please resolve errors !!!')

In [20]:
currentYear = datetime.now().year
selectYear = int(input(f'Please insert year (default as {currentYear}) : ') or currentYear)
lastestYear = int(df[-1:]['year'].values[0])
oldestYear = int(df[:1]['year'].values[0])
lastestMonthInYear = int(df[-1:]['month'].values[0])
print()
if (selectYear >= oldestYear) and (selectYear <= lastestYear):
    selectMonthText = lastestMonthInYear if selectYear == lastestYear else 12
    selectMonth = int(input(f'Please select month between 1 to {selectMonthText}\nSelect month here : ') or '0')
    if selectMonth > 0 and selectMonth <= selectMonthText:
        insertToDB(selectYear, selectMonth, 'discust')
        checkNumberRecords(selectYear, selectMonth, 'discust')
        selectYear = None
        selectMonth = None
    else:
        print(f'Please run this cell again then select month between 1 and {selectMonthText} for year {selectYear} !!!')
else:
    print(f'Please run this cell again then select year between {oldestYear} and {lastestYear}')

Please insert year (default as 2020) : 

Please select month between 1 to 10
Select table here : 11
Please run this cell again then select month between 1 and 10 for year 2020 !!!


## Addition

### Check lastest year/month data in discust table

In [21]:
sql = f'select year, month from discust where year >= 2007 and year != 20060 order by year desc, month desc limit 1'
print(f'Lastest data of distcust table (year/month) : {con.execute(sql).fetchall()[0][0]}/{con.execute(sql).fetchall()[0][1]}')

Lastest data of distcust table (year/month) : 2020/10
