# DBF file to MySQL for Year File

## 1. Load DBF to Pandas DataFrame

In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
print(f'pandas version: {pd.__version__}')

# Use the fully qualified option name: the short pattern 'max_columns' matches
# more than one option on newer pandas releases and then raises OptionError.
pd.set_option('display.max_columns', 500)
# pd.set_option('max_rows', 10)

pandas version: 1.0.1


In [2]:
"""
Load content of a DBF file into a Pandas data frame.

The iter() is required because Pandas doesn't detect that the DBF
object is iterable.
"""

from dbfread import DBF

In [3]:
# Ask where the yearly DBF file lives and which year to load.
# Pressing Enter at either prompt accepts the default shown in the prompt.
setPath = input('Please enter path for year file (default as G:/database/): ').strip() or 'G:/database/'
yearSelected = input('Please insert year (default as this year) : ').strip() or str(datetime.now().year)

# Later cells call int(yearSelected), so reject a non-numeric year right away
# instead of failing much later during the database check.
if not yearSelected.isdigit():
    raise ValueError(f'Year must contain digits only, got {yearSelected!r}')

# The file name is appended by plain concatenation, so make sure the folder
# ends with a separator (otherwise "G:/database" would become "G:/databaseyear2020.dbf").
if not setPath.endswith(('/', '\\')):
    setPath += '/'

print(f'Path : {setPath}')
print(f'Year : {yearSelected}')
fullPath = setPath + 'year'+ yearSelected +'.dbf'
print(f'\nFull path : {fullPath}')

Please enter path for year file (default as G:/database/): 
Please insert year (default as this year) : 
Path : G:/database/
Year : 2020

Full path : G:/database/year2020.dbf


In [4]:
# Open the selected DBF file and load every record into a DataFrame.
# iter() is needed because pandas does not detect that the DBF object is iterable.
dbf = DBF(fullPath)
df = pd.DataFrame(iter(dbf))

In [5]:
# Keep the number of loaded rows; the date-conversion cells print it next to
# their NaT / valid-date counts.
total_records = df.shape[0]
# Show the last rows as a quick sanity check of the load.
df.tail()

Unnamed: 0,ID,DATE,DAY,MONTH,NEW_MONTH,NEW_MONTH2,YEAR,DOW,TYPE_DAY,WEATHER,TEMPERATUR,SEASON,ABB,FEEDER,AREA,NIKOM,LINE,CB,TYPE_LINE,ABNORMAL,KV,UNIT,EVENT,FROM,TO,PERIOD,TO_DATE,AMP,NAMP,TO1,TO2,TO3,TO4,AMP1,AMP2,AMP3,AMP4,TOCB1,TOCB_DATE,TOCB2,TOCB3,TOCB4,NEW_CODE,GROUP,CODE,TYPE,NEPO_GROUP,OLD_DISTRI,DISTRICT,POLE,LATERAL,ROAD,RELAY,COMPONENT,DEVICE,DEV_TYPE,PHASE,FAULT_TYPE,TIMES,LENGTH,TIMEOCB,TIMEFUSE,TIMEREPAIR,KVA_HR,LOSS,MW,BATH,TIME_EQ,MAJOR,CONTROL
13921,1326,2020-10-31,31,10,10,10,2020,7,,R,0.0,,BZ,BZ-421,C,N,,,,,24.0,D,I,17:02,17:02,,2020-10-31,120,120,,,,,0,0,0,0,,2020-10-31,,,,11,F,20,TF,70.0,,3,,,,"OCI-R,EFI,RR",,,,R,,,0,0,0,0,0,0,0.0,0.0,0.0,,C
13922,1327,2020-10-31,31,10,10,10,2020,7,,R,0.0,,PE,PE-411,C,N,,,,,24.0,C,O,17:47,17:47,,2020-10-31,90,90,,,,,0,0,0,0,,2020-10-31,,,,191,F,181,CO,80.0,,6,,HT.85-048292,,"OCI-YB,EFI,RR",C07,,,YB,,,0,0,0,0,0,0,0.0,0.0,0.0,,U
13923,1328,2020-10-31,31,10,10,10,2020,7,,R,0.0,,CK,CK-413,C,N,,,,,24.0,D,I,19:56,19:56,,2020-10-31,38,38,,,,,0,0,0,0,,2020-10-31,,,,53,F,20,TF,70.0,,14,,SOI PHAHOLYOTHIN 21,,"OCI-RY,EFI-T,RR",C07,,,RY,,,0,0,0,0,0,0,0.0,0.0,0.0,,C
13924,1329,2020-10-31,31,10,10,10,2020,7,,N,0.0,,KSK,KSK-413,S,N,,,,,24.0,D,I,20:32,20:32,,2020-10-31,130,130,,,,,0,0,0,0,,2020-10-31,,,,11,F,20,TF,70.0,,15,,,,"OCI,EFI,RR",,,,,,,0,0,0,0,0,0,0.0,0.0,0.0,,C
13925,1330,2020-10-31,31,10,10,10,2020,7,,N,0.0,,RN,RN-438,C,N,,,,,24.0,D,I,23:51,23:51,,2020-10-31,80,80,,,,,0,0,0,0,,2020-10-31,,,,11,F,20,TF,70.0,,7,,,,"TARGET NOT SHOWN,RR",,,,,,,0,0,0,0,0,0,0.0,0.0,0.0,,C


## - Import module for connecting to MySQL

In [6]:
from sqlalchemy import create_engine # conda install sqlalchemy
import pymysql # conda install pymysql

In [7]:
# Record the driver version used for this run.
print(f'pymysql version: {pymysql.__version__}')

pymysql version: 0.10.1


### - create a connection to MySQL

In [8]:
# Connection settings are read from environment variables so that no
# credentials are stored in the notebook. The fallbacks target a local
# development server (root user, empty password).
#   STAT_DB_USER, STAT_DB_PASSWORD, STAT_DB_HOST, STAT_DB_PORT, STAT_DB_NAME
import os
from urllib.parse import quote_plus

uid = os.environ.get('STAT_DB_USER', 'root')
pwd = os.environ.get('STAT_DB_PASSWORD', '')
host = os.environ.get('STAT_DB_HOST', 'localhost')
port = int(os.environ.get('STAT_DB_PORT', '3306'))
db = os.environ.get('STAT_DB_NAME', 'statistics_database')

# quote_plus keeps characters such as '@' or ':' in the user name / password
# from breaking the URL.
con_string = f'mysql+pymysql://{quote_plus(uid)}:{quote_plus(pwd)}@{host}:{port}/{db}'

# Never echo the real password into the notebook output.
maskedPwd = '***' if pwd else ''
print(f'connection string = mysql+pymysql://{uid}:{maskedPwd}@{host}:{port}/{db}')
con = create_engine(con_string)

connection string = mysql+pymysql://root:@localhost:3306/statistics_database


In [9]:
# con.table_names()

## 2. Change the data type of each field

In [10]:
# import Javascript for working with browser
# from IPython.display import Javascript
# Javascript("alert();")

In [11]:
# df.info()

### 2.1 DATE field to datetime

In [12]:
# ISO date pattern YYYY-MM-DD. The month and day alternatives are grouped;
# without the groups the top-level "|" splits the pattern into unrelated
# alternatives (e.g. just "[12]\d") and almost any value matches.
regex = r'^[12]\d{3}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])'
# List the distinct DATE values that do not look like a date.
df[~(df['DATE'].astype(str).str.contains(regex))]['DATE'].unique()

array([], dtype=object)

In [13]:
# Parse DATE as datetime; values that cannot be parsed become NaT.
df['DATE'] = pd.to_datetime(df['DATE'], errors='coerce')
# df.info()
dateIsNat = df['DATE'].isna()
natRecords = df.loc[dateIsNat, 'ID'].count()
dateRecords = df.loc[~dateIsNat, 'ID'].count()
# The two counts must add up to the number of loaded records.
print(f'Total records : {total_records}')
print('NaT records : %d' % natRecords)
print('Date records : %d' % dateRecords)
print('SUM records : %d' % (natRecords + dateRecords))

Total records : 13926
NaT records : 0
Date records : 13926
SUM records : 13926


### 2.2 TO_DATE field to datetime

In [14]:
# ISO date pattern YYYY-MM-DD. The month and day alternatives are grouped;
# without the groups the top-level "|" splits the pattern into unrelated
# alternatives (e.g. just "[12]\d") and almost any value matches.
regex = r'^[12]\d{3}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])'
# List the distinct TO_DATE values that do not look like a date.
df[~(df['TO_DATE'].astype(str).str.contains(regex))]['TO_DATE'].unique()

array([None], dtype=object)

In [15]:
# Parse TO_DATE as datetime; values that cannot be parsed become NaT.
df['TO_DATE'] = pd.to_datetime(df['TO_DATE'], errors='coerce')
# df.info()
toDateIsNat = df['TO_DATE'].isna()
natRecords = df.loc[toDateIsNat, 'ID'].count()
dateRecords = df.loc[~toDateIsNat, 'ID'].count()
# The two counts must add up to the number of loaded records.
print(f'Total records : {total_records}')
print('NaT records : %d' % natRecords)
print('Date records : %d' % dateRecords)
print('SUM records : %d' % (natRecords + dateRecords))

Total records : 13926
NaT records : 3163
Date records : 10763
SUM records : 13926


In [16]:
# indexing = df[df['TO_DATE'].isna()].index
# print(indexing)
# df.loc[indexing, 'TO_DATE'] = None
# df.loc[indexing, 'TO_DATE']
# df[df['TO_DATE'].isna()]['ID'].count()

### 2.3 TOCB_DATE field to datetime

In [17]:
# ISO date pattern YYYY-MM-DD. The month and day alternatives are grouped;
# without the groups the top-level "|" splits the pattern into unrelated
# alternatives (e.g. just "[12]\d") and almost any value matches.
regex = r'^[12]\d{3}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])'
# List the distinct TOCB_DATE values that do not look like a date.
df[~(df['TOCB_DATE'].astype(str).str.contains(regex))]['TOCB_DATE'].unique()

array([None], dtype=object)

In [18]:
# Parse TOCB_DATE as datetime; values that cannot be parsed become NaT.
df['TOCB_DATE'] = pd.to_datetime(df['TOCB_DATE'], errors='coerce')
# df.info()
tocbDateIsNat = df['TOCB_DATE'].isna()
natRecords = df.loc[tocbDateIsNat, 'ID'].count()
dateRecords = df.loc[~tocbDateIsNat, 'ID'].count()
# The two counts must add up to the number of loaded records.
print(f'Total records : {total_records}')
print('NaT records : %d' % natRecords)
print('Date records : %d' % dateRecords)
print('SUM records : %d' % (natRecords + dateRecords))

Total records : 13926
NaT records : 3168
Date records : 10758
SUM records : 13926


In [19]:
# df.dtypes.value_counts()

## 3. Check mismatched data in each field

In [20]:
# df.info()

### 3.1 Weather

In [21]:
# WEATHER may only be 'N' or 'R'; anything else is shown and then reset to 'N'.
indexList = df.index[(~df['WEATHER'].isin(['N', 'R'])).values]
if len(indexList) == 0:
    print('PASS^^ : Weahter data have already complete.')
else:
    # show the offending values before and after the correction
    print(df.loc[indexList, 'WEATHER'])
    df.loc[indexList, 'WEATHER'] = 'N'
    print(df.loc[indexList, 'WEATHER'])

PASS^^ : Weahter data have already complete.


### 3.2 ABB

In [22]:
# ABB may be empty only for records whose GROUP is 'E'.
emptyAbbOutsideGroupE = (df['ABB'] == '') & (df['GROUP'] != 'E')
if emptyAbbOutsideGroupE.any():
    print('Please check empty ABB records !!!')
else:
    print('Pass^^ : No empty ABB records for non "E" GROUP')

Pass^^ : No empty ABB records for non "E" GROUP


In [23]:
# Group "F" records: ABB must appear inside FEEDER (e.g. ABB "BZ" in "BZ-421").
# Mismatching rows are collected with one boolean mask rather than by calling
# DataFrame.append once per row (quadratic, and removed in pandas 2.0).
feederRows = df.loc[df['GROUP'].isin(['F']), ['ABB', 'FEEDER']]
isMissmatch = pd.Series(
    [abb not in feeder for abb, feeder in feederRows.itertuples(index=False, name=None)],
    index=feederRows.index, dtype=bool)
missmatchAbbFeeder = feederRows[isMissmatch]

if len(missmatchAbbFeeder) > 0:
    print('Please check group "F" records that "ABB" miss match with "FEEDER" !!!')
    print(missmatchAbbFeeder)
    missmatchAbbFeederIdx = missmatchAbbFeeder.index
    # Correct ABB with the part of FEEDER that precedes the first '-'
    df.loc[missmatchAbbFeederIdx, 'ABB'] = missmatchAbbFeeder['FEEDER'].str.split('-', n=1).str[0]
    print('They have already corrected^^')
    print(df.loc[missmatchAbbFeederIdx, ['ABB', 'FEEDER']])
else:
    print('Pass^^ : All group "F" records, "ABB" match with "FEEDER"')

Pass^^ : All group "F" records, "ABB" match with "FEEDER"


In [24]:
# Check ABB is not in CB fields
missmatchAbbStaionH = pd.DataFrame() # StationH = Station and 'H' event
missmatchAbbStationF = pd.DataFrame() # StationF = Station and 'I, O' event
stationRows = df[df['GROUP'].isin(['S'])][['ABB', 'FEEDER', 'CB', 'EVENT']]
for idx, row in stationRows.iterrows():
    if row['EVENT'] == 'H':
        if row['ABB'] not in row['CB']:
            missmatchAbbStaionH = missmatchAbbStaionH.append(row)
    else: # row['EVENT'] != 'H'
        if row['ABB'] not in row['FEEDER']:
            missmatchAbbStationF = missmatchAbbStationF.append(row)

if len(missmatchAbbStaionH) > 0:
    print('Please check group "S" records that "ABB" miss match with "CB" !!!')
    print(missmatchAbbStaionH)
else:
    print('Pass^^ : All group "S" records, "ABB" match with "CB"')
    
if len(missmatchAbbStationF) > 0:
    print('Please check group "S" records that "ABB" miss match with "FEEDER" !!!')
    print(missmatchAbbStationF)
    missmatchAbbStationFIdx = missmatchAbbStationF.index
    for idx, row in missmatchAbbStationF.iterrows():
        df.loc[idx, 'ABB'] = row['FEEDER'].split('-',1)[0]
    print('They have already corrected^^')
    print(df.loc[missmatchAbbStationFIdx, ['ABB', 'CB', 'EVENT', 'FEEDER']])
else:
    print('Pass^^ : All group "S" records, "ABB" match with "FEEDER"')

Pass^^ : All group "S" records, "ABB" match with "CB"
Pass^^ : All group "S" records, "ABB" match with "FEEDER"


In [25]:
# df.loc[12507:12509, 'ABB'] = 'WPC'

In [26]:
# Check ABB is not in LINE fields
missmatchAbbLineH = pd.DataFrame() # LineH = Line and 'H' event
missmatchAbbLineF = pd.DataFrame() # LineH = Line and 'I, O' event
lineRows = df[df['GROUP'].isin(['L'])][['ABB', 'FEEDER', 'LINE', 'EVENT']]
for idx, row in lineRows.iterrows():
    if row['EVENT'] == 'H':
        if row['ABB'] not in row['LINE']:
            missmatchAbbLineH = missmatchAbbLineH.append(row)
    else: # row['EVENT'] != 'H'
        if row['ABB'] not in row['FEEDER']:
            missmatchAbbLineF = missmatchAbbLineF.append(row)

if len(missmatchAbbLineH) > 0:
    print('Please check group "L" records that "ABB" miss match with "LINE" !!!')
    print(missmatchAbbLineH)
else:
    print('Pass^^ : All group "L" records, "ABB" match with "LINE"')
    
if len(missmatchAbbLineF) > 0:
    print('Please check group "L" records that "ABB" miss match with "FEEDER" !!!')
    print(missmatchAbbLineF)
    missmatchAbbLineFIdx = missmatchAbbLineF.index
    for idx, row in missmatchAbbLineF.iterrows():
        df.loc[idx, 'ABB'] = row['FEEDER'].split('-',1)[0]
    print('They have already corrected^^')
    print(df.loc[missmatchAbbLineFIdx, ['ABB', 'EVENT', 'FEEDER', 'LINE']])
else:
    print('Pass^^ : All group "L" records, "ABB" match with "FEEDER"')

Pass^^ : All group "L" records, "ABB" match with "LINE"
Please check group "L" records that "ABB" miss match with "FEEDER" !!!
       ABB EVENT   FEEDER     LINE
12509  WPC     I  WKS-411  SKT-695
They have already corrected^^
       ABB EVENT   FEEDER     LINE
12509  WKS     I  WKS-411  SKT-695


### 3.3 FROM

In [27]:
# Valid clock time: H:MM or HH:MM, hours 0-23, minutes 00-59.
# The hour alternatives are grouped so that the anchors and ":MM" apply to
# every form; ungrouped, any value that merely starts with a digit passes.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
# na=False: a missing FROM counts as a mismatch (and keeps "~" working on a clean boolean mask)
isValid = df['FROM'].str.contains(regex, na=False).astype(bool)
missmatchFrom = df.loc[~isValid, 'FROM']
if len(missmatchFrom) > 0:
    print('Please check "FROM" miss match records !!!')
    # show which records need attention
    print(missmatchFrom)
else:
    print('Pass^^ : All "FROM" records have already correct')

Pass^^ : All "FROM" records have already correct


### 3.4 TO

In [28]:
# Valid clock time: H:MM or HH:MM, hours 0-23, minutes 00-59.
# The hour alternatives are grouped so that the anchors and ":MM" apply to
# every form; ungrouped, any value that merely starts with a digit passes.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
# na=True: values already cleared to None (e.g. when this cell is re-run) are not reported again
isValid = df['TO'].str.contains(regex, na=True).astype(bool)
missmatchTo = df.loc[~isValid, 'TO']
if len(missmatchTo) > 0:
    print('Please check "TO" miss match records !!!')
    missmatchToIdx = missmatchTo.index
    print(missmatchTo)
    # invalid times are cleared
    df.loc[missmatchToIdx, 'TO'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchToIdx, 'TO'])
else:
    print('Pass^^ : All "TO" records have already correct')

Please check "TO" miss match records !!!
8136    **:**
8137    **:**
8675    **:**
Name: TO, dtype: object
They have already corrected^^
8136    None
8137    None
8675    None
Name: TO, dtype: object


### 3.5 TO1

In [29]:
# Valid clock time: H:MM or HH:MM, hours 0-23, minutes 00-59.
# The hour alternatives are grouped so that the anchors and ":MM" apply to
# every form; ungrouped, any value that merely starts with a digit passes.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
# Empty strings and missing values mean "no time recorded" and are not errors
# (na=True also keeps the cell re-runnable after values were cleared to None).
isValid = df['TO1'].str.contains(regex, na=True).astype(bool)
missmatchTo1 = df.loc[~isValid & (df['TO1'] != ''), 'TO1']
if len(missmatchTo1) > 0:
    print('Please check "TO1" miss match records !!!')
    missmatchTo1Idx = missmatchTo1.index
    print(missmatchTo1)
    # invalid times are cleared
    df.loc[missmatchTo1Idx, 'TO1'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTo1Idx, 'TO1'])
else:
    print('Pass^^ : All "TO1" records have already correct')

Please check "TO1" miss match records !!!
2007      :
9118      :
Name: TO1, dtype: object
They have already corrected^^
2007    None
9118    None
Name: TO1, dtype: object


### 3.6 TO2

In [30]:
# Valid clock time: H:MM or HH:MM, hours 0-23, minutes 00-59.
# The hour alternatives are grouped so that the anchors and ":MM" apply to
# every form; ungrouped, any value that merely starts with a digit passes.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
# Empty strings and missing values mean "no time recorded" and are not errors
# (na=True also keeps the cell re-runnable after values were cleared to None).
isValid = df['TO2'].str.contains(regex, na=True).astype(bool)
missmatchTo2 = df.loc[~isValid & (df['TO2'] != ''), 'TO2']
if len(missmatchTo2) > 0:
    print('Please check "TO2" miss match records !!!')
    missmatchTo2Idx = missmatchTo2.index
    print(missmatchTo2)
    # invalid times are cleared
    df.loc[missmatchTo2Idx, 'TO2'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTo2Idx, 'TO2'])
else:
    print('Pass^^ : All "TO2" records have already correct')

Pass^^ : All "TO2" records have already correct


### 3.7 TO3

In [31]:
# Valid clock time: H:MM or HH:MM, hours 0-23, minutes 00-59.
# The hour alternatives are grouped so that the anchors and ":MM" apply to
# every form; ungrouped, any value that merely starts with a digit passes.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
# Empty strings and missing values mean "no time recorded" and are not errors
# (na=True also keeps the cell re-runnable after values were cleared to None).
isValid = df['TO3'].str.contains(regex, na=True).astype(bool)
missmatchTo3 = df.loc[~isValid & (df['TO3'] != ''), 'TO3']
if len(missmatchTo3) > 0:
    print('Please check "TO3" miss match records !!!')
    missmatchTo3Idx = missmatchTo3.index
    print(missmatchTo3)
    # invalid times are cleared
    df.loc[missmatchTo3Idx, 'TO3'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTo3Idx, 'TO3'])
else:
    print('Pass^^ : All "TO3" records have already correct')

Pass^^ : All "TO3" records have already correct


### 3.8 TO4

In [32]:
# Valid clock time: H:MM or HH:MM, hours 0-23, minutes 00-59.
# The hour alternatives are grouped so that the anchors and ":MM" apply to
# every form; ungrouped, any value that merely starts with a digit passes.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
# Empty strings and missing values mean "no time recorded" and are not errors
# (na=True also keeps the cell re-runnable after values were cleared to None).
isValid = df['TO4'].str.contains(regex, na=True).astype(bool)
missmatchTo4 = df.loc[~isValid & (df['TO4'] != ''), 'TO4']
if len(missmatchTo4) > 0:
    print('Please check "TO4" miss match records !!!')
    missmatchTo4Idx = missmatchTo4.index
    print(missmatchTo4)
    # invalid times are cleared
    df.loc[missmatchTo4Idx, 'TO4'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTo4Idx, 'TO4'])
else:
    print('Pass^^ : All "TO4" records have already correct')

Pass^^ : All "TO4" records have already correct


### 3.9 TOCB1

In [33]:
# Valid clock time: H:MM or HH:MM, hours 0-23, minutes 00-59.
# The hour alternatives are grouped so that the anchors and ":MM" apply to
# every form; ungrouped, any value that merely starts with a digit passes.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
# Empty strings and missing values mean "no time recorded" and are not errors
# (na=True also keeps the cell re-runnable after values were cleared to None).
isValid = df['TOCB1'].str.contains(regex, na=True).astype(bool)
missmatchTocb1 = df.loc[~isValid & (df['TOCB1'] != ''), 'TOCB1']
if len(missmatchTocb1) > 0:
    print('Please check "TOCB1" miss match records !!!')
    missmatchTocb1Idx = missmatchTocb1.index
    print(missmatchTocb1)
    # invalid times are cleared
    df.loc[missmatchTocb1Idx, 'TOCB1'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTocb1Idx, 'TOCB1'])
else:
    print('Pass^^ : All "TOCB1" records have already correct')

Please check "TOCB1" miss match records !!!
481          :
523          :
743          :
819          :
879          :
1704         :
1795         :
1968         :
7338     **:**
8095         :
8149         :
8206         :
8492         :
8601         :
8631         :
8827         :
9233         :
9415         :
9421         :
9836         :
9872         :
10210        :
10236        :
10377        :
10699        :
10790        :
10893        :
11185        :
11198    **:**
11199        :
11333        :
11738        :
11964        :
12008    **:**
12127        :
12331        :
12526        :
12694        :
12759        :
12767        :
Name: TOCB1, dtype: object
They have already corrected^^
481      None
523      None
743      None
819      None
879      None
1704     None
1795     None
1968     None
7338     None
8095     None
8149     None
8206     None
8492     None
8601     None
8631     None
8827     None
9233     None
9415     None
9421     None
9836     None
9872     None
10210

### 3.10 TOCB2

In [34]:
# Valid clock time: H:MM or HH:MM, hours 0-23, minutes 00-59.
# The hour alternatives are grouped so that the anchors and ":MM" apply to
# every form; ungrouped, any value that merely starts with a digit passes.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
# Empty strings and missing values mean "no time recorded" and are not errors
# (na=True also keeps the cell re-runnable after values were cleared to None).
isValid = df['TOCB2'].str.contains(regex, na=True).astype(bool)
missmatchTocb2 = df.loc[~isValid & (df['TOCB2'] != ''), 'TOCB2']
if len(missmatchTocb2) > 0:
    print('Please check "TOCB2" miss match records !!!')
    missmatchTocb2Idx = missmatchTocb2.index
    print(missmatchTocb2)
    # invalid times are cleared
    df.loc[missmatchTocb2Idx, 'TOCB2'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTocb2Idx, 'TOCB2'])
else:
    print('Pass^^ : All "TOCB2" records have already correct')

Please check "TOCB2" miss match records !!!
9649         :
10504    **:**
11266    **:**
Name: TOCB2, dtype: object
They have already corrected^^
9649     None
10504    None
11266    None
Name: TOCB2, dtype: object


### 3.11 TOCB3

In [35]:
# Valid clock time: H:MM or HH:MM, hours 0-23, minutes 00-59.
# The hour alternatives are grouped so that the anchors and ":MM" apply to
# every form; ungrouped, any value that merely starts with a digit passes.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
# Empty strings and missing values mean "no time recorded" and are not errors
# (na=True also keeps the cell re-runnable after values were cleared to None).
isValid = df['TOCB3'].str.contains(regex, na=True).astype(bool)
missmatchTocb3 = df.loc[~isValid & (df['TOCB3'] != ''), 'TOCB3']
if len(missmatchTocb3) > 0:
    print('Please check "TOCB3" miss match records !!!')
    missmatchTocb3Idx = missmatchTocb3.index
    print(missmatchTocb3)
    # invalid times are cleared
    df.loc[missmatchTocb3Idx, 'TOCB3'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTocb3Idx, 'TOCB3'])
else:
    print('Pass^^ : All "TOCB3" records have already correct')

Please check "TOCB3" miss match records !!!
9342     **:**
9394     **:**
11604        :
Name: TOCB3, dtype: object
They have already corrected^^
9342     None
9394     None
11604    None
Name: TOCB3, dtype: object


### 3.12 TOCB4

In [36]:
# Valid clock time: H:MM or HH:MM, hours 0-23, minutes 00-59.
# The hour alternatives are grouped so that the anchors and ":MM" apply to
# every form; ungrouped, any value that merely starts with a digit passes.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
# Empty strings and missing values mean "no time recorded" and are not errors
# (na=True also keeps the cell re-runnable after values were cleared to None).
isValid = df['TOCB4'].str.contains(regex, na=True).astype(bool)
missmatchTocb4 = df.loc[~isValid & (df['TOCB4'] != ''), 'TOCB4']
if len(missmatchTocb4) > 0:
    print('Please check "TOCB4" miss match records !!!')
    missmatchTocb4Idx = missmatchTocb4.index
    print(missmatchTocb4)
    # invalid times are cleared
    df.loc[missmatchTocb4Idx, 'TOCB4'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTocb4Idx, 'TOCB4'])
else:
    print('Pass^^ : All "TOCB4" records have already correct')

Please check "TOCB4" miss match records !!!
2168     **:**
2638     **:**
3370     **:**
10655    **:**
Name: TOCB4, dtype: object
They have already corrected^^
2168     None
2638     None
3370     None
10655    None
Name: TOCB4, dtype: object


### 3.13 NEW_CODE

In [37]:
# con.table_names()
# df[0:1] -> new_code = '053'
# df.loc[0, 'NEW_CODE'] = '053'

In [38]:
# Every NEW_CODE must exist as a sub_code in the nw_cause lookup table.
sql = 'select sub_code from nw_cause'
nw_cause_sub_code = pd.read_sql(sql, con).iloc[:, 0].values
newCodeIsKnown = df['NEW_CODE'].isin(nw_cause_sub_code)
missmatchNew_code = df.loc[
    ~newCodeIsKnown,
    ['DATE', 'FEEDER', 'FROM', 'LINE', 'CB', 'GROUP', 'DISTRICT', 'NEW_CODE', 'COMPONENT']]
if missmatchNew_code.empty:
    print('Pass^^ : All "NEW_CODE" records have already correct')
else:
    print('Please check "NEW_CODE" miss match records !!!')
    print(missmatchNew_code)

Pass^^ : All "NEW_CODE" records have already correct


### 3.14 DISTRICT

In [39]:
# pd.crosstab(df['DISTRICT'], columns='COUNT')
# df[~df['DISTRICT'].isin(range(1,19))]['EVENT'].unique()
# df[df['EVENT']=='S']
# 15
# df.loc[0, 'DISTRICT'] = 15

In [40]:
# For 'I' and 'O' events DISTRICT must be one of 1..18.
districtOutOfRange = ~df['DISTRICT'].isin(range(1,19))
isEventIorO = df['EVENT'].isin(['I', 'O'])
missmatchDistrict = df.loc[
    districtOutOfRange & isEventIorO,
    ['DATE', 'FEEDER', 'FROM', 'LINE', 'CB', 'GROUP', 'DISTRICT', 'NEW_CODE', 'COMPONENT']]
if missmatchDistrict.empty:
    print('Pass^^ : All "DISTRICT" records have already correct')
else:
    print('Please check "DISTRICT" miss match records !!!')
    print(missmatchDistrict)

Pass^^ : All "DISTRICT" records have already correct


### 3.15 COMPONENT

In [41]:
# df.loc[0, 'COMPONENT'] = 'C22' # 'C22'

In [42]:
# COMPONENT must be empty or exist as a code in the component lookup table.
sql = 'select code from component'
component_code = pd.read_sql(sql, con).iloc[:, 0]
componentUnknown = ~df['COMPONENT'].isin(component_code)
componentFilled = df['COMPONENT'] != ''
missmatchComponent = df.loc[
    componentUnknown & componentFilled,
    ['DATE', 'FEEDER', 'FROM', 'LINE', 'CB', 'GROUP', 'DISTRICT', 'NEW_CODE', 'COMPONENT']]
if missmatchComponent.empty:
    print('Pass^^ : All "COMPONENT" records have already correct')
else:
    print('Please check "COMPONENT" miss match records !!!')
    print(missmatchComponent)

Pass^^ : All "COMPONENT" records have already correct


### 3.16 OLD_DISTRI

In [43]:
# df['OLD_DISTRI'].unique()
missingValueOldDistrictIdx = df[df['OLD_DISTRI'].isna()].index
if len(missingValueOldDistrictIdx) > 0:
    print('Please check "OLD_DISTRI" missing value records !!!')
    print(missingValueOldDistrictIdx)
    df.loc[missingValueOldDistrictIdx, 'OLD_DISTRI'] = 0
    print('They have already corrected^^')
    print(df.loc[missingValueOldDistrictIdx, 'OLD_DISTRI'].value_counts())
else:
    print('Pass^^ : All "OLD_DISTRI" records have already correct')

df['OLD_DISTRI'] = df['OLD_DISTRI'].astype('int', errors='ignore')

Please check "OLD_DISTRI" missing value records !!!
Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            13916, 13917, 13918, 13919, 13920, 13921, 13922, 13923, 13924,
            13925],
           dtype='int64', length=13925)
They have already corrected^^
0.0    13925
Name: OLD_DISTRI, dtype: int64


### 3.17 NEPO_GROUP  

In [44]:
# Missing NEPO_GROUP values are reported and replaced by 0, then the column is
# cast to int (the cast is skipped silently if it cannot be applied).
missingValueNepoGroupIdx = df.index[df['NEPO_GROUP'].isna().values]
if missingValueNepoGroupIdx.empty:
    print('Pass^^ : All "NEPO_GROUP" records have already correct')
else:
    print('Please check "NEPO_GROUP" missing value records !!!')
    print(missingValueNepoGroupIdx)
    df.loc[missingValueNepoGroupIdx, 'NEPO_GROUP'] = 0
    print('They have already corrected^^')
    print(df.loc[missingValueNepoGroupIdx, 'NEPO_GROUP'].value_counts())

df['NEPO_GROUP'] = df['NEPO_GROUP'].astype(int, errors='ignore')

Please check "NEPO_GROUP" missing value records !!!
Int64Index([4979, 4980, 4981, 4982, 4983, 4984, 4985, 4986, 4987, 4988, 4989,
            4990, 4991, 4992, 4993, 4994, 4995, 4996, 4997, 4998, 4999, 5000,
            5001, 5002, 5003, 5004, 5005, 5006, 5007, 5008, 5009, 5010, 5011,
            5012, 5013, 5014, 5015, 5016],
           dtype='int64')
They have already corrected^^
0.0    38
Name: NEPO_GROUP, dtype: int64


### 3.18 KV

In [45]:
# Digit found in the CB name -> kV rating used to fill missing KV values.
kvValue = {2: 12, 4: 24, 6: 69, 7: 115}
kvByDigit = {str(digit): kv for digit, kv in kvValue.items()}

rows = df[df['KV'].isna()][['FEEDER', 'LINE', 'CB', 'KV', 'GROUP']]
rowsIdx = rows.index
print(rows)
for idx, cb in rows['CB'].items():
    # The digit is the 4th character of CB, or the 5th when the 4th is 'C'.
    # Slices (not indexes) are used so that short, empty or missing CB values
    # yield '' here instead of raising IndexError/ValueError/KeyError.
    cb = cb if isinstance(cb, str) else ''
    digit = cb[4:5] if cb[3:4] == 'C' else cb[3:4]
    if digit in kvByDigit:
        df.loc[idx, 'KV'] = kvByDigit[digit]
    else:
        # leave KV missing and tell the user which record needs a manual fix
        print(f'Cannot derive "KV" from "CB" value "{cb}" at index {idx}, please check !!!')

print(df.loc[rowsIdx, ['FEEDER', 'LINE', 'CB', 'KV', 'GROUP']])
df['KV'] = df['KV'].astype('int', errors='ignore')

      FEEDER     LINE       CB  KV GROUP
8136                   BRY7912 NaN     S
10304                   BTR413 NaN     S
10308                   BTR413 NaN     S
10837                   WKS412 NaN     S
10839                    WKS27 NaN     S
13168         CLT-964  CLT6942 NaN     L
      FEEDER     LINE       CB     KV GROUP
8136                   BRY7912  115.0     S
10304                   BTR413   24.0     S
10308                   BTR413   24.0     S
10837                   WKS412   24.0     S
10839                    WKS27   12.0     S
13168         CLT-964  CLT6942   69.0     L


### 3.19 CONTROL

In [46]:
# CONTROL is mandatory: report records where it is empty or missing.
controlIsBlank = df['CONTROL'].fillna('') == ''
missmatchControl = df.loc[
    controlIsBlank,
    ['DATE', 'FEEDER', 'FROM', 'LINE', 'CB', 'GROUP', 'NEW_CODE', 'COMPONENT', 'CONTROL']]
if missmatchControl.empty:
    print('Pass^^ : All "CONTROL" records have already correct')
else:
    print('Please check "CONTROL" miss match records !!!')
    print(missmatchControl)

Pass^^ : All "CONTROL" records have already correct


## 4. Replace all '' with NaN

In [47]:
# Replace every empty or whitespace-only string in the frame with NaN, in place.
# NOTE(review): presumably so that blanks are written to MySQL as NULL - confirm.
df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

## 5. Export and check to MySQL

In [48]:
# Final look at dtypes and non-null counts before exporting.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13926 entries, 0 to 13925
Data columns (total 70 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ID          13926 non-null  int64         
 1   DATE        13926 non-null  datetime64[ns]
 2   DAY         13926 non-null  int64         
 3   MONTH       13926 non-null  int64         
 4   NEW_MONTH   13926 non-null  int64         
 5   NEW_MONTH2  13926 non-null  int64         
 6   YEAR        13926 non-null  int64         
 7   DOW         13926 non-null  int64         
 8   TYPE_DAY    0 non-null      float64       
 9   WEATHER     13926 non-null  object        
 10  TEMPERATUR  13926 non-null  object        
 11  SEASON      0 non-null      float64       
 12  ABB         13922 non-null  object        
 13  FEEDER      13627 non-null  object        
 14  AREA        13926 non-null  object        
 15  NIKOM       13926 non-null  object        
 16  LINE        3243 non-n

In [49]:
# Replace field names
df.columns = pd.read_sql('select * from outage_event_db limit 1', con).columns[1:]
df.columns

Index(['id', 'date', 'day', 'month', 'new_month', 'new_month2', 'year', 'dow',
       'type_day', 'weather', 'temperatur', 'season', 'abb', 'feeder', 'area',
       'nikom', 'line', 'cb', 'type_line', 'abnormal', 'kv', 'unit', 'event',
       'time_from', 'time_to', 'period', 'to_date', 'amp', 'namp', 'to1',
       'to2', 'to3', 'to4', 'amp1', 'amp2', 'amp3', 'amp4', 'tocb1',
       'tocb_date', 'tocb2', 'tocb3', 'tocb4', 'new_code', 'group_type',
       'code', 'type', 'nepo_group', 'old_distri', 'district', 'pole',
       'lateral', 'road', 'relay', 'component', 'device', 'dev_type', 'phase',
       'fault_type', 'times', 'length', 'timeocb', 'timefuse', 'timerepair',
       'kva_hr', 'loss', 'mw', 'bath', 'time_eq', 'major', 'control'],
      dtype='object')

In [50]:
def insertToDB(month, tableName):
    """Append the rows of the global ``df`` for one month to a MySQL table.

    Parameters
    ----------
    month : int
        Value compared against the ``month`` column of ``df``.
    tableName : str
        Target table; rows are appended (``if_exists='append'``) through the
        global engine ``con``.

    Raises
    ------
    Exception
        Whatever ``DataFrame.to_sql`` raises is reported and re-raised.
    """
    print()
    print(f'Insert data to "{tableName}" table')
    try:
        df[df['month'] == month].to_sql(tableName, con, if_exists='append', index=False)
    except Exception as err:
        # Catch Exception only (not KeyboardInterrupt etc.) and show the real
        # cause next to the hint before propagating it.
        print("May be data error then they can't be imported to MySQL !!!")
        print(f'{type(err).__name__}: {err}')
        raise
    print('Success^^')

In [51]:
def checkNumberRecords(month, tableName):
    """Compare the row count of one month in ``df`` with the database table.

    Rows of the global ``df`` whose ``date`` falls in ``yearSelected``/``month``
    are counted and compared with the rows of ``tableName`` for the same
    period. If the counts differ, all rows of that period are deleted from the
    table so that the import can be repeated.

    Parameters
    ----------
    month : int
        Month number to check.
    tableName : str
        Table to check. It is interpolated into the SQL text (identifiers
        cannot be bound as parameters), so it must come from the fixed list of
        table names and never from free user input.
    """
    # Local import keeps this cell self-contained.
    from sqlalchemy import text

    year = int(yearSelected)
    month = int(month)
    rawDataRecords = df[(df['date'].dt.year == year) & (df['date'].dt.month == month)]['date'].count()
    print(f'Raw {yearSelected}/{month} data records : {rawDataRecords}')

    # Year and month are passed as bound parameters, not formatted into the SQL.
    period = {'year': year, 'month': month}
    periodFilter = 'where year(date)=:year and month(date)=:month'
    with con.connect() as conn:
        dbDataRecords = conn.execute(text(f'select count(*) from {tableName} {periodFilter}'), period).scalar()
    print()
    print('Check number of records')
    print(f'{tableName} table {yearSelected}/{month} data records : {dbDataRecords}')
    if rawDataRecords == dbDataRecords:
        print(f'Successfully append to {tableName} ^^')
    else:
        print()
        print(f'Unsuccessfullt append to {tableName}, please resolve errors !!!')
        # begin() commits the delete when the block exits without error
        with con.begin() as conn:
            conn.execute(text(f'delete from {tableName} {periodFilter}'), period)
        print(f'Delete false imported data of {tableName} table {yearSelected}/{month} already !')

In [55]:
# Ask for the target table and the month, then append that month to the table
# and verify the number of stored records.
tableNames = {1: 'outage_event_db', 2: 'outage_event_db_15days'}
try:
    selectTable = int(input(
'''Please type 1 or 2 for selecting table to dump
\t1 to outage_event_db
\t2 to outage_event_db_15days
Select table here : ''') or '0')
except ValueError:
    # non-numeric answer: fall through to the "run this cell again" message
    selectTable = 0
print()
if selectTable in tableNames:
    try:
        selectMonth = int(input('Please select month between 1 to 12\nSelect table here : ') or '0')
    except ValueError:
        selectMonth = 0
    if 1 <= selectMonth <= 12:
        insertToDB(selectMonth, tableNames[selectTable])
        checkNumberRecords(selectMonth, tableNames[selectTable])
        # reset the selections once the dump has been done
        selectTable = None
        selectMonth = None
    else:
        print('Please run this cell again then select month between 1 or 12 !!!')
else:
    print('Please run this cell again then select only 1 or 2 !!!')

Please type 1 or 2 for selecting table to dump
	1 to outage_event_db
	2 to outage_event_db_15days
Select table here : 2

Please select month between 1 to 12
Select table here : 10

Insert data to "outage_event_db_15days" table
Success^^
Raw 2020/10 data records : 1329

Check number of records
outage_event_db_15days table 2020/10 data records : 1329
Successfully append to outage_event_db_15days ^^


## Addition

### Check the latest date stored in outage_event_db or outage_event_db_15days

In [54]:
# Ask for a table and print the most recent date stored in it.
tableNames = {1: 'outage_event_db', 2: 'outage_event_db_15days'}
try:
    selectTable = int(input(
'''Please type 1 or 2 for selecting table to dump
\t1 to outage_event_db
\t2 to outage_event_db_15days
Select table here : ''') or '0')
except ValueError:
    # non-numeric answer: fall through to the "run this cell again" message
    selectTable = 0
print()
if selectTable in tableNames:
    sql = f'select date from {tableNames[selectTable]} order by date desc limit 1'
    lastRows = con.execute(sql).fetchall()
    if lastRows:
        print(f'Lasted date of {tableNames[selectTable]} table (y-m-d) : {lastRows[0][0]}')
    else:
        # an empty table returns no row; report it instead of raising IndexError
        print(f'{tableNames[selectTable]} table has no records yet')
else:
    print('Please run this cell again then select only 1 or 2 !!!')

Please type 1 or 2 for selecting table to dump
	1 to outage_event_db
	2 to outage_event_db_15days
Select table here : 1

Lasted date of outage_event_db table (y-m-d) : 2020-09-30
