# DBF file to MySQL for Year File

## 1. Load DBF to Pandas DataFrame

In [1]:
import os
from datetime import datetime
from getpass import getpass
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
from IPython.display import display, Javascript
print(f'pandas version: {pd.__version__}')

# Use the fully-qualified option key. The short form 'max_columns' is resolved by
# pattern matching and is rejected as soon as it matches more than one option.
pd.set_option('display.max_columns', 500)
# pd.set_option('display.max_rows', 10)

pandas version: 1.1.3


In [2]:
"""
Load content of a DBF file into a Pandas data frame.

The iter() is required because Pandas doesn't detect that the DBF
object is iterable.
"""

from dbfread import DBF

In [3]:
# Ask for the folder and the year; an empty answer falls back to the default shown in the prompt.
setPath = input('Please enter path for year file (default as G:/database/): ').strip() or 'G:/database/'
yearSelected = input('Please insert year (default as this year) : ').strip() or str(datetime.now().year)
# The file name is appended directly to the folder, so make sure the folder ends with a separator.
if not setPath.endswith(('/', '\\')):
    setPath += '/'
print(f'Path : {setPath}')
print(f'Year : {yearSelected}')
fullPath = setPath + 'year'+ yearSelected +'.dbf'
print(f'\nFull path : {fullPath}')

Please enter path for year file (default as G:/database/): 
Please insert year (default as this year) : 2020
Path : G:/database/
Year : 2020

Full path : G:/database/year2020.dbf


In [4]:
# Open the DBF table and hand its records to pandas through an explicit iterator
# (pandas does not detect on its own that the DBF object is iterable).
dbf = DBF(fullPath)
dbf_records = iter(dbf)
df = pd.DataFrame(dbf_records)

In [5]:
# Keep the number of loaded rows; the datetime conversion cells compare against it.
total_records = len(df)
df.tail()

Unnamed: 0,ID,DATE,DAY,MONTH,NEW_MONTH,NEW_MONTH2,YEAR,DOW,TYPE_DAY,WEATHER,TEMPERATUR,SEASON,ABB,FEEDER,AREA,NIKOM,LINE,CB,TYPE_LINE,ABNORMAL,KV,UNIT,EVENT,FROM,TO,PERIOD,TO_DATE,AMP,NAMP,TO1,TO2,TO3,TO4,AMP1,AMP2,AMP3,AMP4,TOCB1,TOCB_DATE,TOCB2,TOCB3,TOCB4,NEW_CODE,GROUP,CODE,TYPE,NEPO_GROUP,OLD_DISTRI,DISTRICT,POLE,LATERAL,ROAD,RELAY,COMPONENT,DEVICE,DEV_TYPE,PHASE,FAULT_TYPE,TIMES,LENGTH,TIMEOCB,TIMEFUSE,TIMEREPAIR,KVA_HR,LOSS,MW,BATH,TIME_EQ,MAJOR,CONTROL
15630,691,2020-12-31,31,12,12,12,2020,5,,N,0.0,,SA,SA-424,C,N,,,,,24.0,D,I,16:07,16:07,,2020-12-31,70,70,16:45,,,,28,0,0,0,,2020-12-31,,,,52,F,80,TF,40.0,,4,,S & P BUILDING,,"OCI-T-YB,EFI-T,RR",C07,,,YB,,,0,0,38,0,518,443,0.0,114.52,15.2,,C
15631,692,2020-12-31,31,12,12,12,2020,5,,N,0.0,,MC,MC-418,C,N,,,,,24.0,D,I,16:38,16:38,,2020-12-31,80,80,17:23,,,,32,0,0,0,,2020-12-31,,,,53,F,80,TF,40.0,,8,,HT.87-060715,,"EFI,RR",C22,,,,,,0,0,45,0,701,599,0.0,154.84,18.0,,C
15632,693,2020-12-31,31,12,12,12,2020,5,,N,0.0,,SM,SM-413,C,N,,,,,24.0,D,I,16:58,16:58,,2020-12-31,83,83,,,,,0,0,0,0,,2020-12-31,,,,51,F,80,TF,40.0,,8,,HT.88-015791,,"OCI-T-YB,RR",C20,,,YB,,,0,0,0,0,0,0,0.0,0.0,0.0,,C
15633,694,2020-12-31,31,12,12,12,2020,5,,N,0.0,,KM,KM-423,C,N,,,,,24.0,C,O,17:26,17:26,,2020-12-31,36,36,,,,,0,0,0,0,,2020-12-31,,,,191,F,181,CO,80.0,,14,,HT.78-089775,,"OCI-T-R,EFI-T,RR",C07,,,R,,,0,0,0,0,0,0,0.0,0.0,0.0,,U
15634,695,2020-12-31,31,12,12,12,2020,5,,N,0.0,,SP,SP-423,C,N,,,,,24.0,D,I,21:42,21:42,,2020-12-31,130,130,22:15,,,,52,0,0,0,,2020-12-31,,,,11,F,20,TF,70.0,,14,,,,"OCI-Y,RR",,,,Y,,,0,0,33,0,835,714,0.0,184.57,13.2,,C


## Import modules for connecting to MySQL

In [6]:
from sqlalchemy import create_engine # conda install sqlalchemy
import pymysql # conda install pymysql

In [7]:
# Report the installed driver version.
print('pymysql version: {}'.format(pymysql.__version__))

pymysql version: 1.0.0


### Create a connection to MySQL

In [8]:
# Connection settings. Each value can be overridden with an environment variable
# (for a local server: MYSQL_UID=root, MYSQL_HOST=localhost).
uid = os.environ.get('MYSQL_UID', 'reliability')
host = os.environ.get('MYSQL_HOST', '10.99.1.36')
port = int(os.environ.get('MYSQL_PORT', 3306))
db = os.environ.get('MYSQL_DB', 'statistics_database')
# The password is never stored in the notebook: it comes from MYSQL_PWD or is asked for interactively.
pwd = os.environ.get('MYSQL_PWD') or getpass(f'MySQL password for {uid}@{host}: ')
# Escape user and password so that characters such as '@' or '/' cannot break the URL.
con_string = f'mysql+pymysql://{quote_plus(uid)}:{quote_plus(pwd)}@{host}:{port}/{db}'
# Print a masked form only, so the password does not end up in the saved cell output.
print(f'connection string = mysql+pymysql://{uid}:***@{host}:{port}/{db}')
con = create_engine(con_string)

connection string = mysql+pymysql://reliability:pcd_db@10.99.1.36:3306/statistics_database


In [9]:
# con.table_names()

## 2. Change the data type of each field

In [10]:
# import Javascript for working with browser
# from IPython.display import Javascript
# Javascript("alert();")

In [11]:
# df.info()

### 2.1 DATE field to datetime

In [12]:
# Expected text form: YYYY-MM-DD. The month and day alternatives are grouped and the pattern is
# anchored; without the groups each '|' split the whole pattern, so any text containing
# e.g. "20" or "31" was accepted.
regex = r'^[12]\d{3}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$'
# List the distinct DATE values that do not have this form.
df[~(df['DATE'].astype(str).str.contains(regex))]['DATE'].unique()

array([], dtype=object)

In [13]:
# Count the values that are missing (or blank) before converting, so that values destroyed by
# the conversion can be told apart from values that were never there.
raw_values = df['DATE']
missing_before = int((raw_values.isna() | (raw_values.astype(str).str.strip() == '')).sum())

df['DATE'] = pd.to_datetime(df['DATE'], errors='coerce')
# df.info()
nat_records = int(df['DATE'].isna().sum())
date_records = int(df['DATE'].notna().sum())
sum_records = nat_records + date_records
print(f'Total records : {total_records}')
print('NaT records : %d' % nat_records)
print('Date records : %d' % date_records)
print('SUM records : %d' % sum_records)

# errors='coerce' turns unparseable values into NaT without raising, and NaT + non-NaT always
# adds up to the current row count. Therefore also require that the conversion created no new NaT.
if sum_records != total_records or nat_records != missing_before:
    js_code = 'alert("Covert DATE filed to datetime problem!!!")'
    display(Javascript(js_code))
    raise SystemExit("Stop right there!")

Total records : 15635
NaT records : 0
Date records : 15635
SUM records : 15635


### 2.2 TO_DATE field to datetime

In [14]:
# Expected text form: YYYY-MM-DD. The month and day alternatives are grouped and the pattern is
# anchored; without the groups each '|' split the whole pattern, so any text containing
# e.g. "20" or "31" was accepted.
regex = r'^[12]\d{3}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$'
# List the distinct TO_DATE values that do not have this form (missing values show up as None).
df[~(df['TO_DATE'].astype(str).str.contains(regex))]['TO_DATE'].unique()

array([None], dtype=object)

In [15]:
# Number of records without a TO_DATE value.
int(df['TO_DATE'].isna().sum())

3449

In [16]:
# Count the values that are missing (or blank) before converting, so that values destroyed by
# the conversion can be told apart from values that were never there.
raw_values = df['TO_DATE']
missing_before = int((raw_values.isna() | (raw_values.astype(str).str.strip() == '')).sum())

df['TO_DATE'] = pd.to_datetime(df['TO_DATE'], errors='coerce')
# df.info()
nat_records = int(df['TO_DATE'].isna().sum())
date_records = int(df['TO_DATE'].notna().sum())
sum_records = nat_records + date_records
print(f'Total records : {total_records}')
print('NaT records : %d' % nat_records)
print('Date records : %d' % date_records)
print('SUM records : %d' % sum_records)

# errors='coerce' turns unparseable values into NaT without raising, and NaT + non-NaT always
# adds up to the current row count. Therefore also require that the conversion created no new NaT.
if sum_records != total_records or nat_records != missing_before:
    js_code = 'alert("Covert TO_DATE filed to datetime problem!!!")'
    display(Javascript(js_code))
    raise SystemExit("Stop right there!")

Total records : 15635
NaT records : 3449
Date records : 12186
SUM records : 15635


### 2.3 TOCB_DATE field to datetime

In [17]:
# Expected text form: YYYY-MM-DD. The month and day alternatives are grouped and the pattern is
# anchored; without the groups each '|' split the whole pattern, so any text containing
# e.g. "20" or "31" was accepted.
regex = r'^[12]\d{3}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$'
# List the distinct TOCB_DATE values that do not have this form (missing values show up as None).
df[~(df['TOCB_DATE'].astype(str).str.contains(regex))]['TOCB_DATE'].unique()

array([None], dtype=object)

In [18]:
# Count the values that are missing (or blank) before converting, so that values destroyed by
# the conversion can be told apart from values that were never there.
raw_values = df['TOCB_DATE']
missing_before = int((raw_values.isna() | (raw_values.astype(str).str.strip() == '')).sum())

df['TOCB_DATE'] = pd.to_datetime(df['TOCB_DATE'], errors='coerce')
# df.info()
nat_records = int(df['TOCB_DATE'].isna().sum())
date_records = int(df['TOCB_DATE'].notna().sum())
sum_records = nat_records + date_records
print(f'Total records : {total_records}')
print('NaT records : %d' % nat_records)
print('Date records : %d' % date_records)
print('SUM records : %d' % sum_records)

# errors='coerce' turns unparseable values into NaT without raising, and NaT + non-NaT always
# adds up to the current row count. Therefore also require that the conversion created no new NaT.
if sum_records != total_records or nat_records != missing_before:
    js_code = 'alert("Covert TOCB_DATE filed to datetime problem!!!")'
    display(Javascript(js_code))
    raise SystemExit("Stop right there!")

Total records : 15635
NaT records : 3459
Date records : 12176
SUM records : 15635


In [19]:
# df.dtypes.value_counts()

## 3. Check mismatched data in each field

In [20]:
# df.info()

### 3.1 Weather

In [21]:
# Any WEATHER value other than 'N' or 'R' is shown and then overwritten with 'N'.
hasKnownWeather = df['WEATHER'].isin(['N', 'R'])
indexList = df[~hasKnownWeather].index
if len(indexList) == 0:
    print('PASS^^ : Weahter data have already complete.')
else:
    print(df.loc[indexList, 'WEATHER'])
    df.loc[indexList, 'WEATHER'] = 'N'
    print(df.loc[indexList, 'WEATHER'])

PASS^^ : Weahter data have already complete.


### 3.2 ABB

In [22]:
# An empty ABB is accepted only on records whose GROUP is 'E'; any other empty ABB stops the run.
isEmptyAbb = df['ABB'] == ''
isNotGroupE = df['GROUP'] != 'E'
if (isEmptyAbb & isNotGroupE).any():
    print('Please check empty ABB records !!!')

    js_code = 'alert("Empty ABB problem!!!\\nPlease check empty ABB records")'
    display(Javascript(js_code))
    raise SystemExit("Stop right there!")
else:
    print('Pass^^ : No empty ABB records for non "E" GROUP')

Pass^^ : No empty ABB records for non "E" GROUP


In [23]:
# %%time
# Check ABB is not in Feeder fields (group "F" records only)
feederRows = df.loc[df['GROUP'].isin(['F']), ['ABB', 'FEEDER']]
# Flag the rows whose ABB text does not occur inside FEEDER. The flags are computed in one pass
# instead of growing a frame with DataFrame.append, which is quadratic in a loop and no longer
# exists in pandas 2.x.
isMissmatch = pd.Series(
    [abb not in feeder for abb, feeder in zip(feederRows['ABB'], feederRows['FEEDER'])],
    index=feederRows.index,
    dtype=bool,
)
missmatchAbbFeeder = feederRows[isMissmatch]

if len(missmatchAbbFeeder) > 0:
    print('Please check group "F" records that "ABB" miss match with "FEEDER" !!!')
    print(missmatchAbbFeeder)
    missmatchAbbFeederIdx = missmatchAbbFeeder.index
    # Replace ABB by the text in front of the first '-' of FEEDER.
    for idx, feeder in missmatchAbbFeeder['FEEDER'].items():
        df.loc[idx, 'ABB'] = feeder.split('-', 1)[0]
    print('They have already corrected^^')
    print(df.loc[missmatchAbbFeederIdx, ['ABB', 'FEEDER']])
else:
    print('Pass^^ : All group "F" records, "ABB" match with "FEEDER"')

Pass^^ : All group "F" records, "ABB" match with "FEEDER"


In [24]:
# Check ABB is not in CB fields (group "S" records only)
stationRows = df.loc[df['GROUP'].isin(['S']), ['ABB', 'FEEDER', 'CB', 'EVENT']]
# 'H' events are compared against CB, every other event against FEEDER.
isHEvent = stationRows['EVENT'] == 'H'
stationHRows = stationRows[isHEvent]
stationFRows = stationRows[~isHEvent]

# The flags are computed in one pass per subset instead of growing frames with DataFrame.append,
# which is quadratic in a loop and no longer exists in pandas 2.x.
isMissmatchH = pd.Series(
    [abb not in cb for abb, cb in zip(stationHRows['ABB'], stationHRows['CB'])],
    index=stationHRows.index,
    dtype=bool,
)
isMissmatchF = pd.Series(
    [abb not in feeder for abb, feeder in zip(stationFRows['ABB'], stationFRows['FEEDER'])],
    index=stationFRows.index,
    dtype=bool,
)
# Columns are sorted by name for the printed report.
missmatchAbbStaionH = stationHRows[isMissmatchH].sort_index(axis=1)  # StationH = Station and 'H' event
missmatchAbbStationF = stationFRows[isMissmatchF].sort_index(axis=1)  # StationF = Station and any other event

if len(missmatchAbbStaionH) > 0:
    print('Please check group "S" records that "ABB" miss match with "CB" !!!')
    print(missmatchAbbStaionH)

    js_code = 'alert("Please check group \\"S\\" records that \\"ABB\\" miss match with \\"CB\\" !!!")'
    display(Javascript(js_code))
    raise SystemExit("Stop right there!")
else:
    print('Pass^^ : All group "S" records, "ABB" match with "CB"')

if len(missmatchAbbStationF) > 0:
    print('Please check group "S" records that "ABB" miss match with "FEEDER" !!!')
    print(missmatchAbbStationF)
    missmatchAbbStationFIdx = missmatchAbbStationF.index
    # Replace ABB by the text in front of the first '-' of FEEDER.
    for idx, feeder in missmatchAbbStationF['FEEDER'].items():
        df.loc[idx, 'ABB'] = feeder.split('-', 1)[0]
    print('They have already corrected^^')
    print(df.loc[missmatchAbbStationFIdx, ['ABB', 'CB', 'EVENT', 'FEEDER']])
else:
    print('Pass^^ : All group "S" records, "ABB" match with "FEEDER"')

Pass^^ : All group "S" records, "ABB" match with "CB"
Pass^^ : All group "S" records, "ABB" match with "FEEDER"


In [25]:
# df.loc[12507:12509, 'ABB'] = 'WPC'

In [26]:
# Check ABB is not in LINE fields (group "L" records only)
lineRows = df.loc[df['GROUP'].isin(['L']), ['ABB', 'FEEDER', 'LINE', 'EVENT']]
# 'H' events are compared against LINE, every other event against FEEDER.
isHEvent = lineRows['EVENT'] == 'H'
lineHRows = lineRows[isHEvent]
lineFRows = lineRows[~isHEvent]

# The flags are computed in one pass per subset instead of growing frames with DataFrame.append,
# which is quadratic in a loop and no longer exists in pandas 2.x.
isMissmatchH = pd.Series(
    [abb not in line for abb, line in zip(lineHRows['ABB'], lineHRows['LINE'])],
    index=lineHRows.index,
    dtype=bool,
)
isMissmatchF = pd.Series(
    [abb not in feeder for abb, feeder in zip(lineFRows['ABB'], lineFRows['FEEDER'])],
    index=lineFRows.index,
    dtype=bool,
)
# Columns are sorted by name for the printed report.
missmatchAbbLineH = lineHRows[isMissmatchH].sort_index(axis=1)  # LineH = Line and 'H' event
missmatchAbbLineF = lineFRows[isMissmatchF].sort_index(axis=1)  # LineF = Line and any other event

if len(missmatchAbbLineH) > 0:
    print('Please check group "L" records that "ABB" miss match with "LINE" !!!')
    print(missmatchAbbLineH)

    js_code = 'alert("Please check group \\"L\\" records that \\"ABB\\" miss match with \\"LINE\\" !!!")'
    display(Javascript(js_code))
    raise SystemExit("Stop right there!")
else:
    print('Pass^^ : All group "L" records, "ABB" match with "LINE"')

if len(missmatchAbbLineF) > 0:
    print('Please check group "L" records that "ABB" miss match with "FEEDER" !!!')
    print(missmatchAbbLineF)
    missmatchAbbLineFIdx = missmatchAbbLineF.index
    # Replace ABB by the text in front of the first '-' of FEEDER.
    for idx, feeder in missmatchAbbLineF['FEEDER'].items():
        df.loc[idx, 'ABB'] = feeder.split('-', 1)[0]
    print('They have already corrected^^')
    print(df.loc[missmatchAbbLineFIdx, ['ABB', 'EVENT', 'FEEDER', 'LINE']])
else:
    print('Pass^^ : All group "L" records, "ABB" match with "FEEDER"')

Pass^^ : All group "L" records, "ABB" match with "LINE"
Please check group "L" records that "ABB" miss match with "FEEDER" !!!
       ABB EVENT   FEEDER     LINE
12509  WPC     I  WKS-411  SKT-695
They have already corrected^^
       ABB EVENT   FEEDER     LINE
12509  WKS     I  WKS-411  SKT-695


### 3.3 FROM

In [27]:
# H:MM or HH:MM on a 24-hour clock. The hour alternatives are grouped so that '^' and '$' apply
# to the whole pattern; ungrouped, '^[0-9]' alone accepted any value that starts with a digit.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
# na=False reports a missing FROM as a mismatch and keeps the mask boolean, so '~' cannot fail.
missmatchFrom = df[~df['FROM'].str.contains(regex, na=False)]['FROM']
if len(missmatchFrom) > 0:
    print('Please check "FROM" miss match records !!!')

    js_code = 'alert("Please check \\"FROM\\" miss match records !!!")'
    display(Javascript(js_code))
    raise SystemExit("Stop right there!")
else:
    print('Pass^^ : All "FROM" records have already correct')

Pass^^ : All "FROM" records have already correct


### 3.4 TO

In [28]:
# H:MM or HH:MM on a 24-hour clock. The hour alternatives are grouped so that '^' and '$' apply
# to the whole pattern; ungrouped, '^[0-9]' alone accepted any value that starts with a digit.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
toValues = df['TO']
# Values that are already missing are skipped, so the cell can be run again after the invalid
# values were replaced by None; na=False keeps the mask boolean for '~'.
isInvalidTo = toValues.notna() & ~toValues.str.contains(regex, na=False)
missmatchTo = toValues[isInvalidTo]
if len(missmatchTo) > 0:
    print('Please check "TO" miss match records !!!')
    missmatchToIdx = missmatchTo.index
    print(missmatchTo)
    df.loc[missmatchToIdx, 'TO'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchToIdx, 'TO'])
else:
    print('Pass^^ : All "TO" records have already correct')

Please check "TO" miss match records !!!
8136    **:**
8137    **:**
8675    **:**
Name: TO, dtype: object
They have already corrected^^
8136    None
8137    None
8675    None
Name: TO, dtype: object


### 3.5 TO1

In [29]:
# H:MM or HH:MM on a 24-hour clock. The hour alternatives are grouped so that '^' and '$' apply
# to the whole pattern; ungrouped, '^[0-9]' alone accepted any value that starts with a digit.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
to1Values = df['TO1']
# Empty strings are not reported (as before). Missing values are skipped as well, so the cell can
# be run again after invalid values were replaced by None; na=False keeps the mask boolean.
isInvalidTo1 = to1Values.notna() & (to1Values != '') & ~to1Values.str.contains(regex, na=False)
missmatchTo1 = to1Values[isInvalidTo1]
if len(missmatchTo1) > 0:
    print('Please check "TO1" miss match records !!!')
    missmatchTo1Idx = missmatchTo1.index
    print(missmatchTo1)
    df.loc[missmatchTo1Idx, 'TO1'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTo1Idx, 'TO1'])
else:
    print('Pass^^ : All "TO1" records have already correct')

Please check "TO1" miss match records !!!
2007      :
9118      :
Name: TO1, dtype: object
They have already corrected^^
2007    None
9118    None
Name: TO1, dtype: object


### 3.6 TO2

In [30]:
# H:MM or HH:MM on a 24-hour clock. The hour alternatives are grouped so that '^' and '$' apply
# to the whole pattern; ungrouped, '^[0-9]' alone accepted any value that starts with a digit.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
to2Values = df['TO2']
# Empty strings are not reported (as before). Missing values are skipped as well, so the cell can
# be run again after invalid values were replaced by None; na=False keeps the mask boolean.
isInvalidTo2 = to2Values.notna() & (to2Values != '') & ~to2Values.str.contains(regex, na=False)
missmatchTo2 = to2Values[isInvalidTo2]
if len(missmatchTo2) > 0:
    print('Please check "TO2" miss match records !!!')
    missmatchTo2Idx = missmatchTo2.index
    print(missmatchTo2)
    df.loc[missmatchTo2Idx, 'TO2'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTo2Idx, 'TO2'])
else:
    print('Pass^^ : All "TO2" records have already correct')

Pass^^ : All "TO2" records have already correct


### 3.7 TO3

In [31]:
# H:MM or HH:MM on a 24-hour clock. The hour alternatives are grouped so that '^' and '$' apply
# to the whole pattern; ungrouped, '^[0-9]' alone accepted any value that starts with a digit.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
to3Values = df['TO3']
# Empty strings are not reported (as before). Missing values are skipped as well, so the cell can
# be run again after invalid values were replaced by None; na=False keeps the mask boolean.
isInvalidTo3 = to3Values.notna() & (to3Values != '') & ~to3Values.str.contains(regex, na=False)
missmatchTo3 = to3Values[isInvalidTo3]
if len(missmatchTo3) > 0:
    print('Please check "TO3" miss match records !!!')
    missmatchTo3Idx = missmatchTo3.index
    print(missmatchTo3)
    df.loc[missmatchTo3Idx, 'TO3'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTo3Idx, 'TO3'])
else:
    print('Pass^^ : All "TO3" records have already correct')

Pass^^ : All "TO3" records have already correct


### 3.8 TO4

In [32]:
# H:MM or HH:MM on a 24-hour clock. The hour alternatives are grouped so that '^' and '$' apply
# to the whole pattern; ungrouped, '^[0-9]' alone accepted any value that starts with a digit.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
to4Values = df['TO4']
# Empty strings are not reported (as before). Missing values are skipped as well, so the cell can
# be run again after invalid values were replaced by None; na=False keeps the mask boolean.
isInvalidTo4 = to4Values.notna() & (to4Values != '') & ~to4Values.str.contains(regex, na=False)
missmatchTo4 = to4Values[isInvalidTo4]
if len(missmatchTo4) > 0:
    print('Please check "TO4" miss match records !!!')
    missmatchTo4Idx = missmatchTo4.index
    print(missmatchTo4)
    df.loc[missmatchTo4Idx, 'TO4'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTo4Idx, 'TO4'])
else:
    print('Pass^^ : All "TO4" records have already correct')

Pass^^ : All "TO4" records have already correct


### 3.9 TOCB1

In [33]:
# H:MM or HH:MM on a 24-hour clock. The hour alternatives are grouped so that '^' and '$' apply
# to the whole pattern; ungrouped, '^[0-9]' alone accepted any value that starts with a digit.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
tocb1Values = df['TOCB1']
# Empty strings are not reported (as before). Missing values are skipped as well, so the cell can
# be run again after invalid values were replaced by None; na=False keeps the mask boolean.
isInvalidTocb1 = tocb1Values.notna() & (tocb1Values != '') & ~tocb1Values.str.contains(regex, na=False)
missmatchTocb1 = tocb1Values[isInvalidTocb1]
if len(missmatchTocb1) > 0:
    print('Please check "TOCB1" miss match records !!!')
    missmatchTocb1Idx = missmatchTocb1.index
    print(missmatchTocb1)
    df.loc[missmatchTocb1Idx, 'TOCB1'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTocb1Idx, 'TOCB1'])
else:
    print('Pass^^ : All "TOCB1" records have already correct')

Please check "TOCB1" miss match records !!!
481          :
523          :
743          :
819          :
879          :
1704         :
1795         :
1968         :
7338     **:**
8095         :
8149         :
8206         :
8492         :
8601         :
8631         :
8827         :
9233         :
9415         :
9421         :
9836         :
9872         :
10210        :
10236        :
10377        :
10699        :
10790        :
10893        :
11185        :
11198    **:**
11199        :
11333        :
11738        :
11964        :
12008    **:**
12127        :
12331        :
12526        :
12694        :
12759        :
12767        :
Name: TOCB1, dtype: object
They have already corrected^^
481      None
523      None
743      None
819      None
879      None
1704     None
1795     None
1968     None
7338     None
8095     None
8149     None
8206     None
8492     None
8601     None
8631     None
8827     None
9233     None
9415     None
9421     None
9836     None
9872     None
10210

### 3.10 TOCB2

In [34]:
# H:MM or HH:MM on a 24-hour clock. The hour alternatives are grouped so that '^' and '$' apply
# to the whole pattern; ungrouped, '^[0-9]' alone accepted any value that starts with a digit.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
tocb2Values = df['TOCB2']
# Empty strings are not reported (as before). Missing values are skipped as well, so the cell can
# be run again after invalid values were replaced by None; na=False keeps the mask boolean.
isInvalidTocb2 = tocb2Values.notna() & (tocb2Values != '') & ~tocb2Values.str.contains(regex, na=False)
missmatchTocb2 = tocb2Values[isInvalidTocb2]
if len(missmatchTocb2) > 0:
    print('Please check "TOCB2" miss match records !!!')
    missmatchTocb2Idx = missmatchTocb2.index
    print(missmatchTocb2)
    df.loc[missmatchTocb2Idx, 'TOCB2'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTocb2Idx, 'TOCB2'])
else:
    print('Pass^^ : All "TOCB2" records have already correct')

Please check "TOCB2" miss match records !!!
9649         :
10504    **:**
11266    **:**
Name: TOCB2, dtype: object
They have already corrected^^
9649     None
10504    None
11266    None
Name: TOCB2, dtype: object


### 3.11 TOCB3

In [35]:
# H:MM or HH:MM on a 24-hour clock. The hour alternatives are grouped so that '^' and '$' apply
# to the whole pattern; ungrouped, '^[0-9]' alone accepted any value that starts with a digit.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
tocb3Values = df['TOCB3']
# Empty strings are not reported (as before). Missing values are skipped as well, so the cell can
# be run again after invalid values were replaced by None; na=False keeps the mask boolean.
isInvalidTocb3 = tocb3Values.notna() & (tocb3Values != '') & ~tocb3Values.str.contains(regex, na=False)
missmatchTocb3 = tocb3Values[isInvalidTocb3]
if len(missmatchTocb3) > 0:
    print('Please check "TOCB3" miss match records !!!')
    missmatchTocb3Idx = missmatchTocb3.index
    print(missmatchTocb3)
    df.loc[missmatchTocb3Idx, 'TOCB3'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTocb3Idx, 'TOCB3'])
else:
    print('Pass^^ : All "TOCB3" records have already correct')

Please check "TOCB3" miss match records !!!
9342     **:**
9394     **:**
11604        :
Name: TOCB3, dtype: object
They have already corrected^^
9342     None
9394     None
11604    None
Name: TOCB3, dtype: object


### 3.12 TOCB4

In [36]:
# H:MM or HH:MM on a 24-hour clock. The hour alternatives are grouped so that '^' and '$' apply
# to the whole pattern; ungrouped, '^[0-9]' alone accepted any value that starts with a digit.
regex = r'^(?:[01]?[0-9]|2[0-3]):[0-5][0-9]$'
tocb4Values = df['TOCB4']
# Empty strings are not reported (as before). Missing values are skipped as well, so the cell can
# be run again after invalid values were replaced by None; na=False keeps the mask boolean.
isInvalidTocb4 = tocb4Values.notna() & (tocb4Values != '') & ~tocb4Values.str.contains(regex, na=False)
missmatchTocb4 = tocb4Values[isInvalidTocb4]
if len(missmatchTocb4) > 0:
    print('Please check "TOCB4" miss match records !!!')
    missmatchTocb4Idx = missmatchTocb4.index
    print(missmatchTocb4)
    df.loc[missmatchTocb4Idx, 'TOCB4'] = None
    print('They have already corrected^^')
    print(df.loc[missmatchTocb4Idx, 'TOCB4'])
else:
    print('Pass^^ : All "TOCB4" records have already correct')

Please check "TOCB4" miss match records !!!
2168     **:**
2638     **:**
3370     **:**
10655    **:**
Name: TOCB4, dtype: object
They have already corrected^^
2168     None
2638     None
3370     None
10655    None
Name: TOCB4, dtype: object


### 3.13 NEW_CODE

In [37]:
# con.table_names()
# df[0:1] -> new_code = '053'
# df.loc[0, 'NEW_CODE'] = '053'

In [38]:
# The accepted NEW_CODE values are read from column sub_code of table nw_cause.
sql = 'select sub_code from nw_cause'
nw_cause_sub_code = pd.read_sql(sql, con).iloc[:, 0].values
reportColumns = ['DATE', 'FEEDER', 'FROM', 'LINE', 'CB', 'GROUP', 'DISTRICT', 'NEW_CODE', 'COMPONENT']
isUnknownNewCode = ~df['NEW_CODE'].isin(nw_cause_sub_code)
missmatchNew_code = df.loc[isUnknownNewCode, reportColumns]
if len(missmatchNew_code) == 0:
    print('Pass^^ : All "NEW_CODE" records have already correct')
else:
    # Unknown codes are not corrected automatically: show them and stop the run.
    print('Please check "NEW_CODE" miss match records !!!')
    print(missmatchNew_code)

    js_code = 'alert("Please check \\"NEW_CODE\\" miss match records !!!")'
    display(Javascript(js_code))
    raise SystemExit("Stop right there!")

Pass^^ : All "NEW_CODE" records have already correct


### 3.14 DISTRICT

In [39]:
# pd.crosstab(df['DISTRICT'], columns='COUNT')
# df[~df['DISTRICT'].isin(range(1,19))]['EVENT'].unique()
# df[df['EVENT']=='S']
# 15
# df.loc[0, 'DISTRICT'] = 15

In [40]:
# Records with EVENT 'I' or 'O' must carry a DISTRICT between 1 and 18.
isEventIO = df['EVENT'].isin(['I', 'O'])
hasValidDistrict = df['DISTRICT'].isin(range(1, 19))
reportColumns = ['DATE', 'FEEDER', 'FROM', 'LINE', 'CB', 'GROUP', 'DISTRICT', 'NEW_CODE', 'COMPONENT']
missmatchDistrict = df.loc[~hasValidDistrict & isEventIO, reportColumns]
if len(missmatchDistrict) == 0:
    print('Pass^^ : All "DISTRICT" records have already correct')
else:
    # Invalid districts are not corrected automatically: show them and stop the run.
    print('Please check "DISTRICT" miss match records !!!')
    print(missmatchDistrict)

    js_code = 'alert("Please check \\"DISTRICT\\" miss match records !!!")'
    display(Javascript(js_code))
    raise SystemExit("Stop right there!")

Pass^^ : All "DISTRICT" records have already correct


### 3.15 COMPONENT

In [41]:
# df.loc[0, 'COMPONENT'] = 'C22' # 'C22'

In [42]:
# The accepted COMPONENT values are read from column code of table component;
# an empty COMPONENT is accepted as well.
sql = 'select code from component'
component_code = pd.read_sql(sql, con).iloc[:, 0]
isKnownComponent = df['COMPONENT'].isin(component_code)
isBlankComponent = df['COMPONENT'] == ''
reportColumns = ['DATE', 'FEEDER', 'FROM', 'LINE', 'CB', 'GROUP', 'DISTRICT', 'NEW_CODE', 'COMPONENT']
missmatchComponent = df.loc[~isKnownComponent & ~isBlankComponent, reportColumns]
if len(missmatchComponent) == 0:
    print('Pass^^ : All "COMPONENT" records have already correct')
else:
    # Unknown components are not corrected automatically: show them and stop the run.
    print('Please check "COMPONENT" miss match records !!!')
    print(missmatchComponent)

    js_code = 'alert("Please check \\"COMPONENT\\" miss match records !!!")'
    display(Javascript(js_code))
    raise SystemExit("Stop right there!")

Pass^^ : All "COMPONENT" records have already correct


In [43]:
# component_code

### 3.16 OLD_DISTRI

In [44]:
# Records without an OLD_DISTRI value get 0; afterwards the column is cast to int
# (errors='ignore' leaves the column unchanged if the cast is not possible).
hasNoOldDistrict = df['OLD_DISTRI'].isna()
missingValueOldDistrictIdx = df[hasNoOldDistrict].index
if len(missingValueOldDistrictIdx) == 0:
    print('Pass^^ : All "OLD_DISTRI" records have already correct')
else:
    print('Please check "OLD_DISTRI" missing value records !!!')
    print(missingValueOldDistrictIdx)
    df.loc[missingValueOldDistrictIdx, 'OLD_DISTRI'] = 0
    print('They have already corrected^^')
    print(df.loc[missingValueOldDistrictIdx, 'OLD_DISTRI'].value_counts())

df['OLD_DISTRI'] = df['OLD_DISTRI'].astype('int', errors='ignore')

Please check "OLD_DISTRI" missing value records !!!
Int64Index([    0,     1,     2,     3,     4,     5,     6,     7,     8,
                9,
            ...
            15625, 15626, 15627, 15628, 15629, 15630, 15631, 15632, 15633,
            15634],
           dtype='int64', length=15634)
They have already corrected^^
0.0    15634
Name: OLD_DISTRI, dtype: int64


### 3.17 NEPO_GROUP  

In [45]:
# Records without a NEPO_GROUP value get 0; afterwards the column is cast to int
# (errors='ignore' leaves the column unchanged if the cast is not possible).
hasNoNepoGroup = df['NEPO_GROUP'].isna()
missingValueNepoGroupIdx = df[hasNoNepoGroup].index
if len(missingValueNepoGroupIdx) == 0:
    print('Pass^^ : All "NEPO_GROUP" records have already correct')
else:
    print('Please check "NEPO_GROUP" missing value records !!!')
    print(missingValueNepoGroupIdx)
    df.loc[missingValueNepoGroupIdx, 'NEPO_GROUP'] = 0
    print('They have already corrected^^')
    print(df.loc[missingValueNepoGroupIdx, 'NEPO_GROUP'].value_counts())

df['NEPO_GROUP'] = df['NEPO_GROUP'].astype('int', errors='ignore')

Please check "NEPO_GROUP" missing value records !!!
Int64Index([4979, 4980, 4981, 4982, 4983, 4984, 4985, 4986, 4987, 4988, 4989,
            4990, 4991, 4992, 4993, 4994, 4995, 4996, 4997, 4998, 4999, 5000,
            5001, 5002, 5003, 5004, 5005, 5006, 5007, 5008, 5009, 5010, 5011,
            5012, 5013, 5014, 5015, 5016],
           dtype='int64')
They have already corrected^^
0.0    38
Name: NEPO_GROUP, dtype: int64


### 3.18 KV

In [46]:
# Lookup from a digit taken out of the CB name to the KV value written into the record.
kvValue = {2: 12, 4: 24, 6: 69, 7: 115}

hasNoKv = df['KV'].isna()
rows = df.loc[hasNoKv, ['FEEDER', 'LINE', 'CB', 'KV', 'GROUP']]
rowsIdx = rows.index
print(rows)
for idx, cbName in rows['CB'].items():
    # The digit is the 4th character of CB, or the 5th one when the 4th character is 'C'.
    levelChar = cbName[4] if cbName[3] == 'C' else cbName[3]
    df.loc[idx, 'KV'] = kvValue[int(levelChar)]

print(df.loc[rowsIdx, ['FEEDER', 'LINE', 'CB', 'KV', 'GROUP']])
df['KV'] = df['KV'].astype('int', errors='ignore')

      FEEDER     LINE       CB  KV GROUP
8136                   BRY7912 NaN     S
10304                   BTR413 NaN     S
10308                   BTR413 NaN     S
10837                   WKS412 NaN     S
10839                    WKS27 NaN     S
13168         CLT-964  CLT6942 NaN     L
15624                   RTR414 NaN     S
      FEEDER     LINE       CB     KV GROUP
8136                   BRY7912  115.0     S
10304                   BTR413   24.0     S
10308                   BTR413   24.0     S
10837                   WKS412   24.0     S
10839                    WKS27   12.0     S
13168         CLT-964  CLT6942   69.0     L
15624                   RTR414   24.0     S


### 3.19 CONTROL

In [47]:
# CONTROL must be filled in on every record: an empty string or a missing value stops the run.
isBlankControl = df['CONTROL'] == ''
isMissingControl = df['CONTROL'].isna()
reportColumns = ['DATE', 'FEEDER', 'FROM', 'LINE', 'CB', 'GROUP', 'NEW_CODE', 'COMPONENT', 'CONTROL']
missmatchControl = df.loc[isBlankControl | isMissingControl, reportColumns]
if len(missmatchControl) == 0:
    print('Pass^^ : All "CONTROL" records have already correct')
else:
    print('Please check "CONTROL" miss match records !!!')
    print(missmatchControl)

    js_code = 'alert("Please check \\"CONTROL\\" miss match records !!!")'
    display(Javascript(js_code))
    raise SystemExit("Stop right there!")

Pass^^ : All "CONTROL" records have already correct


In [48]:
# Display the frame after the checks above; the rendered output shortens it to the first and last rows.
df

Unnamed: 0,ID,DATE,DAY,MONTH,NEW_MONTH,NEW_MONTH2,YEAR,DOW,TYPE_DAY,WEATHER,TEMPERATUR,SEASON,ABB,FEEDER,AREA,NIKOM,LINE,CB,TYPE_LINE,ABNORMAL,KV,UNIT,EVENT,FROM,TO,PERIOD,TO_DATE,AMP,NAMP,TO1,TO2,TO3,TO4,AMP1,AMP2,AMP3,AMP4,TOCB1,TOCB_DATE,TOCB2,TOCB3,TOCB4,NEW_CODE,GROUP,CODE,TYPE,NEPO_GROUP,OLD_DISTRI,DISTRICT,POLE,LATERAL,ROAD,RELAY,COMPONENT,DEVICE,DEV_TYPE,PHASE,FAULT_TYPE,TIMES,LENGTH,TIMEOCB,TIMEFUSE,TIMEREPAIR,KVA_HR,LOSS,MW,BATH,TIME_EQ,MAJOR,CONTROL
0,1,2020-01-01,1,1,1,1,2020,4,,N,0.0,,BBO,BBO-411,S,N,,,,,24,D,I,00:07,00:07,,2020-01-01,90,90,01:39,,,,36,0,0,0,,2020-01-01,,,,053,F,080,TF,40,0,15,,,TALINGCHAN - SUPHAN RD.,"OCI-T-Y,EFI-T,RR",C22,,,Y,,,0,0,92,0,1612,1379,0.00,356.47,36.8,,C
1,2,2020-01-01,1,1,1,1,2020,4,,N,0.0,,BI,BI-424,C,N,,,,,24,D,O,05:44,06:13,,2020-01-01,160,160,06:17,,,,64,0,0,0,05:45,2020-01-01,06:01,,,063,F,090,SF,40,0,9,,BI424-22H,,"OCI-T-R,EFI-T,RR",C14,,,R,,,0,29,4,0,4642,3969,0.00,1025.99,30.6,,U
2,3,2020-01-01,1,1,1,1,2020,4,,N,0.0,,KE,KE-418,C,N,,,,,24,D,I,06:13,06:13,,2020-01-01,80,80,,,,,0,0,0,0,,2020-01-01,,,,011,F,020,TF,70,0,6,,,,"OCI-R,RR",,,,R,,,0,0,0,0,0,0,0.00,0.00,0.0,,C
3,4,2020-01-01,1,1,1,1,2020,4,,N,0.0,,KM,KM-427,C,N,,,,,24,D,I,06:35,06:35,,2020-01-01,115,115,,,,,0,0,0,0,,2020-01-01,,,,052,F,080,TF,40,0,6,,,NAWONG PHATTHANA RD.,"OCI-Y,EFI,RR",C07,,,Y,,,0,0,0,0,0,0,0.00,0.00,0.0,,C
4,5,2020-01-01,1,1,1,1,2020,4,,N,0.0,,RD,RD-412,C,N,,,,,24,D,I,07:33,07:33,,2020-01-01,90,90,,,,,0,0,0,0,,2020-01-01,,,,011,F,020,TF,70,0,8,,,,"OCI-R,RR",,,,R,,,0,0,0,0,0,0,0.00,0.00,0.0,,C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15630,691,2020-12-31,31,12,12,12,2020,5,,N,0.0,,SA,SA-424,C,N,,,,,24,D,I,16:07,16:07,,2020-12-31,70,70,16:45,,,,28,0,0,0,,2020-12-31,,,,052,F,080,TF,40,0,4,,S & P BUILDING,,"OCI-T-YB,EFI-T,RR",C07,,,YB,,,0,0,38,0,518,443,0.00,114.52,15.2,,C
15631,692,2020-12-31,31,12,12,12,2020,5,,N,0.0,,MC,MC-418,C,N,,,,,24,D,I,16:38,16:38,,2020-12-31,80,80,17:23,,,,32,0,0,0,,2020-12-31,,,,053,F,080,TF,40,0,8,,HT.87-060715,,"EFI,RR",C22,,,,,,0,0,45,0,701,599,0.00,154.84,18.0,,C
15632,693,2020-12-31,31,12,12,12,2020,5,,N,0.0,,SM,SM-413,C,N,,,,,24,D,I,16:58,16:58,,2020-12-31,83,83,,,,,0,0,0,0,,2020-12-31,,,,051,F,080,TF,40,0,8,,HT.88-015791,,"OCI-T-YB,RR",C20,,,YB,,,0,0,0,0,0,0,0.00,0.00,0.0,,C
15633,694,2020-12-31,31,12,12,12,2020,5,,N,0.0,,KM,KM-423,C,N,,,,,24,C,O,17:26,17:26,,2020-12-31,36,36,,,,,0,0,0,0,,2020-12-31,,,,191,F,181,CO,80,0,14,,HT.78-089775,,"OCI-T-R,EFI-T,RR",C07,,,R,,,0,0,0,0,0,0,0.00,0.00,0.0,,U


In [51]:
# Per-column summary of the frame: non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15635 entries, 0 to 15634
Data columns (total 70 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   ID          15635 non-null  int64         
 1   DATE        15635 non-null  datetime64[ns]
 2   DAY         15635 non-null  int64         
 3   MONTH       15635 non-null  int64         
 4   NEW_MONTH   15635 non-null  int64         
 5   NEW_MONTH2  15635 non-null  int64         
 6   YEAR        15635 non-null  int64         
 7   DOW         15635 non-null  int64         
 8   TYPE_DAY    15635 non-null  object        
 9   WEATHER     15635 non-null  object        
 10  TEMPERATUR  15635 non-null  object        
 11  SEASON      15635 non-null  object        
 12  ABB         15635 non-null  object        
 13  FEEDER      15635 non-null  object        
 14  AREA        15635 non-null  object        
 15  NIKOM       15635 non-null  object        
 16  LINE        15635 non-

In [54]:
# Name the export after the year chosen at the top of the notebook instead of a fixed "2020",
# so that processing another year does not overwrite the 2020 file.
df.to_csv(rf'E:\Python\DbfToMySql\year{yearSelected}.csv', index=False)