### Code to preprocess, clean and export the data into database table - tbl_pbr_data

In [1]:
import numpy as np
import pandas as pd
import os
import re
from datetime import date
import sqlalchemy
import sys
import mariadb
import pymysql

import warnings    # to avoid warning during executions
warnings.filterwarnings("ignore")

### Listing PBR files & Set file Name

In [2]:
# load all the file names into a list (files in the pbr data folder)
os.getcwd()
_dir_path = '../../../dataExport/PBRdata'
raw_data = os.listdir(_dir_path)
raw_data

['_10_FPC23_Rhosa.csv',
 '_11_FPC24_SkeMa_1.csv',
 '_12_FPC24_SkeMa_2.csv',
 '_13_FPC11_Rhosa_1.csv',
 '_14_FPC12_DiaLut.csv',
 '_15_FPC22_Thalaps.csv',
 '_16_FPC23_NaCho.csv',
 '_1_FPC13_ChCal_1.csv',
 '_2_FPC13_ChCal_2.csv',
 '_3_FPC13_ThalaPs.csv',
 '_4_FPC14_chamu.csv',
 '_5_FPC14_RhoSa.csv',
 '_6_FPC14_thalaps.csv',
 '_7_FPC21_chamu.csv',
 '_8_FPC21_Techu.csv',
 '_9_FPC22_DiaLut.csv']

In [3]:
filename = raw_data[0]
filename

'_10_FPC23_Rhosa.csv'

### Import Unit number and Species ID from from Database

In [4]:
# database connection with mariaDB for read the read and write data from and to dataframe <----> mariaDB tables
database_username = 'root'
database_password = 'password'
database_ip       = '127.0.0.1:3306'
database_name     = 'data_dashboard'
database_connection = sqlalchemy.create_engine('mariadb+mariadbconnector://{0}:{1}@{2}/{3}'.
                                               format(database_username, database_password, 
                                                      database_ip, database_name))

In [5]:
# fetch data from tbl_pbr_lookup to data frame
tbl_lookup = pd.read_sql('SELECT * FROM tbl_pbr_lookup', database_connection)
tbl_lookup.tail(3)

Unnamed: 0,instance,unit,species,start_date,end_date,remark,raw_file_name
9,10,5,7,1616427000,1626690600,,_10_FPC23_Rhosa.csv
10,11,7,8,1620129600,1633964395,,_11_FPC24_SkeMa_1.csv
11,12,7,8,1635246000,1641567595,,_12_FPC24_SkeMa_2.csv


### Set - Trial No, Unit ID and Species id

In [6]:
species_row = tbl_lookup.query('raw_file_name == @filename')
species_id = int(species_row['species'])
trial_no = int(species_row['instance'])
unit_id = int(species_row['unit'])
species_row

Unnamed: 0,instance,unit,species,start_date,end_date,remark,raw_file_name
9,10,5,7,1616427000,1626690600,,_10_FPC23_Rhosa.csv


In [7]:
print("Species ID : ", species_id)
print("Instance No : ", trial_no)
print("Unit No : ", unit_id)

Species ID :  7
Instance No :  10
Unit No :  5


### Load the raw data into a dataframe

In [8]:
# join file name with path - file name can be specified as list item based on required file name
file_path=  os.path.join(_dir_path, filename)

In [9]:
# load the selected file into the data frame.
pbr_rawDF = pd.read_csv(file_path, sep=',')
pbr_rawDF.head(5)

Unnamed: 0,primary_key,_timeString,FPC23,FT1,FT2,PT1_V4,PT2,FT3,CO2in,CO2_V2,...,QT,QC,QB,QR,QG,QT_corr,TT0,TT1,pH,C_PAR
0,1616427000,2021-3-22 16:30:0,FPC23,6.2414,19.5991,98.8617,136.6463,0.0,0.04,0.1343,...,69.0354,29.563,11.364,6.7134,10.4412,432.2674,21.4094,20.6272,14.803,0.0
1,1616427005,2021-3-22 16:30:5,FPC23,0.0,19.5991,98.9197,136.6463,0.0,0.04,0.1343,...,69.0192,29.5248,11.3483,6.7065,10.4248,432.2674,21.4042,20.6202,14.8002,0.0
2,1616427010,2021-3-22 16:30:10,FPC23,8.4852,19.5818,98.8602,136.5891,0.0,0.04,0.1343,...,69.0068,29.4984,11.3384,6.7015,10.414,432.2674,21.4102,20.5971,14.7837,0.0
3,1616427015,2021-3-22 16:30:15,FPC23,11.2677,19.5982,98.786,136.5615,0.0,0.04,0.1343,...,69.0068,29.4843,11.3325,6.6975,10.4083,432.2674,21.4102,20.5999,14.7966,0.0
4,1616427020,2021-3-22 16:30:20,FPC23,11.4072,19.5787,98.8449,136.4703,0.0,0.04,0.1343,...,68.9968,29.4769,11.3294,6.6953,10.4053,432.2674,21.4136,20.6341,14.7997,0.0


In [19]:
pbr_raw_tmp1 = pbr_rawDF.copy()

#### Data preprocessing function (chain programming)

In [165]:
def pbr_preProcess(pbr_rawDF):
    pbr_raw_tmp1 = pbr_rawDF.copy()
    pbr_raw_tmp1.insert(loc=0, column='trial_no', value=trial_no)
    pbr_raw_tmp1.insert(loc=4, column='species', value=species_id)      
    return(
    pbr_raw_tmp1
    #.assign(trial_no = trail_no)
    .rename(columns={ pbr_raw_tmp1.columns[3]: "unit" })
    .assign(unit = unit_id)
    .replace('Empty', pd.np.nan)
    .replace([np.inf, -np.inf], pd.np.nan)
    .astype({'primary_key':'int64', '_timeString':'datetime64[ns]','unit': 'int16','FT1':'float32', 'FT2':'float32', 'PT1_V4':'float32', 'PT2':'float32', 'FT3':'float32', 'CO2in':'float32', 'CO2_V2':'float32', 'CO2sys':'float32', 'LT1':'float32', 'LReactor':'float32', 'C_g':'float32', 'C_Eff':'float32', 'C_CD':'float32', 'T_CD':'float32', 'C_D_g':'float32', 'C_D_Eff':'float32', 'QT':'float32', 'QC':'float32', 'QB':'float32', 'QR':'float32', 'QG':'float32', 'QT_corr':'float32', 'TT0':'float32', 'TT1':'float32', 'pH':'float32', 'C_PAR':'float32'})
    )
    #.astype({'primary_key':'int64', '_timeString':'datetime64[ns]','unit': 'int16','FT1':'float32', 'FT2':'float32', 'PT1_V4':'float32', 'PT2':'float32', 'FT3':'float32', 'CO2in':'float32', 'CO2_V2':'float32', 'CO2sys':'float32', 'LT1':'float32', 'LReactor':'float32', 'C_g':'float32', 'C_Eff':'float32', 'C_CD':'float32', 'T_CD':'float32', 'C_D_g':'float32', 'C_D_Eff':'float32', 'QT':'float32', 'QC':'float32', 'QB':'float32', 'QR':'float32', 'QG':'float32', 'QT_corr':'float32', 'TT0':'float32', 'TT1':'float32', 'pH':'float32', 'C_PAR':'float32'})

pbr_cleaned = pbr_preProcess(pbr_rawDF)

In [166]:
pbr_cleaned_tmp= pbr_cleaned.copy()
pbr_columns = ['FT1', 'FT2', 'PT1_V4', 'PT2', 'FT3', 'CO2in', 'CO2_V2', 'CO2sys', 'LT1', 'LReactor', 'C_g', 'C_Eff', 'C_CD', 'T_CD', 'C_D_g', 'C_D_Eff', 'QT', 'QC', 'QB', 'QR', 'QG', 'QT_corr', 'TT0', 'TT1', 'pH', 'C_PAR']

In [167]:
for pbr_col in pbr_columns:
    print(pbr_col)
    pbr_cleaned_tmp[pbr_col]=pbr_cleaned_tmp[pbr_col].interpolate(method='linear')


#pbr_cleaned[pbr_columns]=pbr_cleaned[pbr_columns].fillna(df.mode().iloc[0])

FT1
FT2
PT1_V4
PT2
FT3
CO2in
CO2_V2
CO2sys
LT1
LReactor
C_g
C_Eff
C_CD
T_CD
C_D_g
C_D_Eff
QT
QC
QB
QR
QG
QT_corr
TT0
TT1
pH
C_PAR


In [168]:
pbr_cleaned_tmp.isna().sum()

trial_no       0
primary_key    0
_timeString    0
unit           0
species        0
FT1            0
FT2            0
PT1_V4         0
PT2            0
FT3            0
CO2in          0
CO2_V2         0
CO2sys         0
LT1            0
LReactor       0
C_g            0
C_Eff          0
C_CD           0
T_CD           0
C_D_g          0
C_D_Eff        0
QT             0
QC             0
QB             0
QR             0
QG             0
QT_corr        0
TT0            0
TT1            0
pH             0
C_PAR          0
dtype: int64

In [171]:
#pbr_cleaned_tmp['QC'].value_counts(dropna= False).head(50)

ds = pbr_cleaned_tmp.isin([np.inf, -np.inf]).values.sum()



In [172]:
ds

31112

In [173]:
count = np.isinf(pbr_cleaned_tmp).values.sum()
count

31112

In [174]:
print("printing column name where infinity is present")
col_name = pbr_cleaned_tmp.columns.to_series()[np.isinf(pbr_cleaned_tmp).any()]
print(col_name)

printing column name where infinity is present
C_D_g        C_D_g
C_D_Eff    C_D_Eff
dtype: object


In [175]:
# counting infinity in a particular column name
c = np.isinf(pbr_cleaned_tmp['C_D_g']).values.sum()
c

15737

In [176]:
# counting infinity in a particular column name
c = np.isinf(pbr_cleaned_tmp['C_D_Eff']).values.sum()
c

15375

In [177]:
r = pbr_cleaned_tmp.index[np.isinf(pbr_cleaned_tmp).any(1)]
print(r)

Int64Index([ 841000,  841001,  841002,  841003,  841004,  841005,  841006,
             841007,  841008,  841009,
            ...
            1843551, 1843552, 1843553, 1843554, 1843555, 1843556, 1843557,
            1843558, 1843559, 1843560],
           dtype='int64', length=15737)


In [161]:
pbr_cleaned_tmp['C_D_Eff'].iloc[391940:391949]

391940    2.944051
391941    2.914902
391942    2.885753
391943    2.856604
391944    2.827455
391945    2.798306
391946    2.769157
391947    2.740008
391948    2.710859
Name: C_D_Eff, dtype: float32

In [162]:
pbr_cleaned['C_D_Eff'].iloc[391940:391949]

391940   NaN
391941   NaN
391942   NaN
391943   NaN
391944   NaN
391945   NaN
391946   NaN
391947   NaN
391948   NaN
Name: C_D_Eff, dtype: float32

In [163]:
pbr_rawDF['C_D_Eff'].iloc[391940:391949]

391940    inf
391941    inf
391942    inf
391943    inf
391944    inf
391945    NaN
391946    inf
391947    inf
391948    inf
Name: C_D_Eff, dtype: object

In [None]:
inf, -nan



In [164]:
pbr_cleaned_tmp.isin([np.inf]).sum(axis=0)

trial_no           0
primary_key        0
_timeString        0
unit               0
species            0
FT1                0
FT2                0
PT1_V4             0
PT2                0
FT3                0
CO2in              0
CO2_V2             0
CO2sys             0
LT1                0
LReactor           0
C_g                0
C_Eff              0
C_CD               0
T_CD               0
C_D_g          15737
C_D_Eff        15375
QT                 0
QC                 0
QB                 0
QR                 0
QG                 0
QT_corr            0
TT0                0
TT1                0
pH                 0
C_PAR              0
dtype: int64

In [143]:
pbr_cleaned_tmp['C_D_Eff'].value_counts(bins=5)

ValueError: cannot specify integer `bins` when input data contains infinity

### Export Cleaned Data into DB

In [105]:
connection = pymysql.connect(host='localhost',
                            user='root',
                            password='password',
                            db='data_dashboard')

# create cursor
cursor=connection.cursor()

In [109]:
cols = "`,`".join([str(i) for i in pbr_cleaned.columns.tolist()])

In [110]:
for i,row in pbr_cleaned_tmp.iterrows():
    sql = "INSERT INTO `tbl_pbr_data` (`" +cols + "`) VALUES (" + "%s,"*(len(row)-1) + "%s)"
    cursor.execute(sql, tuple(row))
    # the connection is not autocommitted by default, so we must commit to save our changes
    #connection.commit()

ProgrammingError: inf can not be used with MySQL

## Appendix

In [None]:
'''
# set instance number from the file name
trail_no = re.split("_", filename)
print(trail_no)
trail_no = int(trail_no[1])
print(trail_no)

'''

In [None]:
'''
unit_ID = tbl_unit.query('unit_name == "FPC13"')
unit_no = unit_ID['unit_id']
unit_no

'''