### Code to preprocess, clean and export the data into database table - tbl_pbr_data

In [2]:
import numpy as np
import pandas as pd
import os
import re
from datetime import date
import sqlalchemy
import sys
import mariadb
import pymysql

# NOTE(review): the "ignore" filter below silences every warning category for the whole
# session, including deprecation and dtype warnings raised by the libraries used later —
# consider narrowing it to the specific warnings that are known to be harmless.
import warnings    # used only to install the global warning filter
warnings.filterwarnings("ignore")

### Listing PBR files & Set file Name

In [24]:
# Collect the raw PBR export files found in the data folder.
# os.listdir() returns entries in arbitrary order, so the names are sorted to keep
# the content of raw_data stable between runs and machines; entries that are not
# .csv files (e.g. hidden or temporary files) are skipped.
_dir_path = '../../../dataExport/PBRdata'
raw_data = sorted(
    name for name in os.listdir(_dir_path) if name.lower().endswith('.csv')
)
raw_data

['_10_FPC23_Rhosa.csv',
 '_11_FPC24_SkeMa_1.csv',
 '_12_FPC24_SkeMa_2.csv',
 '_1_FPC13_ChCal_1.csv',
 '_2_FPC13_ChCal_2.csv',
 '_3_FPC13_ThalaPs.csv',
 '_4_FPC14_chamu.csv',
 '_5_FPC14_RhoSa.csv',
 '_6_FPC14_thalaps.csv',
 '_7_FPC21_chamu.csv',
 '_8_FPC21_Techu.csv',
 '_9_FPC22_DiaLut.csv']

In [27]:
# Select the raw file by name instead of by list position: a positional index silently
# points at a different file when the folder content or the listing order changes.
filename = '_1_FPC13_ChCal_1.csv'
if filename not in raw_data:
    raise FileNotFoundError(f"{filename!r} was not found in {_dir_path!r}")
filename

'_1_FPC13_ChCal_1.csv'

### Import Unit number and Species ID from the database

In [6]:
# Database connection to MariaDB, used to read tables into dataframes and to write
# dataframes back to tables.
# Connection settings can be overridden through environment variables so that real
# credentials do not have to be committed with the notebook; the fallbacks are the
# previous local-development values.
database_username = os.environ.get('PBR_DB_USER', 'root')
database_password = os.environ.get('PBR_DB_PASSWORD', 'password')
database_ip       = os.environ.get('PBR_DB_HOST', '127.0.0.1:3306')   # "host" or "host:port"
database_name     = os.environ.get('PBR_DB_NAME', 'data_dashboard')

# Split "host:port" because URL.create() takes the two parts separately.
_db_host, _, _db_port = database_ip.partition(':')

# Build the URL with URL.create() instead of string formatting: it escapes special
# characters (e.g. '@', ':' or '/') in the user name and password correctly.
database_connection = sqlalchemy.create_engine(
    sqlalchemy.engine.URL.create(
        drivername='mariadb+mariadbconnector',
        username=database_username,
        password=database_password,
        host=_db_host,
        port=int(_db_port) if _db_port else None,
        database=database_name,
    )
)

In [8]:
# Read the complete tbl_pbr_lookup table into a dataframe and preview its first row.
lookup_query = 'SELECT * FROM tbl_pbr_lookup'
tbl_lookup = pd.read_sql(lookup_query, con=database_connection)
tbl_lookup.head(1)

Unnamed: 0,instance,unit,species,start_date,end_date,remark,raw_file_name
0,1,6,1,1629468000,1633939195,missing values from 2021-10-3 2:56:55 to 2021-...,_1_FPC13_ChCal_1.csv


### Set - Trial No, Unit ID and Species id

In [54]:
# Find the lookup-table row that describes the selected raw file.
species_row = tbl_lookup.query('raw_file_name == @filename')

# Exactly one row must match; with zero or several matches the IDs below would be
# missing or ambiguous, so fail early with a clear message.
if len(species_row) != 1:
    raise ValueError(
        f"expected exactly one tbl_pbr_lookup row for {filename!r}, "
        f"found {len(species_row)}"
    )

# Calling int() directly on a one-element Series is deprecated in recent pandas,
# so the scalar is taken explicitly with .iloc[0] before converting.
species_id = int(species_row['species'].iloc[0])
trail_no = int(species_row['instance'].iloc[0])   # instance (trial) number
unit_id = int(species_row['unit'].iloc[0])

In [55]:
# Echo the identifiers resolved from the lookup table, one per line.
for label, value in (
    ("Species ID : ", species_id),
    ("Instance No : ", trail_no),
    ("Unit No : ", unit_id),
):
    print(label, value)

Species ID :  1
Instance No :  1
Unit No :  6


### Load the raw data into a dataframe

In [56]:
# Full path of the selected raw file: the data folder joined with the chosen file name.
file_path = os.path.join(_dir_path, filename)

In [57]:
# Read the selected comma-separated file into a dataframe and preview the first five rows.
# (',' is read_csv's default separator and 5 is head()'s default row count.)
pbr_rawDF = pd.read_csv(file_path)
pbr_rawDF.head()

Unnamed: 0,primary_key,_timeString,unit,FT1,FT2,PT1_V4,PT2,FT3,CO2in,CO2_V2,...,QT,QC,QB,QR,QG,QT_corr,TT0,TT1,pH,C_PAR
0,1629468000,2021-8-20 16:0:0,FPC13,0.0,20.3,87.4364,115.5517,0.0,0.04,0.2703,...,76.4022,0.3467,0.0977,0.1338,0.0977,-165.7484,20.5091,19.556,14.9008,0.0
1,1629468005,2021-8-20 16:0:5,FPC13,0.0,20.3158,87.6113,114.9015,0.0,0.04,0.2703,...,76.4022,0.3466,0.0977,0.1335,0.0977,-165.7484,20.5091,19.5481,14.9008,0.0
2,1629468010,2021-8-20 16:0:10,FPC13,0.0,20.3041,87.3263,115.4681,0.0,0.04,0.2703,...,76.4022,0.3466,0.0977,0.1332,0.0977,-165.7484,20.5201,19.6102,14.9008,0.0
3,1629468015,2021-8-20 16:0:15,FPC13,0.0,20.3159,87.2076,115.3525,0.0,0.04,0.2703,...,76.4022,0.3469,0.0977,0.1324,0.0977,-165.7484,20.5091,19.5792,14.9008,0.0
4,1629468020,2021-8-20 16:0:20,FPC13,0.0,20.3044,87.241,115.2204,0.0,0.04,0.2703,...,76.4022,0.3473,0.0977,0.1318,0.0977,-165.7484,20.5121,19.6156,14.9008,0.0


In [5]:
# Dataframe overview: prints the index range, each column's non-null count and dtype.
pbr_rawDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 894240 entries, 0 to 894239
Data columns (total 29 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   primary_key  894240 non-null  int64 
 1   _timeString  894240 non-null  object
 2   unit         894240 non-null  object
 3   FT1          894240 non-null  object
 4   FT2          894240 non-null  object
 5   PT1_V4       894240 non-null  object
 6   PT2          894240 non-null  object
 7   FT3          894240 non-null  object
 8   CO2in        894240 non-null  object
 9   CO2_V2       894240 non-null  object
 10  CO2sys       894240 non-null  object
 11  LT1          894240 non-null  object
 12  LReactor     894240 non-null  object
 13  C_g          894240 non-null  object
 14  C_Eff        894240 non-null  object
 15  C_CD         894240 non-null  object
 16  T_CD         894240 non-null  object
 17  C_D_g        894240 non-null  object
 18  C_D_Eff      894240 non-null  object
 19  QT

#### Data preprocessing function (method chaining)

In [None]:

# TODO: placeholder for the chained preprocessing steps — the parentheses are still
# empty, so this cell currently evaluates to an empty tuple and transforms nothing.
(
    
)

### Export Cleaned Data into DB

## Appendix

In [None]:
# Disabled alternative, wrapped in a bare string literal so that it is never executed:
# it derives the instance number from the second "_"-separated token of `filename`
# instead of reading it from the lookup table.
'''
# set instance number from the file name
trail_no = re.split("_", filename)
print(trail_no)
trail_no = int(trail_no[1])
print(trail_no)

'''

In [None]:
# Disabled snippet, wrapped in a bare string literal so that it is never executed:
# it selects the `unit_id` column for the row of `tbl_unit` whose unit_name is "FPC13".
# NOTE(review): `tbl_unit` is not defined in the visible part of this notebook — it
# would have to be loaded first if this snippet is ever re-enabled.
'''
unit_ID = tbl_unit.query('unit_name == "FPC13"')
unit_no = unit_ID['unit_id']
unit_no

'''