In [46]:
import numpy as np
import pandas as pd
import os
from datetime import date
import sqlalchemy
import pymysql

import warnings    # to avoid warning during executions
warnings.filterwarnings("ignore")

In [48]:
# Load the names of all raw export files in the PBR data folder.
# os.listdir returns entries in arbitrary order, so the list is sorted to keep
# positions stable between runs (a later cell picks a file by its index).
_dir_path = '../../../dataExport/PBRdata'
raw_data = sorted(os.listdir(_dir_path))
raw_data

['_10_FPC23_Rhosa.csv',
 '_11_FPC24_SkeMa_1.csv',
 '_12_FPC24_SkeMa_2.csv',
 '_13_FPC11_Rhosa_1.csv',
 '_14_FPC12_DiaLut.csv',
 '_15_FPC22_Thalaps.csv',
 '_16_FPC23_NaCho.csv',
 '_1_FPC13_ChCal_1.csv',
 '_2_FPC13_ChCal_2.csv',
 '_3_FPC13_ThalaPs.csv',
 '_4_FPC14_chamu.csv',
 '_5_FPC14_RhoSa.csv',
 '_6_FPC14_thalaps.csv',
 '_7_FPC21_chamu.csv',
 '_8_FPC21_Techu.csv',
 '_9_FPC22_DiaLut.csv']

In [57]:
# Select the raw file to process by its position in `raw_data`.
# NOTE(review): index 10 is '_4_FPC14_chamu.csv' in the listing displayed above;
# the position depends on the folder contents and listing order - confirm before re-running.
filename = raw_data[10]

In [58]:
# database connection with mariaDB using SQL Alchemy
def dbConn_sqlAlc():
    """Create a SQLAlchemy engine for the dashboard MariaDB database.

    Connection settings are taken from the environment variables
    ``PBR_DB_USER``, ``PBR_DB_PASSWORD``, ``PBR_DB_HOST`` (``host:port``) and
    ``PBR_DB_NAME``. Any variable that is not set falls back to the previous
    hard-coded local development value, so existing setups keep working.

    Returns:
        The engine produced by ``sqlalchemy.create_engine`` for a
        ``mariadb+mariadbconnector://`` URL.
    """
    from urllib.parse import quote_plus

    database_username = os.environ.get('PBR_DB_USER', 'root')
    database_password = os.environ.get('PBR_DB_PASSWORD', 'password')
    database_ip       = os.environ.get('PBR_DB_HOST', '127.0.0.1:3306')
    database_name     = os.environ.get('PBR_DB_NAME', 'data_dashboard')

    # User name and password are URL-quoted so characters such as '@', ':' or
    # '/' in a real password cannot corrupt the connection URL.
    connection_url = 'mariadb+mariadbconnector://{0}:{1}@{2}/{3}'.format(
        quote_plus(database_username),
        quote_plus(database_password),
        database_ip,
        database_name,
    )
    database_connection = sqlalchemy.create_engine(connection_url)
    return database_connection

# database connection with MariaDB using the PyMySQL driver
def dBCon_Maria():
    """Open a PyMySQL connection to the dashboard database.

    Connection settings are taken from the environment variables
    ``PBR_DB_HOST`` (``host`` or ``host:port``), ``PBR_DB_USER``,
    ``PBR_DB_PASSWORD`` and ``PBR_DB_NAME``. Any variable that is not set
    falls back to the previous hard-coded local development value.

    Returns:
        The connection object returned by ``pymysql.connect``; the caller is
        responsible for closing it.
    """
    host_setting = os.environ.get('PBR_DB_HOST', 'localhost')
    # Accept an optional ":port" suffix so one variable can serve both
    # connection helpers in this notebook.
    host, _, port = host_setting.partition(':')

    connect_kwargs = dict(host=host,
                          user=os.environ.get('PBR_DB_USER', 'root'),
                          password=os.environ.get('PBR_DB_PASSWORD', 'password'),
                          db=os.environ.get('PBR_DB_NAME', 'data_dashboard'))
    if port:
        connect_kwargs['port'] = int(port)

    connection = pymysql.connect(**connect_kwargs)
    return connection


# connection = dBCon_Maria()
# cursor=connection.cursor()                                            
# database_connection = dbConn_sqlAlc()

In [59]:
# set trial no, species_id and unit number for processing data
def set_varables(filename):
    """Look up the species, trial number and unit for one raw export file.

    Reads ``tbl_pbr_lookup`` and selects the row whose ``raw_file_name``
    equals ``filename``.

    Args:
        filename: Raw file name as stored in ``tbl_pbr_lookup.raw_file_name``.

    Returns:
        Tuple ``(species_id, trial_no, unit_id)`` of ints taken from the
        ``species``, ``instance`` and ``unit`` columns of the matching row.

    Raises:
        ValueError: If the lookup table does not contain exactly one row for
            ``filename``.
    """
    database_connection = dbConn_sqlAlc()
    try:
        tbl_lookup = pd.read_sql('SELECT * FROM tbl_pbr_lookup', database_connection)
    finally:
        # Release the pooled connections; a fresh engine is built on every call.
        database_connection.dispose()

    species_row = tbl_lookup[tbl_lookup['raw_file_name'] == filename]
    if len(species_row) != 1:
        # int() on a zero- or multi-row Series fails with an unhelpful message,
        # so report the lookup problem explicitly.
        raise ValueError(
            "expected exactly one tbl_pbr_lookup row for raw_file_name "
            f"{filename!r}, found {len(species_row)}"
        )

    record = species_row.iloc[0]
    species_id = int(record['species'])
    trial_no = int(record['instance'])
    unit_id = int(record['unit'])
    return species_id, trial_no, unit_id

# Preprocessing the data: setting/adding values and data types
def pbr_preProcess(pbr_rawDF, raw_file_name=None):
    """Add trial/species/unit metadata to a raw PBR frame and fix its dtypes.

    The input frame is copied, not modified. ``trial_no`` is inserted as the
    first column and ``species`` at position 4; the column that ends up at
    position 3 is renamed to ``unit`` and filled with the unit id. The strings
    ``'Empty'`` and ``'inf'`` and the values +/-infinity are replaced by NaN
    before the columns are cast to their target dtypes.

    Args:
        pbr_rawDF: Raw data frame as read from one export CSV.
        raw_file_name: File name used for the metadata lookup. When omitted,
            the notebook-level ``filename`` variable is used, which is what
            earlier callers relied on.

    Returns:
        A new DataFrame with the metadata columns added and dtypes applied.
    """
    if raw_file_name is None:
        # Backward compatibility with callers that set the global `filename`.
        raw_file_name = filename
    species_id, trial_no, unit_id = set_varables(raw_file_name)

    pbr_raw_tmp1 = pbr_rawDF.copy()
    pbr_raw_tmp1.insert(loc=0, column='trial_no', value=trial_no)
    pbr_raw_tmp1.insert(loc=4, column='species', value=species_id)

    float_columns = ['FT1', 'FT2', 'PT1_V4', 'PT2', 'FT3', 'CO2in', 'CO2_V2',
                     'CO2sys', 'LT1', 'LReactor', 'C_g', 'C_Eff', 'C_CD', 'T_CD',
                     'C_D_g', 'C_D_Eff', 'QT', 'QC', 'QB', 'QR', 'QG', 'QT_corr',
                     'TT0', 'TT1', 'pH', 'C_PAR']
    column_types = {'primary_key': 'int64', '_timeString': 'datetime64[ns]', 'unit': 'int16'}
    column_types.update({column: 'float32' for column in float_columns})

    # np.nan is used directly: the `pd.np` alias was removed in pandas 2.0.
    return (
        pbr_raw_tmp1
        .rename(columns={pbr_raw_tmp1.columns[3]: "unit"})
        .assign(unit=unit_id)
        .replace('Empty', np.nan)
        .replace('inf', np.nan)
        .replace([np.inf, -np.inf], np.nan)
        .astype(column_types)
    )

# Missing-value imputation by interpolation (linear, the pandas default)
def missingValue_imputation(pbr_cleaned, method='linear', **interpolate_kwargs):
    """Fill missing sensor readings by interpolating along the time column.

    The frame is indexed by ``_timeString`` while interpolating and then
    returned with a fresh integer index and the original column order. The
    input frame is not modified.

    The previous implementation passed ``option='spline'``, which is not an
    ``interpolate`` parameter, so pandas' default linear method was what
    actually ran. Linear therefore stays the default here; a different method
    can now be requested explicitly.

    Args:
        pbr_cleaned: Preprocessed frame containing ``_timeString`` and the
            measurement columns listed in ``pbr_columns``.
        method: Interpolation method forwarded to ``DataFrame.interpolate``.
        **interpolate_kwargs: Extra keyword arguments forwarded to
            ``DataFrame.interpolate`` (for example ``order=3`` together with
            ``method='spline'``).

    Returns:
        A new DataFrame with the measurement columns interpolated.
    """
    pbr_columns = ['FT1', 'FT2', 'PT1_V4', 'PT2', 'FT3', 'CO2in', 'CO2_V2', 'CO2sys', 'LT1', 'LReactor', 'C_g', 'C_Eff', 'C_CD', 'T_CD', 'C_D_g', 'C_D_Eff', 'QT', 'QC', 'QB', 'QR', 'QG', 'QT_corr', 'TT0', 'TT1', 'pH', 'C_PAR']

    pbr_cleaned_indexed = pbr_cleaned.set_index('_timeString')
    # Interpolate all measurement columns in one call instead of column by column.
    pbr_cleaned_indexed[pbr_columns] = (
        pbr_cleaned_indexed[pbr_columns].interpolate(method=method, **interpolate_kwargs)
    )

    # Bring `_timeString` back as a column and restore the caller's column order.
    pbr_cleaned_indexed = pbr_cleaned_indexed.reset_index(level=0)
    return pbr_cleaned_indexed[list(pbr_cleaned.columns)]

# exporting the preprocessed data into db pbr
def pbr_exporttoDB(pbr_preprocessed, batch_size=10000):
    """Insert every row of the preprocessed frame into ``tbl_pbr_data``.

    The frame's column names are used as the target column names. Rows are
    sent in batches through ``cursor.executemany`` and committed once, after
    all batches have been accepted. The connection is always closed, also when
    an insert fails; in that case nothing is committed by this function and
    the exception propagates to the caller.

    Args:
        pbr_preprocessed: DataFrame whose columns match ``tbl_pbr_data``.
        batch_size: Number of rows handed to ``executemany`` per call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # The statement is identical for every row, so it is built once.
    cols = "`,`".join(str(column) for column in pbr_preprocessed.columns)
    placeholders = ",".join(["%s"] * len(pbr_preprocessed.columns))
    sql = "INSERT INTO `tbl_pbr_data` (`" + cols + "`) VALUES (" + placeholders + ")"

    connection = dBCon_Maria()
    try:
        with connection.cursor() as cursor:
            for start in range(0, len(pbr_preprocessed), batch_size):
                chunk = pbr_preprocessed.iloc[start:start + batch_size]
                # Missing values (NaN/NaT) are sent as None so they are stored
                # as SQL NULL rather than passed to the driver as float NaN.
                chunk = chunk.astype(object).where(chunk.notna(), None)
                rows = list(chunk.itertuples(index=False, name=None))
                cursor.executemany(sql, rows)
        connection.commit()
    finally:
        connection.close()

# export_status = pbr_exporttoDB(pbr_preprocessed)
# pbr_exporttoDB(pbr_preprocessed)
# pbr_preprocessed = missingValue_imputation(pbr_cleaned)
# pbr_cleaned = pbr_preProcess(pbr_rawDF)
# variable_list = set_varables(filename)

In [60]:
# load content of the export_list into list split by new line
# data = [line.strip() for line in open("exported_list.txt", 'r')]
# Names of files that were already exported, one per line.
exported_list_path = './exported_list.txt'
if os.path.exists(exported_list_path):
    with open(exported_list_path) as f:
        lines = f.read().splitlines()
else:
    # No ledger yet means nothing has been exported so far.
    lines = []

if filename not in lines:
    file_path = os.path.join(_dir_path, filename)
    pbr_rawDF = pd.read_csv(file_path, sep=',')
    pbr_cleaned = pbr_preProcess(pbr_rawDF)
    pbr_preprocessed = missingValue_imputation(pbr_cleaned)
    try:
        export_status = pbr_exporttoDB(pbr_preprocessed)
        # Record the file only after the export succeeded. The leading newline
        # is kept because existing ledgers do not end with one.
        with open(exported_list_path, "a") as file1:
            file1.write("\n")
            file1.write(filename)
        print('file is exported')
    except Exception as e:
        # Keep the notebook running, but say which file failed and why.
        print(f"export of {filename} failed: {type(e).__name__}: {e}")

else:
    print('data is present in the db - check and very the trial number in tbl_pbr_data')

file is exported


### Test the Exported Data

In [54]:
# Read back every row stored for trial 3 so its shape can be compared with the raw frame.
# NOTE(review): the trial number is hard-coded; presumably it should be the trial
# of the file that was just exported - confirm against tbl_pbr_lookup.
database_connection = dbConn_sqlAlc()
tbl_logdata = pd.read_sql('SELECT * FROM tbl_pbr_data where trial_no = 3', database_connection)


In [55]:
# (rows, columns) of the data read back from tbl_pbr_data.
tbl_logdata.shape

(586080, 31)

In [56]:
# (rows, columns) of the raw CSV frame, for comparison with the table above.
# pbr_preProcess inserts two columns (trial_no and species), so the stored
# data has two more columns than the raw frame.
pbr_rawDF.shape

(586080, 29)