In [7]:
import numpy as np
import pandas as pd
import os
from datetime import date
import sqlalchemy
import pymysql
import re

import warnings    # to avoid warning during executions
warnings.filterwarnings("ignore")

In [2]:
# load all the file names into a list (files in the pbr data folder)
_dir_path_hour = '../../../dataExport/cumulativeDATA/Hour'
# os.listdir() returns entries in arbitrary order; sorting makes the listing
# (and any positional pick such as raw_data_hour[2]) reproducible across runs.
raw_data_hour = sorted(os.listdir(_dir_path_hour))
raw_data_hour

['_10_FPC23_RhoSa.csv',
 '_11_FPC24_SkeMa_1.csv',
 '_12_FPC24_SkeMa_2.csv',
 '_16_FPC23_NaCho.csv',
 '_1_FPC13_ChCal_1.csv',
 '_2_FPC13_ChCal_2.csv',
 '_3_FPC13_ThalaPs.csv',
 '_4_FPC14_ChaMu.csv',
 '_5_FPC14_RhoSa.csv',
 '_6_FPC14_ThalaPs.csv',
 '_7_FPC21_ChaMu.csv',
 '_8_FPC21_TeChu.csv',
 '_9_FPC22_DiaLut.csv']

In [3]:
# Select the file to process by its position in the directory listing
# (index 2 was '_12_FPC24_SkeMa_2.csv' in the listing shown above).
filename_hour = raw_data_hour[2]
filename_hour

'_12_FPC24_SkeMa_2.csv'

In [4]:
# database connection with mariaDB using SQL Alchemy
def dbConn_sqlAlc():
    """Create a SQLAlchemy engine for the MariaDB dashboard database.

    Connection settings are taken from the environment variables
    ``DB_USERNAME``, ``DB_PASSWORD``, ``DB_HOST``, ``DB_PORT`` and ``DB_NAME``.
    Any variable that is not set falls back to the local development value
    (``root`` / ``password`` / ``127.0.0.1`` / ``3306`` / ``data_dashboard``),
    so the default URL is unchanged.

    Returns:
        The object returned by ``sqlalchemy.create_engine`` for the
        ``mariadb+mariadbconnector`` dialect.
    """
    # Local import: only this function needs it.
    from urllib.parse import quote_plus

    database_username = os.environ.get('DB_USERNAME', 'root')
    database_password = os.environ.get('DB_PASSWORD', 'password')
    database_host     = os.environ.get('DB_HOST', '127.0.0.1')
    database_port     = os.environ.get('DB_PORT', '3306')
    database_name     = os.environ.get('DB_NAME', 'data_dashboard')

    # Percent-encode user and password so characters such as '@', ':' or '/'
    # cannot corrupt the URL structure.
    connection_url = 'mariadb+mariadbconnector://{0}:{1}@{2}:{3}/{4}'.format(
        quote_plus(database_username),
        quote_plus(database_password),
        database_host,
        database_port,
        database_name,
    )
    return sqlalchemy.create_engine(connection_url)

# database connection with mariaDB using pymysql package
def dBCon_Maria():
    """Open a pymysql connection to the MariaDB dashboard database.

    Connection settings are taken from the environment variables
    ``DB_USERNAME``, ``DB_PASSWORD``, ``DB_HOST``, ``DB_PORT`` and ``DB_NAME``.
    Any variable that is not set falls back to the local development value
    (``root`` / ``password`` / ``localhost`` / ``3306`` / ``data_dashboard``).

    Returns:
        The connection object returned by ``pymysql.connect``. The caller is
        responsible for committing and closing it.
    """
    return pymysql.connect(
        host=os.environ.get('DB_HOST', 'localhost'),
        port=int(os.environ.get('DB_PORT', '3306')),
        user=os.environ.get('DB_USERNAME', 'root'),
        password=os.environ.get('DB_PASSWORD', 'password'),
        db=os.environ.get('DB_NAME', 'data_dashboard'),
    )


# connection = dBCon_Maria()
# cursor=connection.cursor()                                            
# database_connection = dbConn_sqlAlc()

In [8]:
# set trial no, species_id and unit number for processing data
def set_varables(trial_no):
    """Look up the species, trial and unit identifiers for one trial.

    Queries ``tbl_pbr_lookup`` for the row whose ``instance`` equals
    ``trial_no`` and converts its ``species``, ``instance`` and ``unit``
    values to plain ints.

    Args:
        trial_no: Value matched against the ``instance`` column.

    Returns:
        tuple: ``(species_id, trial_no, unit_id)`` as Python ints.

    Raises:
        ValueError: If the lookup does not return exactly one row.
    """
    database_connection = dbConn_sqlAlc()
    try:
        tbl_lookup = pd.read_sql("SELECT * FROM tbl_pbr_lookup WHERE instance = ?", database_connection, params=[trial_no])
    finally:
        # The engine is created per call, so release its pooled connections here.
        database_connection.dispose()

    # int(Series) is deprecated in recent pandas and gives an opaque TypeError
    # for zero or several rows, so validate the row count and read scalars.
    if len(tbl_lookup) != 1:
        raise ValueError(
            "expected exactly one row in tbl_pbr_lookup for instance {0!r}, found {1}".format(
                trial_no, len(tbl_lookup)))

    species_id = int(tbl_lookup['species'].iloc[0])
    lookup_trial_no = int(tbl_lookup['instance'].iloc[0])
    unit_id = int(tbl_lookup['unit'].iloc[0])
    return species_id, lookup_trial_no, unit_id


# Preprocessing the data setting/ adding values and data types
def pbr_preProcess(sum_hour_raw, trial_no):
    """Add trial metadata to the raw hourly frame and normalise its dtypes.

    Works on a copy of ``sum_hour_raw``. The identifiers returned by
    ``set_varables(trial_no)`` are inserted as ``trial_no`` (position 3) and
    ``species`` (position 4), and ``unit`` is appended as the last column.
    The strings ``'Empty'`` and ``'inf'`` and the floats ``+/-inf`` are
    replaced with NaN before the columns are cast.

    Args:
        sum_hour_raw: DataFrame with at least three columns, including
            ``primary_key``, ``_timeString`` and the ``D_*`` columns named in
            the ``astype`` mapping below.
        trial_no: Trial identifier passed to ``set_varables``.

    Returns:
        pandas.DataFrame: A new frame with the added columns and cast dtypes.
    """
    species_id, lookup_trial_no, unit_id = set_varables(trial_no)
    pbr_raw_tmp1 = sum_hour_raw.copy()
    pbr_raw_tmp1.insert(loc=3, column='trial_no', value=lookup_trial_no)
    pbr_raw_tmp1.insert(loc=4, column='species', value=species_id)
    # np.nan is used directly: the pd.np alias was removed in pandas 2.0.
    return (
        pbr_raw_tmp1
        .assign(unit=unit_id)
        .replace('Empty', np.nan)
        .replace('inf', np.nan)
        .replace([np.inf, -np.inf], np.nan)
        .astype({'primary_key':'int64', '_timeString':'datetime64[ns]','unit': 'int16','D_Algae':'float32', 'D_CD':'float32', 'D_CO2':'float32', 'D_PAR':'float32', 'D_eff':'float32', 'D_feed':'float32', 'D_g':'float32', 'D_harvest':'float32'})
    )


# Missing Value imputation using interpolation (linear unless another method is requested)
def missingValue_imputation(sum_hour_cleaned, method='linear', **interpolate_kwargs):
    """Fill missing values in every column that contains nulls.

    The frame is indexed by ``_timeString`` and each column containing a null
    is passed through ``Series.interpolate``. ``_timeString`` is then restored
    as a regular column and the original column order is re-applied.

    The earlier call passed ``option='spline'``, which is not a documented
    ``interpolate`` parameter, so the default linear method was what ran.
    Linear is now the explicit default; a spline can be requested with
    ``method='spline', order=3`` (pandas requires SciPy for that method).

    Args:
        sum_hour_cleaned: DataFrame containing a ``_timeString`` column.
        method: Interpolation method forwarded to ``Series.interpolate``.
        **interpolate_kwargs: Extra keyword arguments forwarded to
            ``Series.interpolate`` (for example ``order`` or ``limit``).

    Returns:
        pandas.DataFrame: A new frame with the same columns, in the same
        order, as ``sum_hour_cleaned``.
    """
    indexed = sum_hour_cleaned.set_index('_timeString')
    null_cols = indexed.columns[indexed.isnull().any()].tolist()
    for pbr_col in null_cols:
        indexed[pbr_col] = indexed[pbr_col].interpolate(method=method, **interpolate_kwargs)

    # Put _timeString back as a column, then restore the caller's column order.
    restored = indexed.reset_index(level=0)
    return restored[list(sum_hour_cleaned.columns)]


# exporting the preprocessed data into db pbr
def pbr_exporttoDB(sumHour_preprocessed):
    """Insert every row of the frame into ``tbl_cumulative_per_hour``.

    One parameterised ``INSERT`` is executed per row on a connection obtained
    from ``dBCon_Maria()``. The transaction is committed once after all rows
    have been sent. The connection is always closed, including when an insert
    raises; in that case nothing is committed and the exception propagates.

    Args:
        sumHour_preprocessed: DataFrame whose column names match the target
            table's columns.

    Returns:
        None
    """
    # NOTE: column names are interpolated into the statement as identifiers
    # (they cannot be bound as parameters); they must come from a trusted header.
    column_names = [str(col) for col in sumHour_preprocessed.columns.tolist()]
    cols = "`,`".join(column_names)
    placeholders = ",".join(["%s"] * len(column_names))
    # The statement is identical for every row, so build it once.
    sql = "INSERT INTO `tbl_cumulative_per_hour` (`" + cols + "`) VALUES (" + placeholders + ")"

    connection = dBCon_Maria()
    try:
        cursor = connection.cursor()
        for _, row in sumHour_preprocessed.iterrows():
            cursor.execute(sql, tuple(row))
        connection.commit()
    finally:
        # Release the connection even if an insert failed.
        connection.close()

In [9]:
# Load the names of the files that were already exported (one per line).
# A missing list file means nothing has been exported yet.
exported_list_path = './exported_list.txt'
try:
    with open(exported_list_path) as f:
        lines = f.read().splitlines()
except FileNotFoundError:
    lines = []

if filename_hour not in lines:
    # The trial number is the first run of digits in the file name.
    trial_no_match = re.search(r'\d+', filename_hour)
    if trial_no_match is None:
        raise ValueError("no trial number found in file name {0!r}".format(filename_hour))
    trial_no = int(trial_no_match.group())

    file_path = os.path.join(_dir_path_hour, filename_hour)
    sumHour_rawDF = pd.read_csv(file_path, sep=',')
    sum_hour_cleaned = pbr_preProcess(sumHour_rawDF, trial_no)
    sumHour_preprocessed = missingValue_imputation(sum_hour_cleaned)
    try:
        export_status = pbr_exporttoDB(sumHour_preprocessed)
        # Record the file only after the export succeeded; `with` guarantees
        # the handle is closed even if the write fails.
        with open(exported_list_path, "a") as exported_file:
            exported_file.write("\n")
            exported_file.write(filename_hour)
        print('file is exported')
    except Exception as e:
        # Best effort: report the failure and keep the notebook running.
        print(e)

else:
    print('data is present in the db - check and very the trial number in tbl_pbr_data')

file is exported


### Test the Exported Data

In [None]:
# Read back the rows stored for the trial that was just processed.
database_connection = dbConn_sqlAlc()
tbl_logdata = pd.read_sql(
    sql='SELECT * FROM tbl_cumulative_per_hour where trial_no = ?',
    con=database_connection,
    params=[trial_no],
)

In [None]:
# (rows, columns) of the data read back from tbl_cumulative_per_hour
tbl_logdata.shape

In [None]:
# (rows, columns) of the raw CSV frame, for comparison with the table read back above
sumHour_rawDF.shape