In [12]:
import numpy as np
import pandas as pd
import os
import re
from datetime import date
import sqlalchemy

import warnings    # to avoid warning during executions
warnings.filterwarnings("ignore")

In [2]:
# Path of the log export to process.
# A raw string keeps the Windows-style backslashes literal; in a normal string
# sequences such as "\d" or "\L" are invalid escapes that Python only passes
# through for backward compatibility (and warns about on newer versions).
file_name = r'..\..\dataExport\LOGdata\log_2022_4_20.csv'

In [13]:
# Connection to the MariaDB database used to move data between DataFrames and
# database tables.
# Each setting is read from an environment variable when one is set, so real
# credentials do not have to be stored in the notebook. The literals are the
# previous hard-coded values, kept as fallbacks so the notebook behaves exactly
# as before when no variable is defined.
database_username = os.environ.get('DB_USERNAME', 'root')
database_password = os.environ.get('DB_PASSWORD', 'password')
database_ip       = os.environ.get('DB_HOST', '127.0.0.1:3306')
database_name     = os.environ.get('DB_NAME', 'data_dashboard')
# NOTE(review): the values are interpolated into the URL as-is; a password
# containing characters such as '@' or '/' would need URL-escaping first.
database_connection = sqlalchemy.create_engine(
    'mariadb+mariadbconnector://{0}:{1}@{2}/{3}'.format(
        database_username, database_password, database_ip, database_name))

In [15]:
# Load the unit lookup (unit_id, unit_name) from tbl_unit into a DataFrame
# and preview its first five rows.
tbl_unit = pd.read_sql(
    sql='SELECT unit_id, unit_name FROM tbl_unit',
    con=database_connection,
)
tbl_unit.head()

Unnamed: 0,unit_id,unit_name
0,31,FFP_AUTFU
1,19,FFP_FFP
2,3,FFU
3,14,FPC11
4,34,FPC11_AUT11


In [16]:
# Load the parameter lookup (prm_id, prm_name) from tbl_log_parameters into a
# DataFrame and preview its first five rows.
tbl_parameter = pd.read_sql(
    sql='SELECT prm_id, prm_name FROM tbl_log_parameters',
    con=database_connection,
)
tbl_parameter.head()

Unnamed: 0,prm_id,prm_name
0,1,Arduino
1,101,Arduino1
2,66,Arduino1_LT41
3,125,Arduino1_LT42
4,126,Arduino1_LT43


## Removing extra newlines

In [None]:
# Join every line that does not start in the standard record format onto the
# previous line, and save the preprocessed data back into the same file.
def del_newline(file_name, record_prefix="202"):
    """Merge continuation lines of a log file into their record, in place.

    Each line is stripped of surrounding whitespace. A line that starts with
    ``record_prefix`` begins a new output line; any other line is concatenated
    directly (no separator) onto the text written so far.

    Parameters
    ----------
    file_name : str
        Path of the UTF-8 text file to rewrite in place.
    record_prefix : str, optional
        Text that marks the start of a record line. Defaults to ``"202"``,
        the prefix the original implementation checked for.
    """
    with open(file_name, 'r+', encoding="utf-8") as file:
        # Collect the pieces and join once instead of growing a string
        # inside the loop.
        pieces = []
        for line in file:
            if line.startswith(record_prefix):
                pieces.append('\n')
            pieces.append(line.strip())
        file.seek(0)
        file.write(''.join(pieces))
        # The rewritten text is usually shorter than the original content;
        # without truncating, the old tail of the file would be left behind
        # after the new text.
        file.truncate()

# Rewrite the log file in place so each record occupies a single line.
del_newline(file_name)

In [7]:
# Read the '*'-separated log file into a DataFrame and preview ten rows.
log_rawDF = pd.read_csv(
    filepath_or_buffer=file_name,
    sep='*',
)
log_rawDF.head(n=10)

Unnamed: 0,Date_Time,Time,MicroSec,Type,Unit,Parameter,Message,Value1,Value2
0,2021-4-20 13:0:13,1618916413,613129,4,FPC11_FPP11,Arduino,"more than 2 communication restarts, try powerd...",,
1,2021-4-20 13:0:57,1618916457,9327,4,FPC11_FPP11,Arduino,"more than 2 communication restarts, try powerd...",,
2,2021-4-20 13:1:38,1618916498,812633,4,FPC11_FPP11,Arduino,"more than 2 communication restarts, try powerd...",,
3,2021-4-20 13:2:20,1618916540,214501,4,FPC11_FPP11,Arduino,"more than 2 communication restarts, try powerd...",,
4,2021-4-20 13:3:3,1618916583,212528,4,FPC11_FPP11,Arduino,"more than 2 communication restarts, try powerd...",,
5,2021-4-20 13:3:46,1618916626,209680,4,FPC11_FPP11,Arduino,"more than 2 communication restarts, try powerd...",,
6,2021-4-20 13:3:58,1618916638,271269,3,FPC14,program,step ID - Changed (new and old value),4_WaterAcid,5_empty
7,2021-4-20 13:4:1,1618916641,101959,3,FPC14,program,start at - Changed (new and old value),sellected step at...,first step
8,2021-4-20 13:4:6,1618916646,386245,3,FPC14,program,procedure running - Changed,not active,active
9,2021-4-20 13:4:7,1618916647,639887,2,FPC14,program,Program F_ThalaPseu_HalfNutr started,,


In [8]:
# (rows, columns) of the raw log frame.
print(log_rawDF.shape)

(864368, 9)


In [29]:
# Per-column dtype and non-null count, plus memory usage, of the raw frame.
log_rawDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 864368 entries, 0 to 864367
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Date_Time  864368 non-null  object
 1   Time       864368 non-null  int64 
 2   MicroSec   864368 non-null  int64 
 3   Type       864368 non-null  int64 
 4   Unit       864368 non-null  object
 5   Parameter  864368 non-null  object
 6   Message    864368 non-null  object
 7   Value1     411351 non-null  object
 8   Value2     495515 non-null  object
dtypes: int64(3), object(6)
memory usage: 59.4+ MB


In [9]:
# Column labels of the raw frame.
log_rawDF.columns

Index(['Date_Time', 'Time', 'MicroSec', 'Type', 'Unit', 'Parameter', 'Message',
       'Value1', 'Value2'],
      dtype='object')

## Replace Parameter and Unit names with their integer IDs from the lookup tables

In [34]:
# Data type conversion, and dropping rows whose Parameter value is missing.

def log_tweak(log_rawDF, unit_table=None, parameter_table=None):
    """Return a cleaned copy of the raw log frame.

    Steps, in order:

    1. Replace ``Unit`` and ``Parameter`` names with the matching ``unit_id``
       / ``prm_id`` from the lookup tables and parse ``Date_Time``.
    2. Drop rows whose ``Parameter`` has no matching ID (NaN after mapping).
       This must happen before the integer cast, because ``astype`` cannot
       convert NaN to an integer dtype.
    3. Cast ``Type`` to int8, ``Unit`` to int16 and ``Parameter`` to int32.
    4. Remove rows where ``Type`` equals 1.

    Parameters
    ----------
    log_rawDF : pandas.DataFrame
        Raw log frame with at least the columns ``Date_Time``, ``Type``,
        ``Unit`` and ``Parameter``.
    unit_table : pandas.DataFrame, optional
        Lookup with columns ``unit_name`` and ``unit_id``. Defaults to the
        notebook-level ``tbl_unit``.
    parameter_table : pandas.DataFrame, optional
        Lookup with columns ``prm_name`` and ``prm_id``. Defaults to the
        notebook-level ``tbl_parameter``.

    Returns
    -------
    pandas.DataFrame
        New frame; ``log_rawDF`` itself is not modified.

    Notes
    -----
    Rows with an unmapped ``Unit`` are not dropped, so such a row still makes
    the integer cast raise.
    """
    # Fall back to the notebook-level lookup tables, as the original did.
    if unit_table is None:
        unit_table = tbl_unit
    if parameter_table is None:
        parameter_table = tbl_parameter

    unit_ids = unit_table.set_index('unit_name')['unit_id']
    parameter_ids = parameter_table.set_index('prm_name')['prm_id']

    return (
        log_rawDF
        .assign(Unit=log_rawDF.Unit.map(unit_ids),
                Parameter=log_rawDF.Parameter.map(parameter_ids),
                Date_Time=pd.to_datetime(log_rawDF.Date_Time))
        # Drop unmapped parameters first; casting NaN to int32 would raise.
        .dropna(subset=['Parameter'])
        .astype({'Type': 'int8', 'Unit': 'int16', 'Parameter': 'int32'})
        .query('Type != 1')
    )

# Build the cleaned frame from the raw log; log_rawDF is kept for reference.
log_cleaned = log_tweak(log_rawDF)

In [35]:
# Preview the first ten rows of the cleaned frame.
log_cleaned.head(10)

Unnamed: 0,Date_Time,Time,MicroSec,Type,Unit,Parameter,Message,Value1,Value2
0,2021-04-20 13:00:13,1618916413,613129,4,1,1,"more than 2 communication restarts, try powerd...",,
1,2021-04-20 13:00:57,1618916457,9327,4,1,1,"more than 2 communication restarts, try powerd...",,
2,2021-04-20 13:01:38,1618916498,812633,4,1,1,"more than 2 communication restarts, try powerd...",,
3,2021-04-20 13:02:20,1618916540,214501,4,1,1,"more than 2 communication restarts, try powerd...",,
4,2021-04-20 13:03:03,1618916583,212528,4,1,1,"more than 2 communication restarts, try powerd...",,
5,2021-04-20 13:03:46,1618916626,209680,4,1,1,"more than 2 communication restarts, try powerd...",,
6,2021-04-20 13:03:58,1618916638,271269,3,2,2,step ID - Changed (new and old value),4_WaterAcid,5_empty
7,2021-04-20 13:04:01,1618916641,101959,3,2,2,start at - Changed (new and old value),sellected step at...,first step
8,2021-04-20 13:04:06,1618916646,386245,3,2,2,procedure running - Changed,not active,active
9,2021-04-20 13:04:07,1618916647,639887,2,2,2,Program F_ThalaPseu_HalfNutr started,,


In [36]:
# Summary of the cleaned frame: structure, missing values and the distinct
# unit IDs it contains.
print('Data frame Info')
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
log_cleaned.info()
print('\nNullvalues in each columns')
print(log_cleaned.isnull().sum())
print('\nLists of unique values in unit column')
print(log_cleaned['Unit'].unique())
print('\nTotal number of unique values in unit column')
print(log_cleaned['Unit'].nunique())


Data frame Info
<class 'pandas.core.frame.DataFrame'>
Int64Index: 662835 entries, 0 to 863359
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   Date_Time  662835 non-null  datetime64[ns]
 1   Time       662835 non-null  int64         
 2   MicroSec   662835 non-null  int64         
 3   Type       662835 non-null  int8          
 4   Unit       662835 non-null  int16         
 5   Parameter  662835 non-null  int32         
 6   Message    662835 non-null  object        
 7   Value1     407367 non-null  object        
 8   Value2     491141 non-null  object        
dtypes: datetime64[ns](1), int16(1), int32(1), int64(2), int8(1), object(3)
memory usage: 39.8+ MB
None

Nullvalues in each columns
Date_Time         0
Time              0
MicroSec          0
Type              0
Unit              0
Parameter         0
Message           0
Value1       255468
Value2       171694
dtype: int64

Lists of unique valu