In [1]:
import os
import numpy as np
import pandas as pd
from concurrent.futures import ThreadPoolExecutor


In [2]:
# Root folder; each sample lives in its own sub-folder underneath it.
data_root_path = "d:\\joystick"

# Sample sub-folders whose 'Durability' logs will be collected.
sample_names = [
    "RAW DATA 1",
]

In [3]:
# Collect every '.CSV' file found in each sample's 'Durability' folder.
file_path_list = []

for sample_name in sample_names:
    durability_dir = os.path.join(data_root_path, sample_name, 'Durability')
    file_path_list.extend(
        os.path.join(durability_dir, entry)
        for entry in os.listdir(durability_dir)
        if entry.endswith('.CSV')  # suffix match is case-sensitive
    )

# Plain lexicographic ordering of the full path strings.
file_path_list.sort()

In [4]:
# Show the first three collected paths as a quick sanity check.
file_path_list[0:3]

['d:\\joystick\\RAW DATA 1\\Durability\\SN0001_2024-02-07 12-01_(1)_STOP.CSV',
 'd:\\joystick\\RAW DATA 1\\Durability\\SN0001_2024-02-07 12-21_(1170)_STOP.CSV',
 'd:\\joystick\\RAW DATA 1\\Durability\\SN0001_2024-02-07 12-42_(2333)_STOP.CSV']

In [5]:
# Labels assigned positionally to the columns of every loaded CSV.
data_column_names = [
    'port_1_p(bar)',
    'port_2_p(bar)',
    'port_3_p(bar)',
    'port_4_p(bar)',
    'support_p(bar)',
    'tank_p(bar)',
    'temp(c)',
]

In [6]:
def load_csv(path_list, column_names=None, skip_rows=9):
    """Read one log CSV and return its data rows under fixed column labels.

    Parameters
    ----------
    path_list : str or path-like
        Path of a single CSV file. The parameter name is kept for backward
        compatibility; it receives one path per call, not a list.
    column_names : sequence of str, optional
        Labels assigned positionally to the parsed columns. When omitted,
        the notebook-level ``data_column_names`` is used.
    skip_rows : int, default 9
        Number of parsed rows to discard after the header line.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, relabelled with ``column_names``. No dtype
        conversion is done here, and the original row index (starting at
        ``skip_rows``) is preserved.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of parsed columns.
    """
    if column_names is None:
        # Resolved at call time so the notebook constant stays the default.
        column_names = data_column_names

    # read_csv consumes the first line as a header; the next `skip_rows`
    # parsed rows are dropped as well.
    # NOTE(review): presumably a metadata preamble - confirm against a raw file.
    data = pd.read_csv(path_list, encoding='cp949').iloc[skip_rows:]
    data.columns = list(column_names)

    return data

In [7]:
# Load the selected files in parallel on a thread pool.
# Only the first collected path is submitted (slice [0:1]).
paths_to_load = file_path_list[0:1]
with ThreadPoolExecutor() as executor:
    results = [frame for frame in executor.map(load_csv, paths_to_load)]

# Stack the per-file frames into a single DataFrame.
result_df = pd.concat(results)

In [8]:
# Elapsed time per row: row position divided by 100 * 3600.
# NOTE(review): presumably 100 samples per second - confirm the logger rate.
samples_per_hour = 100 * 3600

# Replace the index carried over from the source files with 0..n-1.
result_df = result_df.reset_index(drop=True)
result_df['time(hour)'] = np.arange(len(result_df)) / samples_per_hour

In [9]:
# Inspect column dtypes, non-null counts and memory usage of the merged frame.
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   port_1_p(bar)   120000 non-null  object 
 1   port_2_p(bar)   120000 non-null  object 
 2   port_3_p(bar)   120000 non-null  object 
 3   port_4_p(bar)   120000 non-null  object 
 4   support_p(bar)  120000 non-null  object 
 5   tank_p(bar)     120000 non-null  object 
 6   temp(c)         120000 non-null  object 
 7   time(hour)      120000 non-null  float64
dtypes: float64(1), object(7)
memory usage: 7.3+ MB


In [10]:
# Preview the first five rows of the merged frame.
result_df.head()

Unnamed: 0,port_1_p(bar),port_2_p(bar),port_3_p(bar),port_4_p(bar),support_p(bar),tank_p(bar),temp(c),time(hour)
0,0.55,12.29,12.42,0.53,12.37,0.53,48.0,0.0
1,0.55,12.32,12.45,0.55,12.37,0.54,48.0,3e-06
2,0.54,12.31,12.43,0.54,12.35,0.54,48.0,6e-06
3,0.58,12.32,12.43,0.54,12.38,0.55,48.0,8e-06
4,0.55,12.3,12.43,0.54,12.36,0.55,48.0,1.1e-05


In [43]:
# Columns to be stored as 32-bit floats.
float32_target_var = [
    'port_1_p(bar)',
    'port_2_p(bar)',
    'port_3_p(bar)',
    'port_4_p(bar)',
    'support_p(bar)',
    'tank_p(bar)',
    'time(hour)',
]

# Columns to be stored as unsigned 8-bit integers.
uint8_target_var = [
    'temp(c)',
]

In [44]:
# Downcast the listed float columns to 32-bit in a single call.
# NOTE(review): float32 keeps about 7 significant digits; 'time(hour)' may lose
# per-sample resolution once many files are concatenated - confirm.
result_df = result_df.astype({name: np.float32 for name in float32_target_var})

# Convert via float64 first, then narrow to uint8.
# NOTE(review): presumably needed because the raw values are strings such as
# '48.0'. The uint8 cast drops any fractional part and assumes values fit
# 0-255 - confirm against the sensor range.
for target_dtype in (np.float64, np.uint8):
    result_df = result_df.astype({name: target_dtype for name in uint8_target_var})

In [45]:
# Re-check dtypes and memory usage after the dtype conversion.
result_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   port_1_p(bar)   120000 non-null  float32
 1   port_2_p(bar)   120000 non-null  float32
 2   port_3_p(bar)   120000 non-null  float32
 3   port_4_p(bar)   120000 non-null  float32
 4   support_p(bar)  120000 non-null  float32
 5   tank_p(bar)     120000 non-null  float32
 6   temp(c)         120000 non-null  uint8  
 7   time(hour)      120000 non-null  float32
dtypes: float32(7), uint8(1)
memory usage: 3.3 MB


In [11]:
# Preview the first five rows.
# NOTE(review): this cell's execution count (11) is lower than the dtype
# conversion cells (43-45), so its saved output predates the cast; re-run the
# notebook top to bottom to refresh it.
result_df.head()

Unnamed: 0,port_1_p(bar),port_2_p(bar),port_3_p(bar),port_4_p(bar),support_p(bar),tank_p(bar),temp(c),time(hour)
0,0.55,12.29,12.42,0.53,12.37,0.53,48.0,0.0
1,0.55,12.32,12.45,0.55,12.37,0.54,48.0,3e-06
2,0.54,12.31,12.43,0.54,12.35,0.54,48.0,6e-06
3,0.58,12.32,12.43,0.54,12.38,0.55,48.0,8e-06
4,0.55,12.3,12.43,0.54,12.36,0.55,48.0,1.1e-05


In [12]:
# Preview the last five rows; the final index and time value show the total
# row count and covered duration.
# NOTE(review): execution count 12 precedes the dtype conversion cells (43-45),
# so the saved output predates the cast.
result_df.tail()

Unnamed: 0,port_1_p(bar),port_2_p(bar),port_3_p(bar),port_4_p(bar),support_p(bar),tank_p(bar),temp(c),time(hour)
119995,13.04,0.61,0.79,20.17,39.56,0.66,48.0,0.333319
119996,11.99,0.57,0.74,20.5,39.56,0.63,48.0,0.333322
119997,10.89,0.55,0.77,20.71,39.18,0.65,48.0,0.333325
119998,9.86,0.62,0.82,20.94,39.1,0.71,48.0,0.333328
119999,8.73,0.67,0.83,21.48,39.26,0.72,48.0,0.333331


In [23]:
# Export the frame as CSV (the index is written as the first column).
# Float columns are written with 3 decimals, except 'time(hour)': consecutive
# rows are 1 / (100 * 3600) h apart (about 2.8e-6), so '%.3f' would collapse
# hundreds of consecutive rows onto the same timestamp. That column is
# therefore pre-formatted as text with 6 decimals, which float_format leaves
# untouched.
csv_export_df = result_df.assign(
    **{'time(hour)': result_df['time(hour)'].map('{:.6f}'.format)}
)
csv_export_df.to_csv('d:\\joystick\\test.csv', float_format='%.3f')

In [48]:
# Persist result_df as a Parquet file through the pyarrow engine.
parquet_path = 'd:\\joystick\\test.parquet'
result_df.to_parquet(parquet_path, engine='pyarrow')