##########################################################################
#### Raw CSV data processing and PCA dimensionality reduction
#### Author : Juan Pablo Valdes and Fuyue Liang
### First commit: Feb 2024
### Department of Chemical Engineering, Imperial College London
##########################################################################
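# Usage note: this script is driven entirely by interactive input() prompts
# (study/case name, dataset name, cut-off index, scaling method, ...). Any
# example answers shown below are illustrative only.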
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from paths import PathConfig
from data_utils import DataReader, DataProcessor, DataPackager


def preprocess_data(df: pd.DataFrame, dt_processor: DataProcessor, in_idx, out_idx,
                    filter_range, scale_choice):
    """
    Preprocess the input DataFrame.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        dt_processor (DataProcessor): Data processor instance.
        in_idx (str): Cut-off index between input and output params.
        out_idx (str): Selected output parameter idxs to include, or 'all'.
        filter_range (str): Comma-separated filter percentages [0,1] for min/max cases.
        scale_choice (str): Scaling method (norm/log/robust/power/quantile).

    Returns:
        tuple: Tuple containing scaled input and output features.
    """
    param_idx = [(idx, param) for idx, param in enumerate(df.columns)]
    # Keep input columns by dropping everything from the cut-off index onwards,
    # assuming the DOE df is always concatenated first
    X_df = df.drop(df.columns[int(in_idx):], axis=1)
    # Keep output columns
    if out_idx == 'all':
        y_df = df.drop(df.columns[:int(in_idx)], axis=1)
    else:
        # Selected output variables to preserve
        out_idx_list = [int(x) for x in out_idx.split(',')]
        # Raise an exception if any chosen idx falls within the input features
        for idx in out_idx_list:
            if idx < int(in_idx):
                raise ValueError(f'Selected idx = {idx} is not within the output feature idxs: {in_idx} to {param_idx[-1][0]}')
        y_df = df[df.columns[out_idx_list]].copy()
    # Filter cases with min/max feature values
    percentages = [float(x) for x in filter_range.split(',')]
    if len(percentages) < 2:
        raise ValueError('Either min or max filter percentage was not defined')
    X_minmax, y_minmax, X_filtered, y_filtered = dt_processor.filter_minmax(
        [X_df, y_df], bottom=percentages[0], upper=percentages[1])
    # Scale input and output features
    X_scaled = dt_processor.scale_data([X_df.copy(), X_minmax, X_filtered], scaling=scale_choice)
    y_scaled = dt_processor.scale_data([y_df.copy(), y_minmax, y_filtered], scaling=scale_choice)
    # Plot the filtered datapack against the min/max cases
    dt_processor.plot_scaling(X_df, X_scaled[-1], X_scaled[1], data_label='inputs')
    dt_processor.plot_scaling(y_df, y_scaled[-1], y_scaled[1], data_label='outputs')
    return X_scaled, y_scaled
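
# Hypothetical call, assuming a DOE block of 4 input columns concatenated
# first (values are illustrative only; all arguments arrive as strings from
# the input() prompts in main below):
#   X_scaled, y_scaled = preprocess_data(df, dt_processor, in_idx='4',
#                                        out_idx='all', filter_range='0,0',
#                                        scale_choice='norm')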


def process_ini(case_name, df, X_scaled, y_scaled, dt_processor: DataProcessor, dt_packager: DataPackager):
    """
    Process the initial dataset.

    Parameters:
        case_name (str): Case study name.
        df (pd.DataFrame): Input DataFrame.
        X_scaled (pd.DataFrame): Scaled input features.
        y_scaled (pd.DataFrame): Scaled output features.
        dt_processor (DataProcessor): Data processor instance.
        dt_packager (DataPackager): Data packager instance.
    """
    # Case splitting for sampling comparison, setting an initial set to train with and explore: AL vs. random
    if 'sv' in case_name:
        random_ratio = 0.55
        test_ratio = 0.3
    else:
        random_ratio = 0.36
        test_ratio = 0.30
    X_ini, X_random, y_ini, y_random = train_test_split(
        X_scaled[-1], y_scaled[-1], test_size=random_ratio, random_state=2024)
    # Train/test split of the filtered datapack kept as the initial dataset
    X_train, X_test, y_train, y_test = train_test_split(X_ini, y_ini, test_size=test_ratio, random_state=2024)
    print(f'Sizes of ini training: {X_train.shape[0]}; test: {X_test.shape[0]}; random: {X_random.shape[0]}')
    # Recombine the filtered min/max cases into the initial training datapack
    combine_choice = input('Include the filtered cases into training? (y/n): ')
    if combine_choice.lower() == 'y':
        X_train = pd.concat([X_train, X_scaled[1]], axis=0)
        y_train = pd.concat([y_train, y_scaled[1]], axis=0)
    # Expand y_test arrays into separate columns for further regression evaluation
    y_test_exp = dt_packager.expand_targets(y_test)
    pca_choice = input('Carry out dimensionality reduction through PCA? (y/n): ')
    if pca_choice.lower() == 'y':
        # Carry out PCA on the scaled outputs, for training only
        var_ratio = 0.95
        y_train_reduced, pca_info_df = dt_processor.PCA_reduction(y_train, var_ratio, datasample='ini')
        # Package data for training and deploying regression models later on
        data_pack = [df, X_train, y_train_reduced, X_test, y_test_exp, y_train, pca_info_df]
        labels = ['full', 'X_train_i', 'y_train_i_red', 'X_test_i', 'y_test_i', 'y_train_i_raw', 'PCA_info']
    else:
        # Expand y_train columns containing arrays to one column per feature value,
        # for correct handling by the regressor
        y_train_exp = dt_packager.expand_targets(y_train)
        # Package data for training and deploying regression models later on
        data_pack = [df, X_train, y_train_exp, X_test, y_test_exp, y_train]
        labels = ['full', 'X_train_i', 'y_train_i', 'X_test_i', 'y_test_i', 'y_train_i_raw']
    # Package initial data sets
    dt_packager.package_data(data_pack, labels, datasample='ini')
    # Package random data sets
    random_pack = [X_random, y_random]
    random_labels = ['X_random', 'y_random_raw']
    dt_packager.package_data(random_pack, random_labels, datasample='random')
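
# Illustrative sketch only: DataProcessor.PCA_reduction is defined elsewhere
# (data_utils). A minimal stand-in with plain scikit-learn, assuming y_train
# is a numeric DataFrame and var_ratio is the target explained-variance
# fraction, could look like the helper below (not called anywhere):
def _pca_reduction_sketch(y_train: pd.DataFrame, var_ratio: float = 0.95) -> pd.DataFrame:
    from sklearn.decomposition import PCA  # local import: sketch only
    # A float n_components keeps the smallest number of components whose
    # cumulative explained variance reaches var_ratio
    pca = PCA(n_components=var_ratio, svd_solver='full')
    reduced = pca.fit_transform(y_train)
    cols = [f'PC_{i+1}' for i in range(reduced.shape[1])]
    return pd.DataFrame(reduced, index=y_train.index, columns=cols)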


def process_aug(df: pd.DataFrame, X_scaled, y_scaled, dt_packager: DataPackager, datasample: str):
    """
    Process resampled data following decision tree (dt) active sampling.

    Parameters:
        df (pd.DataFrame): Input DataFrame from dt csvs.
        X_scaled (pd.DataFrame): Scaled input features.
        y_scaled (pd.DataFrame): Scaled output features.
        dt_packager (DataPackager): Data packager instance.
        datasample (str): Dataset label (e.g., dt, gsx).
    """
    # Use the filtered datapack as the training set; no further splitting here
    X_train, y_train = X_scaled[-1], y_scaled[-1]
    print(f'Sizes of {datasample} training set: {X_train.shape[0]}')
    # Package data for training and deploying regression models later on
    data_pack = [df, X_train, y_train]
    labels = ['full', f'X_train_{datasample}', f'y_train_{datasample}_raw']
    # Package augmented data sets
    dt_packager.package_data(data_pack, labels, datasample=datasample)


def clean_pkl(path):
    """Delete all .pkl files (e.g., previously saved scalers) in the given directory."""
    file_list = os.listdir(path)
    for file_name in file_list:
        if file_name.endswith('.pkl'):
            file_del = os.path.join(path, file_name)
            os.remove(file_del)
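
# Usage note: main() below calls this on the 'ini' scaler directory, i.e.
#   clean_pkl(os.path.join(PATH.input_savepath, case_name, 'ini'))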


def main():
    case_name = input('Select a study to process raw datasets (sp_(sv)geom): ')
    data_name = input('Select a dataset to process from the study selected above (ini, dt, gsx): ')
    dt_reader = DataReader(case_name, data_name)
    dt_processor = DataProcessor(case_name, data_name)
    dt_packager = DataPackager(case_name, data_name)
    # Global paths configuration
    PATH = PathConfig()
    # Combine csv and DOE label files
    df = dt_reader.combine_data()
    # Drop the SMX_pos, Length (sm) and Height, arc_length, Time (sv) columns
    # not required as input features by the regressor
    columns_to_drop = ['SMX_pos (mm)', 'Length', 'Height', 'arc_length', 'Time']
    columns_to_drop = [col for col in columns_to_drop if col in df.columns]
    df = df.drop(columns_to_drop, axis=1)
    # Divide between input and output params in the combined df
    params = df.columns
    param_idx = [(idx, param) for idx, param in enumerate(params)]
    print(f'The input and output parameters in this case study are: {param_idx}')
    if data_name == 'ini':
        # Clean scaler files left over from previous runs
        scaler_path = os.path.join(PATH.input_savepath, case_name, 'ini')
        clean_pkl(scaler_path)
        in_idx = input('Provide cut-off index between input and output params (first out idx): ')
        # Choose idxs for the output variables
        out_idx = input('Select the output parameters idx to include (separated by ,) or choose \'all\': ')
        # Filter cases with min/max feature values
        filter_range = input('Define filter percentages [0,1] for min/max cases (default: min_filter,max_filter = 0,0): ')
        # Scale input and output features
        scale_choice = input('Select a scaling method (norm/log/robust/power/quantile): ')
        X_scaled, y_scaled = preprocess_data(df, dt_processor, in_idx, out_idx, filter_range, scale_choice)
        process_ini(case_name, df, X_scaled, y_scaled, dt_processor, dt_packager)
    else:
        # Extract the column names saved during the 'ini' pre-processing
        scaler_path = os.path.join(PATH.input_savepath, case_name, 'ini')
        file_names = os.listdir(scaler_path)
        # Filter the columns for X_df from the scaler_X_<col>.pkl file names
        X_columns = [col.split('scaler_X_')[1].split('.')[0] for col in file_names if col.startswith('scaler_X_')]
        regex_pattern = '|'.join(X_columns)
        X_df = df.filter(regex=regex_pattern)
        # Filter the columns for y_df from the scaler_y_<col>.pkl file names
        y_columns = [col.split('scaler_y_')[1].split('.')[0] for col in file_names if col.startswith('scaler_y_')]
        y_df = df[y_columns]
        X_scaled = dt_processor.scale_data([X_df.copy()], scaling=None)
        y_scaled = dt_processor.scale_data([y_df.copy()], scaling=None)
        process_aug(df, X_scaled, y_scaled, dt_packager, data_name)


if __name__ == "__main__":
    main()