-
Notifications
You must be signed in to change notification settings - Fork 0
/
files.py
342 lines (287 loc) · 11.8 KB
/
files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
"""
DYCO Dynamic Lag Compensation
Copyright (C) 2020-2024 Lukas Hörtnagl
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
import datetime as dt
import numpy as np
import pandas as pd
def read_segment_lagtimes_file(filepath):
    """
    Read segment covariances and lag search results for each segment.

    Can be used for all comma-separated text files with one header row
    for which the pd.read_csv arguments below are valid; the first
    column becomes the index.

    Parameters
    ----------
    filepath: str or Path
        Full path to the file.

    Returns
    -------
    pandas DataFrame with the file contents, indexed by the first column.
    """
    # Dates are NOT parsed here; the index is kept as read from the file.
    found_lags_df = pd.read_csv(filepath,
                                skiprows=None,
                                header=0,
                                encoding='utf-8',
                                delimiter=',',
                                parse_dates=False,
                                index_col=0,
                                dtype=None,
                                engine='c')
    return found_lags_df
def read_raw_data(filepath, data_timestamp_format):
    """
    Read one raw data file into a DataFrame.

    Compares the number of data columns against the number of header
    columns and generates placeholder header names if the data rows are
    wider than the header (can happen due to irregularities during raw
    data collection).

    Parameters
    ----------
    filepath: Path
        Full path to raw data file.
    data_timestamp_format: str
        Datetime format of the timestamp in each data row in the raw
        data file. If empty/None, no timestamp index is parsed.

    Returns
    -------
    pandas DataFrame that contains raw data from the file in filepath
    """
    header_rows = [0]
    skip_rows = []
    header_section_rows = [0]

    # Compare the width of the first data row with the width of the header
    num_data_cols = length_data_cols(filepath=filepath,
                                     header_rows_list=header_rows,
                                     skip_rows_list=skip_rows)
    num_header_cols, header_cols_df = length_header_cols(filepath=filepath,
                                                         header_rows_list=header_rows,
                                                         skip_rows_list=skip_rows)
    more_data_cols, num_missing_cols = data_vs_header(num_data_cols=num_data_cols,
                                                      num_header_cols=num_header_cols)

    # Fill the header with generated names where data rows are wider
    header_cols_list = generate_missing_cols(
        header_cols_df=header_cols_df,
        more_data_cols_than_header_cols=more_data_cols,
        num_missing_header_cols=num_missing_cols)

    # Parse dates and use the first column as index only when a
    # timestamp format was given
    parse_dates = bool(data_timestamp_format)
    index_col = 0 if data_timestamp_format else None

    data_df = pd.read_csv(filepath,
                          skiprows=header_section_rows,
                          header=None,
                          names=header_cols_list,
                          na_values=-9999,
                          encoding='utf-8',
                          delimiter=',',
                          parse_dates=parse_dates,
                          date_format=data_timestamp_format,
                          index_col=index_col,
                          dtype=None,
                          engine='c',
                          nrows=None)
    return data_df
def calc_true_resolution(num_records, data_nominal_res, expected_records, expected_duration,
                         tolerance=0.001):
    """
    Calculate the true resolution of the raw data.

    If the file is essentially complete (found vs. expected records agree
    within *tolerance*), the true resolution is derived from the file
    duration; otherwise the nominal resolution is used as fallback.

    Parameters
    ----------
    num_records: int
        Number of raw data records.
    data_nominal_res: float
        Nominal time resolution of the raw data, e.g. 0.05 for 20 Hz data
        (one measurement every 0.05 seconds)
    expected_records: int or float
        Expected number of records in the raw data file, based on the nominal
        time resolution of the raw data.
    expected_duration: int
        Expected duration of the raw data file in seconds.
    tolerance: float
        Maximum allowed relative deviation of num_records from
        expected_records for the file to count as complete.
        Default 0.001 reproduces the previous hard-coded 0.999..1.001 window.

    Returns
    -------
    float that gives the true resolution in seconds, e.g. 0.05s for 20 Hz (1/20 = 0.05)
    """
    ratio = num_records / expected_records
    if (1 - tolerance) < ratio < (1 + tolerance):
        # File complete: derive resolution from actual record count
        true_resolution = np.float64(expected_duration / num_records)
    else:
        # File incomplete: fall back to nominal resolution
        true_resolution = data_nominal_res
    return true_resolution
def insert_timestamp(df, file_info_row, num_records, data_nominal_res, expected_records, expected_duration):
    """
    Calculate the timestamp for each row record if not available.

    In case data already have a timestamp, it will be overwritten.

    Parameters
    ----------
    df: pd.DataFrame
        Raw data without timestamp; receives a 'TIMESTAMP' index in place.
    file_info_row: pandas Series
        Contains info about the current raw data file; 'start' gives the
        file start datetime.
    num_records: int
        Number of records in raw data file.
    data_nominal_res: float
        Nominal time resolution of the raw data, e.g. 0.05 (one measurement
        every 0.05 seconds, 20 Hz).
    expected_records: int or float
        Number of expected records in raw data file, e.g. a 30-minute file
        of 20 Hz data should contain 36000 records (20 * 60 * 30).
    expected_duration: int or float
        Expected duration of the raw data file in seconds, e.g. 1800 for a
        30-minute file.

    Returns
    -------
    df: pandas DataFrame with timestamp index
    true_resolution: time resolution of raw data measurements
    """
    true_resolution = calc_true_resolution(num_records=num_records,
                                           data_nominal_res=data_nominal_res,
                                           expected_records=expected_records,
                                           expected_duration=expected_duration)

    # Offset of each record from the file start, derived from the integer
    # row index and the per-record resolution in seconds
    record_offsets = pd.to_timedelta(df.index * true_resolution, unit='s')
    file_start = pd.to_datetime(file_info_row['start'])
    df['TIMESTAMP'] = file_start + record_offsets
    df.set_index('TIMESTAMP', inplace=True)
    return df, true_resolution
def add_data_stats(df, true_resolution, filename, files_overview_df, found_records, fnm_date_format):
    """
    Collect additional info about a raw data file.

    Parameters
    ----------
    df: pandas DataFrame
        Raw data from the file, with a datetime-like index.
    true_resolution: float
        Time resolution of the raw data records in seconds, e.g. 0.05 for 20 Hz data.
    filename: str
        Filename of the raw data file, without extension.
    files_overview_df: pandas DataFrame
        Overview of all raw data files, with stats; updated in place.
    found_records: int
        Number of records in the raw data file.
    fnm_date_format: str
        Datetime format of the datetime info in the raw data filename.

    Returns
    -------
    pandas DataFrame with additional info for current raw data file
    """
    # Overall frequency follows from record count and true resolution
    data_duration = found_records * true_resolution
    data_freq = np.float64(found_records / data_duration)

    # The datetime encoded in the filename identifies this file's row
    idx = dt.datetime.strptime(filename, fnm_date_format)

    stats = {
        'first_record': df.index[0],
        'last_record': df.index[-1],
        'file_duration': (df.index[-1] - df.index[0]).total_seconds(),
        'found_records': found_records,
        'data_freq': data_freq,
    }
    for col, value in stats.items():
        files_overview_df.loc[idx, col] = value
    return files_overview_df
def generate_missing_cols(header_cols_df, more_data_cols_than_header_cols, num_missing_header_cols):
    """
    Insert additional column names in the data header.

    Additional columns are created if the number of data columns
    does not match the number of header columns.

    Parameters
    ----------
    header_cols_df: pandas DataFrame
        A small DataFrame that only contains the header columns.
    more_data_cols_than_header_cols: bool
        True if more data columns than header columns were found in
        the data file. This can happen due to irregularities during
        raw data collection.
    num_missing_header_cols: int
        Number of missing header columns in comparison to data columns.

    Returns
    -------
    list of header columns that contains labels for additionally created columns
    """
    header_cols_list = header_cols_df.columns.to_list()
    if more_data_cols_than_header_cols:
        # Label generated columns 'unknown_1', 'unknown_2', ...
        header_cols_list += [f'unknown_{m}'
                             for m in range(1, num_missing_header_cols + 1)]
    return header_cols_list
def length_data_cols(filepath, header_rows_list, skip_rows_list):
    """
    Check number of columns of the first data row after the header part.

    Parameters
    ----------
    filepath: Path
        Path to raw data file
    header_rows_list: list
        List of integers that give the row positions of the header lines
    skip_rows_list: list
        List of skipped rows

    Returns
    -------
    Number of data columns
    """
    # Skip past header and skipped rows, then read exactly one data row
    rows_to_skip = len(header_rows_list) + len(skip_rows_list)
    one_data_row = pd.read_csv(filepath,
                               header=None,
                               skiprows=rows_to_skip,
                               nrows=1)
    return one_data_row.columns.size
def length_header_cols(filepath, header_rows_list, skip_rows_list):
    """
    Check number of columns of the header part.

    Parameters
    ----------
    filepath: Path
        Path to raw data file
    header_rows_list: list
        List of integers that give the row positions of the header lines
    skip_rows_list: list
        List of skipped rows

    Returns
    -------
    Number of header columns and a pandas DataFrame that contains only the header columns
    """
    # nrows=0 reads the header section only, no data rows
    header_only_df = pd.read_csv(filepath,
                                 header=header_rows_list,
                                 skiprows=skip_rows_list,
                                 nrows=0)
    return header_only_df.columns.size, header_only_df
def data_vs_header(num_data_cols, num_header_cols):
    """
    Check if there are more data columns than header columns.

    Parameters
    ----------
    num_data_cols: int
        Number of data columns
    num_header_cols: int
        Number of header columns

    Returns
    -------
    more_data_cols_than_header_cols: bool
        True if number of data columns > number of header columns
    num_missing_header_cols: int
        Number of missing header columns compared to number of data columns
    """
    surplus_data_cols = num_data_cols - num_header_cols
    if surplus_data_cols > 0:
        return True, surplus_data_cols
    return False, 0