In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Load the semicolon-separated sensor log and order it chronologically.
timestamp_col = 'timestamps_UTC'
data = pd.read_csv('../assets/trains/102.csv', sep=';')
data[timestamp_col] = pd.to_datetime(data[timestamp_col])
data = data.sort_values(by=timestamp_col)

# Drop rows whose gap to the previous sample is smaller than 1 s or bigger than 30 min.
# The gap is computed once on the sorted frame so both bounds are judged against the
# same neighbour, and both bounds are inclusive (a gap of exactly 1 s or 30 min is kept).
min_gap = pd.Timedelta(seconds=1)
max_gap = pd.Timedelta(minutes=30)
gap = data[timestamp_col].diff()
keep = gap.between(min_gap, max_gap)
# The first row has no predecessor, so its gap is NaT and every comparison on it is
# False; keep it explicitly instead of losing it silently.
keep.iloc[:1] = True
data = data[keep]

AirTemp = data['RS_E_InAirTemp_PC2']
Timestamps = data[timestamp_col]
limit = len(data)  # number of rows left after the gap filter

# Air temperature indexed by timestamp (covers every remaining row).
ts = pd.Series(AirTemp.values, index=Timestamps)

# Min-max normalise to [0, 1]. A constant series has a zero range; it is mapped to
# all zeros rather than divided by zero.
ts_min = ts.min()
ts_range = ts.max() - ts_min
ts = (ts - ts_min) / ts_range if ts_range > 0 else ts - ts_min

In [None]:
temp_col = 'RS_E_InAirTemp_PC2'
time_col = 'timestamps_UTC'


def plot_series(series, title=None, figsize=(20, 10)):
    """Draw `series` (values against its index) on a new figure and show it.

    Parameters
    ----------
    series : pandas.Series
        Data to draw; the index is used for the x axis.
    title : str, optional
        Figure title; omitted when None.
    figsize : tuple of (width, height)
        Figure size in inches, forwarded to matplotlib.
    """
    fig, ax = plt.subplots(figsize=figsize)
    ax.plot(series)
    if title is not None:
        ax.set_title(title)
    plt.show()


# Plot the normalised series built in the loading cell
plot_series(ts, title=temp_col + ' (normalised)')

# Count the number of 0s in 'RS_E_InAirTemp_PC2'
is_zero_temp = data[temp_col] == 0
print("Number of 0s in 'RS_E_InAirTemp_PC2':", int(is_zero_temp.sum()))

# Remove data when 'RS_E_InAirTemp_PC2' is 0
data_clean = data[~is_zero_temp]
ts_clean = pd.Series(data_clean[temp_col].values, index=data_clean[time_col])
print("Size of the data after removing 0s in 'RS_E_InAirTemp_PC2':", len(ts_clean))

plot_series(ts_clean, title=temp_col + ' without zero readings')

# Keep only rows whose speed lies in [1, 100] km/h.
# Rows with a missing speed fail both comparisons and are dropped as well.
print("Number of rows where speed smaller than 1km/h:", int((data_clean['speed'] < 1).sum()))
data_clean = data_clean[data_clean['speed'] >= 1]
print("Number of rows where speed bigger than 100km/h:", int((data_clean['speed'] > 100).sum()))
data_clean = data_clean[data_clean['speed'] <= 100]

print(len(data_clean))
# Remove data when consecutive timestamps delta is bigger than 30min.
# The first row has no predecessor (its delta is NaT, which compares False), so it is
# kept explicitly; a delta of exactly 30 min is not "bigger" and is kept too.
gap = data_clean[time_col].diff()
keep = gap <= pd.Timedelta(minutes=30)
keep.iloc[:1] = True
data_clean = data_clean[keep]
print(len(data_clean))

ts_clean = pd.Series(data_clean[temp_col].values, index=data_clean[time_col])
print("Size of the data after removing rows where speed smaller than 1km/h and bigger than 100km/h:", len(ts_clean))

plot_series(ts_clean, title=temp_col + ' after speed and gap filtering')


# Plot lines for each index, showcase real data portions
# timestamps = data_clean['timestamps_UTC']
# for i in tqdm(timestamps):
#     plt.axvline(x=i, color='r')
# plt.plot(ts_clean)
# plt.show()

In [None]:
# Plot the series currently bound to `ts`.
# NOTE: `ts` is rebound further down in this cell, so re-running this cell on its own
# draws the cleaned series here instead of the one from the earlier cells.
plt.figure(figsize=(20,5))
plt.plot(ts)
plt.show()

# Work on a positionally indexed frame (0..n-1) so that row labels can be used
# directly as slice bounds. reset_index returns a new frame; data_clean is untouched.
data_sample = data_clean.reset_index(drop=True)

# Gap between each sample and the one before it
diff = data_sample['timestamps_UTC'].diff()

# A new segment starts at every row that follows a gap bigger than 10min
segment_gap = pd.Timedelta(minutes=10)
segment_starts = diff[diff > segment_gap].index.tolist()

# Segments are half-open positional ranges [left, right): each segment ends where the
# next one starts, and the last one ends at len(data_sample). Using the next start as
# the exclusive bound keeps the final sample of every segment in its slice.
idx_left = [0] + segment_starts
idx_right = segment_starts + [len(data_sample)]

# Cleaned air temperature indexed by timestamp
ts = pd.Series(data_sample['RS_E_InAirTemp_PC2'].values, index=data_sample['timestamps_UTC'])

plt.figure(figsize=(20,5))
plt.plot(ts)
plt.show()

# One line per segment; separate plot calls leave the gaps between segments undrawn.
plt.figure(figsize=(20,5))

for start, stop in zip(idx_left, idx_right):
    # iloc slices by position and excludes `stop`
    plt.plot(ts.iloc[start:stop], label=str((start, stop)))

plt.savefig('segments.png', dpi=300)
plt.show()