In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Load the semicolon-separated CSV.
data = pd.read_csv('../assets/trains/102.csv', sep=';')

# Parse the timestamp column and put the rows in chronological order.
timestamp_col = 'timestamps_UTC'
data[timestamp_col] = pd.to_datetime(data[timestamp_col])
data = data.sort_values(by=timestamp_col)

# Keep only rows that follow their predecessor by strictly less than 30 min.
# The first row has no predecessor, so its diff is NaT; NaT compares False
# against any Timedelta and the row would be dropped silently. It is therefore
# kept explicitly, provided its own timestamp is valid.
gap_to_previous = data[timestamp_col].diff()
keep_row = gap_to_previous < pd.Timedelta(minutes=30)
keep_row.iloc[:1] = data[timestamp_col].iloc[:1].notna().to_numpy()
data = data[keep_row]

# Keep only rows that follow their predecessor by strictly more than 1 s.
# The gap is recomputed on the already-filtered frame, and the first row is
# preserved for the same reason as above.
gap_to_previous = data[timestamp_col].diff()
keep_row = gap_to_previous > pd.Timedelta(seconds=1)
keep_row.iloc[:1] = data[timestamp_col].iloc[:1].notna().to_numpy()
data = data[keep_row]

# Pull the timestamp column and the two RS_E_InAirTemp columns out of the frame.
Timestamps = data['timestamps_UTC']
AirTemp_PC1 = data['RS_E_InAirTemp_PC1']
AirTemp_PC2 = data['RS_E_InAirTemp_PC2']
limit = len(data)


def _min_max_series(column, index, size):
    """Index `column` by `index`, keep the first `size` points, rescale to [0, 1]."""
    series = pd.Series(column.values, index=index).iloc[:size]
    lowest = series.min()
    highest = series.max()
    return (series - lowest) / (highest - lowest)


# Timestamp-indexed, min-max normalized version of each channel.
ts_pc1 = _min_max_series(AirTemp_PC1, Timestamps, limit)
ts_pc2 = _min_max_series(AirTemp_PC2, Timestamps, limit)

In [None]:
# Draw both normalized channels on a single wide figure.
plt.figure(figsize=(20, 10))
for _label, _series in (('PC1', ts_pc1), ('PC2', ts_pc2)):
    plt.plot(_series, label=_label)
plt.legend()
plt.show()

# Flag the rows whose reading is exactly 0 in each RS_E_InAirTemp column
# and report how many there are per column.
pc1_is_zero = data['RS_E_InAirTemp_PC1'] == 0
pc2_is_zero = data['RS_E_InAirTemp_PC2'] == 0
print("Number of 0s in 'RS_E_InAirTemp_PC1':", int(pc1_is_zero.sum()))
print("Number of 0s in 'RS_E_InAirTemp_PC2':", int(pc2_is_zero.sum()))

# Keep only the rows where neither column is 0; `data` itself is left untouched.
data_clean = data.loc[~(pc1_is_zero | pc2_is_zero)].copy()

# Rebuild the timestamp-indexed series from the cleaned frame.
clean_timestamps = data_clean['timestamps_UTC']
ts_clean_pc1 = pd.Series(data_clean['RS_E_InAirTemp_PC1'].values, index=clean_timestamps)
ts_clean_pc2 = pd.Series(data_clean['RS_E_InAirTemp_PC2'].values, index=clean_timestamps)

print("Size of the data after removing 0s in 'RS_E_InAirTemp_PC1/2':", len(ts_clean_pc1))

# Show the two cleaned channels together.
plt.figure(figsize=(20,10))
for _label, _series in (('PC1', ts_clean_pc1), ('PC2', ts_clean_pc2)):
    plt.plot(_series, label=_label)
plt.legend()
plt.show()

# Filter on the 'speed' column: keep rows with 1 <= speed <= 100.
# The printed labels treat the column as km/h.
# NOTE(review): the unit cannot be verified from this notebook - confirm.
# NOTE(review): the original note here planned to drop rows whose consecutive
# locations are less than 1 m apart (and to watch for near-identical
# timestamps); only the speed bounds below are actually applied.
speed = data_clean['speed']
print("Number of rows where speed smaller than 1km/h:", int((speed < 1).sum()))
print("Number of rows where speed bigger than 100km/h:", int((speed > 100).sum()))
# between() is inclusive on both ends; rows with a missing speed are dropped,
# exactly as the separate `>= 1` and `<= 100` filters would do.
data_clean = data_clean[speed.between(1, 100)]

print(len(data_clean))
# Removing rows may have opened new gaps, so keep only rows that follow their
# predecessor by strictly less than 30 min. The first row has no predecessor
# (its diff is NaT, which compares False against any Timedelta), so it is kept
# explicitly as long as its own timestamp is valid instead of being dropped.
gap_to_previous = data_clean['timestamps_UTC'].diff()
keep_row = gap_to_previous < pd.Timedelta(minutes=30)
keep_row.iloc[:1] = data_clean['timestamps_UTC'].iloc[:1].notna().to_numpy()
data_clean = data_clean[keep_row]
print(len(data_clean))

# Rebuild the timestamp-indexed series from the filtered frame.
ts_clean_pc1 = pd.Series(data_clean['RS_E_InAirTemp_PC1'].values, index=data_clean['timestamps_UTC'])
ts_clean_pc2 = pd.Series(data_clean['RS_E_InAirTemp_PC2'].values, index=data_clean['timestamps_UTC'])

print("Size of the data after removing rows where speed smaller than 1km/h and bigger than 100km/h:", len(ts_clean_pc1))

# Plot both channels after the speed and gap filtering.
plt.figure(figsize=(20,10))
plt.plot(ts_clean_pc1, label='PC1')
plt.plot(ts_clean_pc2, label='PC2')
plt.legend()
plt.show()

In [None]:
# Point-by-point difference between the cleaned 'RS_E_InAirTemp_PC1' and
# 'RS_E_InAirTemp_PC2' series. No normalization is applied to the difference.
ts_diff = ts_clean_pc1 - ts_clean_pc2

print(len(ts_diff))

# Zoom in on the points at positions 1000..2999 of the difference series.
plt.figure(figsize=(20,10))
plt.plot(ts_diff.iloc[1000:3000], label='PC1 - PC2')
plt.legend()
plt.show()

# Keep only the points whose absolute difference is strictly below the
# threshold (same units as the temperature columns, presumably °C - confirm).
threshold = 3
ts_diff = ts_diff[ts_diff.abs() < threshold]

print(len(ts_diff))

# Plot the remaining differences over the whole time range.
plt.figure(figsize=(20,10))
plt.plot(ts_diff, label='PC1 - PC2')
plt.legend()
plt.show()
