## Data preparation of the rail data
Väylävirasto has given the rail temperature data for 11 stations in Finland from September 2019 to May 2023. The forecast data is from MOS Archive.



In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import time


In [3]:
#station data from Väylävirasto
df = pd.read_csv(r'/home/daniel/projects/rails/data/RailTemperatures_with_stations_2023v2.csv', encoding = 'Latin-1',sep = ',')

In [4]:
df.dtypes

station        object
station_id      int64
lat           float64
lon           float64
X               int64
Y               int64
Timestamp      object
TAir          float64
TRail         float64
dtype: object

In [5]:
# Weather forecast data from Mos archive data and calculating the hourly values
dfmos = pd.read_csv(r'/home/daniel/projects/rails/data/mos_archive_data_hourly_data_for_all_rail_stations.csv', sep = ',')
dfmos

Unnamed: 0,station_id,analysis_time,forecast_time,forecast_period,analysis_date,MSL,T2,D2,U10,V10,...,cosmonth,sinmonth,coshour,sinhour,lat,lon,hourly_SRR,hourly_STR,hourly_SLHF,hourly_SSHF
0,10,0,2019-09-07 01:00:00,1,2019-09-07,101743.2,284.1,283.3,2.5,0.4,...,0.0,-1.0,0.97,0.26,62.397832,30.027159,0.000000e+00,-141475.500000,-60470.500000,67784.400000
1,10,0,2019-09-07 02:00:00,2,2019-09-07,101818.8,283.6,282.8,2.3,0.3,...,0.0,-1.0,0.87,0.50,62.397832,30.027159,0.000000e+00,-256928.000000,-32600.200000,50426.400000
2,10,0,2019-09-07 03:00:00,3,2019-09-07,101899.5,283.4,282.7,2.2,0.8,...,0.0,-1.0,0.71,0.71,62.397832,30.027159,0.000000e+00,-271362.100000,-28831.000000,35452.300000
3,10,0,2019-09-07 04:00:00,4,2019-09-07,101967.3,283.5,282.8,2.5,0.7,...,0.0,-1.0,0.50,0.87,62.397832,30.027159,6.882370e+04,-293050.300000,-46265.100000,42631.400000
4,10,0,2019-09-07 05:00:00,5,2019-09-07,102039.0,284.3,282.9,2.2,0.5,...,0.0,-1.0,0.26,0.97,62.397832,30.027159,3.580963e+05,-321952.400000,-145720.200000,1171.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3705609,110,12,2023-06-12 12:00:00,240,2023-06-02 12:00:00,102111.0,294.4,281.2,2.6,2.8,...,-1.0,0.0,-1.00,0.00,66.141150,24.921643,1.726848e+06,-301524.666667,-482150.666667,-642968.666667
3705610,110,12,2023-06-12 18:00:00,222,2023-06-03 12:00:00,102022.2,289.2,274.7,1.5,-2.5,...,-1.0,0.0,0.00,-1.00,66.141150,24.921643,1.179019e+06,-335230.000000,-442266.000000,-209314.000000
3705611,110,12,2023-06-13,228,2023-06-03 12:00:00,102400.0,279.8,273.6,-0.9,-2.2,...,-1.0,0.0,1.00,0.00,66.141150,24.921643,9.457600e+04,-325212.666667,-81877.333333,177692.666667
3705612,110,12,2023-06-13 06:00:00,234,2023-06-03 12:00:00,102434.4,286.9,277.2,0.5,0.7,...,-1.0,0.0,0.00,1.00,66.141150,24.921643,6.480613e+05,-336188.000000,-138500.000000,-154909.333333


In [6]:
dfmos.dtypes

station_id           int64
analysis_time        int64
forecast_time       object
forecast_period      int64
analysis_date       object
MSL                float64
T2                 float64
D2                 float64
U10                float64
V10                float64
LCC                float64
MCC                float64
SKT                float64
MX2T               float64
MN2T               float64
T_925              float64
T2_ENSMEAN_MA1     float64
SRR                float64
STR                float64
SLHF               float64
SSHF               float64
cosmonth           float64
sinmonth           float64
coshour            float64
sinhour            float64
lat                float64
lon                float64
hourly_SRR         float64
hourly_STR         float64
hourly_SLHF        float64
hourly_SSHF        float64
dtype: object

In [7]:
dfmos.columns

Index(['station_id', 'analysis_time', 'forecast_time', 'forecast_period',
       'analysis_date', 'MSL', 'T2', 'D2', 'U10', 'V10', 'LCC', 'MCC', 'SKT',
       'MX2T', 'MN2T', 'T_925', 'T2_ENSMEAN_MA1', 'SRR', 'STR', 'SLHF', 'SSHF',
       'cosmonth', 'sinmonth', 'coshour', 'sinhour', 'lat', 'lon',
       'hourly_SRR', 'hourly_STR', 'hourly_SLHF', 'hourly_SSHF'],
      dtype='object')

In [8]:
(min(df.Timestamp), max(df.Timestamp))

('2019-09-17 12:00:00', '2023-06-01 23:00:00')

In [9]:
(min(dfmos.forecast_time), max(dfmos.forecast_time))

('2019-09-07 01:00:00', '2023-06-14')

In [10]:
dfmos.describe

<bound method NDFrame.describe of          station_id  analysis_time        forecast_time  forecast_period  \
0                10              0  2019-09-07 01:00:00                1   
1                10              0  2019-09-07 02:00:00                2   
2                10              0  2019-09-07 03:00:00                3   
3                10              0  2019-09-07 04:00:00                4   
4                10              0  2019-09-07 05:00:00                5   
...             ...            ...                  ...              ...   
3705609         110             12  2023-06-12 12:00:00              240   
3705610         110             12  2023-06-12 18:00:00              222   
3705611         110             12           2023-06-13              228   
3705612         110             12  2023-06-13 06:00:00              234   
3705613         110             12  2023-06-13 12:00:00              240   

               analysis_date       MSL     T2     D2 

In [11]:
dfmos.isnull().sum()

station_id               0
analysis_time            0
forecast_time            0
forecast_period          0
analysis_date            0
MSL                      0
T2                    1364
D2                    1364
U10                   4092
V10                   4092
LCC                      0
MCC                      0
SKT                      0
MX2T                 38192
MN2T                 38192
T_925                35464
T2_ENSMEAN_MA1     1793924
SRR                      0
STR                      1
SLHF                     2
SSHF                     0
cosmonth                 0
sinmonth                 0
coshour                  0
sinhour                  0
lat                      0
lon                      0
hourly_SRR               0
hourly_STR               2
hourly_SLHF              4
hourly_SSHF              0
dtype: int64

In [12]:
cols = ['station_id', 'analysis_time', 'forecast_time', 'forecast_period',
       'analysis_date', 'MSL', 'T2', 'D2', 'U10', 'V10', 'LCC', 'MCC', 'SKT',
       'cosmonth', 'sinmonth', 'coshour', 'sinhour', 'lat', 'lon',
       'hourly_SRR', 'hourly_STR', 'hourly_SLHF', 'hourly_SSHF']

In [13]:
dfmos = dfmos[cols]

In [14]:
dfmosrail= dfmos.merge(df,left_on = ['forecast_time','lat', 'lon'], right_on = ['Timestamp','lat', 'lon'])
dfmosrail.shape

(419918, 30)

In [15]:
dfmosrail.columns

Index(['station_id_x', 'analysis_time', 'forecast_time', 'forecast_period',
       'analysis_date', 'MSL', 'T2', 'D2', 'U10', 'V10', 'LCC', 'MCC', 'SKT',
       'cosmonth', 'sinmonth', 'coshour', 'sinhour', 'lat', 'lon',
       'hourly_SRR', 'hourly_STR', 'hourly_SLHF', 'hourly_SSHF', 'station',
       'station_id_y', 'X', 'Y', 'Timestamp', 'TAir', 'TRail'],
      dtype='object')

In [16]:
cols = ['lat', 'lon', 'analysis_date','analysis_time', 'forecast_time', 'forecast_period',
        'MSL', 'T2', 'D2', 'U10', 'V10', 'LCC', 'MCC', 'SKT',
       'cosmonth', 'sinmonth', 'coshour', 'sinhour',
       'hourly_SRR', 'hourly_STR', 'hourly_SLHF', 'hourly_SSHF', 'X', 'Y', 'Timestamp', 'TAir', 'TRail']

In [17]:
dfmosrail = dfmosrail[cols]
dfmosrail.shape

(419918, 27)

In [18]:
cols = dfmosrail.select_dtypes(include=[np.float64]).columns
cols

Index(['lat', 'lon', 'MSL', 'T2', 'D2', 'U10', 'V10', 'LCC', 'MCC', 'SKT',
       'cosmonth', 'sinmonth', 'coshour', 'sinhour', 'hourly_SRR',
       'hourly_STR', 'hourly_SLHF', 'hourly_SSHF', 'TAir', 'TRail'],
      dtype='object')

In [19]:
dfmosrail[cols] = dfmosrail[cols].astype(np.float32)
dfmosrail.dtypes

lat                float32
lon                float32
analysis_date       object
analysis_time        int64
forecast_time       object
forecast_period      int64
MSL                float32
T2                 float32
D2                 float32
U10                float32
V10                float32
LCC                float32
MCC                float32
SKT                float32
cosmonth           float32
sinmonth           float32
coshour            float32
sinhour            float32
hourly_SRR         float32
hourly_STR         float32
hourly_SLHF        float32
hourly_SSHF        float32
X                    int64
Y                    int64
Timestamp           object
TAir               float32
TRail              float32
dtype: object

In [20]:
cols = dfmosrail.select_dtypes(include=[np.int64]).columns
cols

Index(['analysis_time', 'forecast_period', 'X', 'Y'], dtype='object')

In [21]:
dfmosrail[cols] = dfmosrail[cols].astype(np.int32)

In [22]:
dfmosrail.dtypes

lat                float32
lon                float32
analysis_date       object
analysis_time        int32
forecast_time       object
forecast_period      int32
MSL                float32
T2                 float32
D2                 float32
U10                float32
V10                float32
LCC                float32
MCC                float32
SKT                float32
cosmonth           float32
sinmonth           float32
coshour            float32
sinhour            float32
hourly_SRR         float32
hourly_STR         float32
hourly_SLHF        float32
hourly_SSHF        float32
X                    int32
Y                    int32
Timestamp           object
TAir               float32
TRail              float32
dtype: object

In [None]:
# save the data 
dfmosrail.to_csv('/home/daniel/projects/rails/data/rail_temperatures_mos_data.csv', sep = ',', index = False)