In [1]:
import numpy as np
import pandas as pd
import datetime

In [2]:
def compress(df, **kwargs):
    """
    Reduces size of dataframe by downcasting numerical columns
    """
    input_size = df.memory_usage(index=True).sum()/ 1024
    print("new dataframe size: ", round(input_size,2), 'kB')

    in_size = df.memory_usage(index=True).sum()
    for type in ["float", "integer"]:
        l_cols = list(df.select_dtypes(include=type))
        for col in l_cols:
            df[col] = pd.to_numeric(df[col], downcast=type)
    out_size = df.memory_usage(index=True).sum()
    ratio = (1 - round(out_size / in_size, 2)) * 100

    print("optimized size by {} %".format(round(ratio,2)))
    print("new dataframe size: ", round(out_size / 1024,2), " kB")

    return df

In [37]:
filepath = r"../raw_data/history_forecast_bulk_20171007_20240312.csv"

df = pd.read_csv(filepath)

In [38]:
df = compress(df)

new dataframe size:  617298.69 kB
optimized size by 46.0 %
new dataframe size:  334370.18  kB


In [39]:
df['forecast dt iso'] = df['forecast dt iso'].str.replace('+0000 UTC', '')
df['slice dt iso'] = df['slice dt iso'].str.replace('+0000 UTC', '')

In [44]:
df  = df[df['forecast dt iso'].str.contains('12:00:00')]

In [49]:
df.columns

Index(['forecast dt unixtime', 'forecast dt iso', 'slice dt unixtime',
       'slice dt iso', 'lat', 'lon', 'temperature', 'dew_point', 'pressure',
       'ground_pressure', 'humidity', 'clouds', 'wind_speed', 'wind_deg',
       'rain', 'snow', 'ice', 'fr_rain', 'convective', 'snow_depth',
       'accumulated', 'hours', 'rate', 'probability'],
      dtype='object')

In [57]:
df['forecast dt unixtime'].unique()

array([1507377600, 1507464000, 1507550400, ..., 1710072000, 1710158400,
       1710244800], dtype=int32)

In [35]:
df['forecast dt iso'] = df['forecast dt iso'][:19]

In [36]:
df['forecast dt iso']

0          2017-10-07 00:00:00 +0000 UTC
1          2017-10-07 00:00:00 +0000 UTC
2          2017-10-07 00:00:00 +0000 UTC
3          2017-10-07 00:00:00 +0000 UTC
4          2017-10-07 00:00:00 +0000 UTC
                       ...              
3292254                              NaN
3292255                              NaN
3292256                              NaN
3292257                              NaN
3292258                              NaN
Name: forecast dt iso, Length: 3292259, dtype: object

In [9]:
datetime.datetime.strptime(df['forecast dt iso'][0], "%Y-%m-%d %H:%M:%S %z %Z")

datetime.datetime(2017, 10, 7, 0, 0, tzinfo=datetime.timezone(datetime.timedelta(0), 'UTC'))

In [None]:
df['forecast dt iso'] = df['forecast dt iso'].apply(lambda row: datetime.datetime.strptime(row, "%Y-%m-%d %H:%M:%S %z %Z"))
# df['slice dt iso'] = df['slice dt iso'].apply(lambda row: datetime.datetime.strptime(row, "%Y-%m-%d %H:%M:%S %z %Z"))

# Data exploration 

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3292259 entries, 0 to 3292258
Data columns (total 24 columns):
 #   Column                Dtype  
---  ------                -----  
 0   forecast dt unixtime  int64  
 1   forecast dt iso       object 
 2   slice dt unixtime     int64  
 3   slice dt iso          object 
 4   lat                   float64
 5   lon                   float64
 6   temperature           float64
 7   dew_point             float64
 8   pressure              float64
 9   ground_pressure       float64
 10  humidity              float64
 11  clouds                float64
 12  wind_speed            float64
 13  wind_deg              float64
 14  rain                  float64
 15  snow                  float64
 16  ice                   float64
 17  fr_rain               float64
 18  convective            float64
 19  snow_depth            float64
 20  accumulated           float64
 21  hours                 float64
 22  rate                  float64
 23  probabi

In [7]:
df = compress(df)

new dataframe size:  617298.69 kB
optimized size by 46.0 %
new dataframe size:  334370.18  kB


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3292259 entries, 0 to 3292258
Data columns (total 24 columns):
 #   Column                Dtype  
---  ------                -----  
 0   forecast dt unixtime  int32  
 1   forecast dt iso       object 
 2   slice dt unixtime     int32  
 3   slice dt iso          object 
 4   lat                   float32
 5   lon                   float32
 6   temperature           float32
 7   dew_point             float32
 8   pressure              float32
 9   ground_pressure       float32
 10  humidity              float32
 11  clouds                float32
 12  wind_speed            float32
 13  wind_deg              float32
 14  rain                  float32
 15  snow                  float32
 16  ice                   float32
 17  fr_rain               float32
 18  convective            float32
 19  snow_depth            float32
 20  accumulated           float32
 21  hours                 float32
 22  rate                  float32
 23  probabi

In [9]:
df.describe()

Unnamed: 0,forecast dt unixtime,slice dt unixtime,lat,lon,temperature,dew_point,pressure,ground_pressure,humidity,clouds,...,rain,snow,ice,fr_rain,convective,snow_depth,accumulated,hours,rate,probability
count,3292259.0,3292259.0,3292259.0,3292259.0,3292259.0,3292259.0,3292259.0,3292259.0,3292259.0,3292259.0,...,3292259.0,3292259.0,3292259.0,3292259.0,3292259.0,3292259.0,3292247.0,3292247.0,3292259.0,3292259.0
mean,1616181000.0,1616830000.0,52.47009,13.39995,11.06815,4.816284,1015.78,1009.95,68.41671,64.07908,...,0.06357615,0.007369398,1.061202e-05,0.000157891,1.083201,0.001513291,0.07135489,1.096222,1.919358e-05,0.1560877
std,56033680.0,56097760.0,0.0,0.0,8.141868,6.181293,9.509791,9.447282,17.65172,38.15989,...,0.3050024,0.09227522,0.002988257,0.01385303,1.825642,0.00998404,0.3179784,1.02429,6.570214e-05,0.2808626
min,1507334000.0,1507334000.0,52.47,13.4,-21.29,-24.67,959.16,951.68,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,1571724000.0,1572415000.0,52.47,13.4,4.65,0.49,1010.09,1004.31,55.9,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
50%,1617905000.0,1618592000.0,52.47,13.4,10.23,4.93,1016.07,1010.28,70.81,81.0,...,0.0,0.0,0.0,0.0,0.333,0.0,0.0,1.0,0.0,0.0
75%,1664086000.0,1664770000.0,52.47,13.4,17.18,9.44,1021.83,1015.98,82.56,100.0,...,0.0,0.0,0.0,0.0,1.333,0.0,0.0,1.0,6e-06,0.19
max,1710266000.0,1711649000.0,52.47,13.4,42.97,22.82,1057.87,1051.42,100.0,100.0,...,45.0,19.062,1.854167,9.0,23.875,0.41,45.0,12.0,0.005028,1.0


In [10]:
df.columns

Index(['forecast dt unixtime', 'forecast dt iso', 'slice dt unixtime',
       'slice dt iso', 'lat', 'lon', 'temperature', 'dew_point', 'pressure',
       'ground_pressure', 'humidity', 'clouds', 'wind_speed', 'wind_deg',
       'rain', 'snow', 'ice', 'fr_rain', 'convective', 'snow_depth',
       'accumulated', 'hours', 'rate', 'probability'],
      dtype='object')

In [11]:
df.isna().sum()

forecast dt unixtime     0
forecast dt iso          0
slice dt unixtime        0
slice dt iso             0
lat                      0
lon                      0
temperature              0
dew_point                0
pressure                 0
ground_pressure          0
humidity                 0
clouds                   0
wind_speed               0
wind_deg                 0
rain                     0
snow                     0
ice                      0
fr_rain                  0
convective               0
snow_depth               0
accumulated             12
hours                   12
rate                     0
probability              0
dtype: int64

In [12]:
df.isnull().sum()

forecast dt unixtime     0
forecast dt iso          0
slice dt unixtime        0
slice dt iso             0
lat                      0
lon                      0
temperature              0
dew_point                0
pressure                 0
ground_pressure          0
humidity                 0
clouds                   0
wind_speed               0
wind_deg                 0
rain                     0
snow                     0
ice                      0
fr_rain                  0
convective               0
snow_depth               0
accumulated             12
hours                   12
rate                     0
probability              0
dtype: int64

In [13]:
df.head()

Unnamed: 0,forecast dt unixtime,forecast dt iso,slice dt unixtime,slice dt iso,lat,lon,temperature,dew_point,pressure,ground_pressure,...,rain,snow,ice,fr_rain,convective,snow_depth,accumulated,hours,rate,probability
0,1507334400,2017-10-07 00:00:00 +0000 UTC,1507334400,2017-10-07 00:00:00 +0000 UTC,52.470001,13.4,9.07,7.75,1015.809998,1010.159973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1507334400,2017-10-07 00:00:00 +0000 UTC,1507338000,2017-10-07 01:00:00 +0000 UTC,52.470001,13.4,9.49,7.62,1015.75,1010.190002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.04
2,1507334400,2017-10-07 00:00:00 +0000 UTC,1507341600,2017-10-07 02:00:00 +0000 UTC,52.470001,13.4,9.64,7.68,1015.799988,1010.289978,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.03
3,1507334400,2017-10-07 00:00:00 +0000 UTC,1507345200,2017-10-07 03:00:00 +0000 UTC,52.470001,13.4,9.63,7.85,1015.909973,1010.400024,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1507334400,2017-10-07 00:00:00 +0000 UTC,1507348800,2017-10-07 04:00:00 +0000 UTC,52.470001,13.4,9.61,8.02,1016.030029,1010.52002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,8e-06,0.0


In [22]:
df.iloc[-32:,0:5]

Unnamed: 0,forecast dt unixtime,forecast dt iso,slice dt unixtime,slice dt iso,lat
3292227,1710266400,2024-03-12 18:00:00 +0000 UTC,1711537200,2024-03-27 11:00:00 +0000 UTC,52.470001
3292228,1710266400,2024-03-12 18:00:00 +0000 UTC,1711540800,2024-03-27 12:00:00 +0000 UTC,52.470001
3292229,1710266400,2024-03-12 18:00:00 +0000 UTC,1711544400,2024-03-27 13:00:00 +0000 UTC,52.470001
3292230,1710266400,2024-03-12 18:00:00 +0000 UTC,1711548000,2024-03-27 14:00:00 +0000 UTC,52.470001
3292231,1710266400,2024-03-12 18:00:00 +0000 UTC,1711551600,2024-03-27 15:00:00 +0000 UTC,52.470001
3292232,1710266400,2024-03-12 18:00:00 +0000 UTC,1711555200,2024-03-27 16:00:00 +0000 UTC,52.470001
3292233,1710266400,2024-03-12 18:00:00 +0000 UTC,1711558800,2024-03-27 17:00:00 +0000 UTC,52.470001
3292234,1710266400,2024-03-12 18:00:00 +0000 UTC,1711562400,2024-03-27 18:00:00 +0000 UTC,52.470001
3292235,1710266400,2024-03-12 18:00:00 +0000 UTC,1711566000,2024-03-27 19:00:00 +0000 UTC,52.470001
3292236,1710266400,2024-03-12 18:00:00 +0000 UTC,1711569600,2024-03-27 20:00:00 +0000 UTC,52.470001


# Convert the DataFrame for 1 Forecast prediction per day

In [126]:
time_features =['forecast dt unixtime', 'forecast dt iso', 'slice dt unixtime',
       'slice dt iso',]

df_time = df[time_features][:5060]

In [132]:
df_time['forecast dt iso'][0]

Timestamp('2017-10-07 00:00:00+0000', tz='UTC')

In [99]:
df_time['forecast dt iso'] = df_time['forecast dt iso'].apply(lambda row: datetime.datetime.strptime(row, "%Y-%m-%d %H:%M:%S %z %Z"))
df_time['slice dt iso'] = df_time['slice dt iso'].apply(lambda row: datetime.datetime.strptime(row, "%Y-%m-%d %H:%M:%S %z %Z"))

In [123]:
df_time['forecast dt iso'] = df_time['forecast dt iso'].apply(lambda col: datetime.datetime.strptime(col, "%Y-%m-%d %H:%M:%S %z %Z"), axis=0)

TypeError: <lambda>() got an unexpected keyword argument 'axis'

In [122]:
df.apply(lambda col: col.max(), axis = 0)

In [118]:
x = np.array([1507334400, 1507341600])
x = np.datetime64('1507334400')
x

numpy.datetime64('1507334400')

In [100]:
df_time.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5060 entries, 0 to 5059
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   forecast dt unixtime  5060 non-null   int32              
 1   forecast dt iso       5060 non-null   datetime64[ns, UTC]
 2   slice dt unixtime     5060 non-null   int32              
 3   slice dt iso          5060 non-null   datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](2), int32(2)
memory usage: 118.7 KB


In [56]:
df_time['forecast dt iso'].nunique()

20

In [124]:
df_time.filter(like='12:00:00 +0000 UTC', axis = 0)

Unnamed: 0,forecast dt unixtime,forecast dt iso,slice dt unixtime,slice dt iso


In [57]:
df_time['forecast dt iso']

forecast dt iso
2017-10-07 00:00:00 +0000 UTC    253
2017-10-07 06:00:00 +0000 UTC    253
2017-10-11 12:00:00 +0000 UTC    253
2017-10-11 06:00:00 +0000 UTC    253
2017-10-11 00:00:00 +0000 UTC    253
2017-10-10 18:00:00 +0000 UTC    253
2017-10-10 12:00:00 +0000 UTC    253
2017-10-10 06:00:00 +0000 UTC    253
2017-10-10 00:00:00 +0000 UTC    253
2017-10-09 18:00:00 +0000 UTC    253
2017-10-09 12:00:00 +0000 UTC    253
2017-10-09 06:00:00 +0000 UTC    253
2017-10-09 00:00:00 +0000 UTC    253
2017-10-08 18:00:00 +0000 UTC    253
2017-10-08 12:00:00 +0000 UTC    253
2017-10-08 06:00:00 +0000 UTC    253
2017-10-08 00:00:00 +0000 UTC    253
2017-10-07 18:00:00 +0000 UTC    253
2017-10-07 12:00:00 +0000 UTC    253
2017-10-11 18:00:00 +0000 UTC    253
Name: count, dtype: int64

In [58]:
df_time['slice dt iso'].value_counts()

slice dt iso
2017-10-14 21:00:00 +0000 UTC    20
2017-10-12 00:00:00 +0000 UTC    20
2017-10-13 08:00:00 +0000 UTC    20
2017-10-13 09:00:00 +0000 UTC    20
2017-10-13 10:00:00 +0000 UTC    20
                                 ..
2017-10-07 05:00:00 +0000 UTC     1
2017-10-07 04:00:00 +0000 UTC     1
2017-10-07 03:00:00 +0000 UTC     1
2017-10-07 02:00:00 +0000 UTC     1
2017-10-27 18:00:00 +0000 UTC     1
Name: count, Length: 379, dtype: int64

In [77]:
df_one_day = df_time[df_time['forecast dt iso'] == '2017-10-11 12:00:00 +0000 UTC'].reset_index()
df_one_day['slice dt iso'][12:36]

12    2017-10-12 00:00:00 +0000 UTC
13    2017-10-12 01:00:00 +0000 UTC
14    2017-10-12 02:00:00 +0000 UTC
15    2017-10-12 03:00:00 +0000 UTC
16    2017-10-12 04:00:00 +0000 UTC
17    2017-10-12 05:00:00 +0000 UTC
18    2017-10-12 06:00:00 +0000 UTC
19    2017-10-12 07:00:00 +0000 UTC
20    2017-10-12 08:00:00 +0000 UTC
21    2017-10-12 09:00:00 +0000 UTC
22    2017-10-12 10:00:00 +0000 UTC
23    2017-10-12 11:00:00 +0000 UTC
24    2017-10-12 12:00:00 +0000 UTC
25    2017-10-12 13:00:00 +0000 UTC
26    2017-10-12 14:00:00 +0000 UTC
27    2017-10-12 15:00:00 +0000 UTC
28    2017-10-12 16:00:00 +0000 UTC
29    2017-10-12 17:00:00 +0000 UTC
30    2017-10-12 18:00:00 +0000 UTC
31    2017-10-12 19:00:00 +0000 UTC
32    2017-10-12 20:00:00 +0000 UTC
33    2017-10-12 21:00:00 +0000 UTC
34    2017-10-12 22:00:00 +0000 UTC
35    2017-10-12 23:00:00 +0000 UTC
Name: slice dt iso, dtype: object

In [78]:
df_one_day['forecast dt iso']

'2017-10-11 12:00:00 +0000 UTC'

In [108]:
df_one_day

Unnamed: 0,index,forecast dt unixtime,forecast dt iso,slice dt unixtime,slice dt iso
0,4554,1507723200,2017-10-11 12:00:00 +0000 UTC,1507723200,2017-10-11 12:00:00 +0000 UTC
1,4555,1507723200,2017-10-11 12:00:00 +0000 UTC,1507726800,2017-10-11 13:00:00 +0000 UTC
2,4556,1507723200,2017-10-11 12:00:00 +0000 UTC,1507730400,2017-10-11 14:00:00 +0000 UTC
3,4557,1507723200,2017-10-11 12:00:00 +0000 UTC,1507734000,2017-10-11 15:00:00 +0000 UTC
4,4558,1507723200,2017-10-11 12:00:00 +0000 UTC,1507737600,2017-10-11 16:00:00 +0000 UTC
...,...,...,...,...,...
248,4802,1507723200,2017-10-11 12:00:00 +0000 UTC,1508932800,2017-10-25 12:00:00 +0000 UTC
249,4803,1507723200,2017-10-11 12:00:00 +0000 UTC,1508976000,2017-10-26 00:00:00 +0000 UTC
250,4804,1507723200,2017-10-11 12:00:00 +0000 UTC,1509019200,2017-10-26 12:00:00 +0000 UTC
251,4805,1507723200,2017-10-11 12:00:00 +0000 UTC,1509062400,2017-10-27 00:00:00 +0000 UTC


In [109]:
np.('1507723200')

numpy.datetime64('1507723200')

In [79]:
pd.to_datetime(['2017-10-11 12:00:00 +0000 UTC', '2017-10-11 12:00:00 +0000 UTC'],
               utc=True)

  pd.to_datetime(['2017-10-11 12:00:00 +0000 UTC', '2017-10-11 12:00:00 +0000 UTC'],


DateParseError: Unknown datetime string format, unable to parse: 2017-10-11 12:00:00 +0000 UTC, at position 0

In [90]:
datetime.datetime.strptime('2017-10-11 12:00:00 +0000 UTC', '%Y-%m-%d %H:%M:%S %z %Z')

datetime.datetime(2017, 10, 11, 12, 0, tzinfo=datetime.timezone(datetime.timedelta(0), 'UTC'))

In [85]:
df_one_day.head()

Unnamed: 0,index,forecast dt unixtime,forecast dt iso,slice dt unixtime,slice dt iso
0,4554,1507723200,2017-10-11 12:00:00 +0000 UTC,1507723200,2017-10-11 12:00:00 +0000 UTC
1,4555,1507723200,2017-10-11 12:00:00 +0000 UTC,1507726800,2017-10-11 13:00:00 +0000 UTC
2,4556,1507723200,2017-10-11 12:00:00 +0000 UTC,1507730400,2017-10-11 14:00:00 +0000 UTC
3,4557,1507723200,2017-10-11 12:00:00 +0000 UTC,1507734000,2017-10-11 15:00:00 +0000 UTC
4,4558,1507723200,2017-10-11 12:00:00 +0000 UTC,1507737600,2017-10-11 16:00:00 +0000 UTC


In [87]:
df_one_day['forecast dt iso'][0]

'2017-10-11 12:00:00 +0000 UTC'

In [92]:
df_one_day['forecast dt iso'].apply(lambda row: datetime.datetime.strptime(row, "%Y-%m-%d %H:%M:%S %z %Z"))

0     2017-10-11 12:00:00+00:00
1     2017-10-11 12:00:00+00:00
2     2017-10-11 12:00:00+00:00
3     2017-10-11 12:00:00+00:00
4     2017-10-11 12:00:00+00:00
                 ...           
248   2017-10-11 12:00:00+00:00
249   2017-10-11 12:00:00+00:00
250   2017-10-11 12:00:00+00:00
251   2017-10-11 12:00:00+00:00
252   2017-10-11 12:00:00+00:00
Name: forecast dt iso, Length: 253, dtype: datetime64[ns, UTC]