In [1]:
######################
#### Dependencies ####
######################
def get_season(month):
    if month in [12,1,2]:
        return 1
    if month in [3,4,5]:
        return 2
    if month in [6,7,8]:
        return 3
    if month in [9,10,11]:
        return 4

def smooth_wind_dir(df):
    df['cos_wind_dir'] = np.cos(2 * np.pi * df['wind_dir'] / 360)
    df['sin_wind_dir'] = np.sin(2 * np.pi * df['wind_dir'] / 360)
    #print('smooth wind direction')
    df.drop(columns=['wind_dir'], inplace=True)
    return df

import os
import datetime
import pandas as pd
import numpy as np

main_dir = '../data/raw/measurements/'

In [8]:
!ls ../data/raw/measurements/

measurement_01-01-18.csv measurement_11-03-19.csv measurement_21-06-19.csv
measurement_01-01-19.csv measurement_11-03-20.csv measurement_21-06-20.csv
measurement_01-01-20.csv measurement_11-04-18.csv measurement_21-07-18.csv
measurement_01-02-18.csv measurement_11-04-19.csv measurement_21-07-19.csv
measurement_01-02-19.csv measurement_11-04-20.csv measurement_21-07-20.csv
measurement_01-02-20.csv measurement_11-05-18.csv measurement_21-08-17.csv
measurement_01-03-18.csv measurement_11-05-19.csv measurement_21-08-18.csv
measurement_01-03-19.csv measurement_11-05-20.csv measurement_21-08-19.csv
measurement_01-03-20.csv measurement_11-06-18.csv measurement_21-08-20.csv
measurement_01-04-18.csv measurement_11-06-19.csv measurement_21-09-17.csv
measurement_01-04-19.csv measurement_11-06-20.csv measurement_21-09-18.csv
measurement_01-04-20.csv measurement_11-07-18.csv measurement_21-09-19.csv
measurement_01-05-18.csv measurement_11-07-19.csv measurement_21-09-20.csv
measurement_

In [2]:
sys_date = '04/11/2020'
sys_year = sys_date[-2:]
sys_date = datetime.datetime.strptime(sys_date,'%d/%m/%Y')
sys_month = str(sys_date.month).zfill(2)
previous_month = str((sys_date - datetime.timedelta(sys_date.day)).month).zfill(2)
(sys_month,previous_month)


('11', '10')

In [3]:
data = pd.DataFrame()
for file_path in os.listdir(main_dir):
    if file_path[-9:] in [y + '-' + sys_year + '.csv' \
                          for y in (sys_month,previous_month)]:
        try:
            data = data.append(pd.read_csv(main_dir + file_path))
        except:
            print('failed : ', file_path)
            
data.sort_values(by=['datetime','mtime'], inplace=True)
data.drop_duplicates(subset = 'datetime', keep = 'last', inplace=True)

In [12]:
data = pd.DataFrame()
for file_path in os.listdir(main_dir):
    if file_path[-9:] in [y + '-' + sys_year + '.csv' \
                          for y in ('11', '10', '9', '8', '7', '6', '5')]:
        try:
            data = data.append(pd.read_csv(main_dir + file_path))
        except:
            print('failed : ', file_path)
            
data.sort_values(by=['datetime','mtime'], inplace=True)
data.drop_duplicates(subset = 'datetime', keep = 'last', inplace=True)

In [15]:
measurement = data.reset_index(drop=True)

# Date format
measurement['datetime'] = pd.to_datetime(measurement['datetime'],format='%Y-%m-%d %H:%M:%S')

# Skip incomplete hours
measurement['Id_hour'] = measurement['datetime'].map(lambda x : str(x)[0:13])
measurement = measurement.merge(measurement.groupby(['Id_hour'])['datetime'].count().reset_index() \
                                .rename(columns={'datetime':'Id_hour_count'}),
                                how='left')

measurement = measurement.loc[measurement['Id_hour_count'] >= 40,].reset_index(drop=True)


# Drop na
measurement = measurement.set_index('datetime') \
              [['speed','temp', 'radiation', 'precip','wind_dir']] \
              .dropna(axis=0, how='all')

# Smooth wind direction 
measurement = smooth_wind_dir(measurement)

# Init output measurement data
measurement_out = pd.DataFrame()
# Speed weighted hourly mean for sin & cos
measurement_out['cos_wind_dir'] = (measurement['cos_wind_dir'] * measurement['speed']).resample('H', label='right').sum() \
                                                   / measurement['speed'].resample('H', label='right').sum()
# Speed weighted hourly mean for sin & cos
measurement_out['sin_wind_dir'] = (measurement['sin_wind_dir'] * measurement['speed']).resample('H', label='right').sum() \
                                                   / measurement['speed'].resample('H', label='right').sum()

# Hourly mean for speed, temperature, radiation and precipitation
for col in ['speed','temp','radiation','precip']:
    measurement_out[col] = measurement[col].resample('1H', label='right').mean()

# Add caterogical features
measurement_out['season'] = measurement_out.index.month.map(get_season) # ordinal not categorical for linear models

measurement_out = measurement_out.reset_index()
# Select columns
measurement_out = measurement_out[['datetime','speed','cos_wind_dir','sin_wind_dir','temp','radiation','precip','season']]

# Build date Index and fill na
Idx_Measurement = pd.DataFrame(pd.date_range(measurement_out.datetime[0],
                                             measurement_out.datetime.iloc[-1],
                                             freq='H'),
                                             columns=['datetime'])

measurement_out = Idx_Measurement.merge(measurement_out,how='left').fillna(method='ffill')

# Save file
measurement_out.to_csv('../data/processed/last_measurement.csv',index=False)

In [17]:
measurement_out

Unnamed: 0,datetime,speed,cos_wind_dir,sin_wind_dir,temp,radiation,precip,season
0,2020-09-30 01:00:00,0.100000,0.110353,0.962968,23.085000,0.000000,0.000,4
1,2020-09-30 02:00:00,0.000000,0.110353,0.962968,22.680000,0.000000,0.000,4
2,2020-09-30 03:00:00,0.000000,0.110353,0.962968,22.223333,0.000000,0.000,4
3,2020-09-30 04:00:00,0.000000,0.110353,0.962968,22.005000,0.000000,0.000,4
4,2020-09-30 05:00:00,0.471667,0.557524,0.774675,22.091667,0.000000,0.000,4
...,...,...,...,...,...,...,...,...
1349,2020-11-25 06:00:00,3.600000,-0.834705,0.525138,17.146667,0.000000,0.000,4
1350,2020-11-25 07:00:00,4.260000,-0.845757,0.517055,17.166667,0.000000,0.000,4
1351,2020-11-25 08:00:00,4.081667,-0.854704,0.504485,16.976667,0.000000,0.000,4
1352,2020-11-25 09:00:00,4.215000,-0.941764,0.309068,17.050000,12.283333,0.000,4
