In [1]:
######################
#### Dependencies ####
######################
def comma_to_float(x):
    """Convert a decimal-comma string (e.g. ``'12,5'``) to a float.

    Parameters
    ----------
    x : str or number
        Value to convert. Strings have ``','`` replaced by ``'.'`` before
        parsing. Values that are already numeric are passed through
        ``float`` unchanged instead of being discarded.

    Returns
    -------
    float
        The parsed value, or ``np.nan`` when ``x`` cannot be converted
        (unparseable text, ``None``, unsupported type).
    """
    try:
        if isinstance(x, str):
            return float(x.replace(',', '.'))
        # Already-numeric input (e.g. a column parsed by read_csv) must not be lost
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures map to NaN; anything else should surface
        return np.nan

def get_season(month):
    """Return the season code (1 to 4) for a month number.

    Months 12, 1, 2 map to 1; 3, 4, 5 to 2; 6, 7, 8 to 3; 9, 10, 11 to 4.
    Any other value yields ``None``.
    """
    months_by_season = (
        (12, 1, 2),
        (3, 4, 5),
        (6, 7, 8),
        (9, 10, 11),
    )
    for season_code, months in enumerate(months_by_season, start=1):
        if month in months:
            return season_code
    return None

import os
import datetime
import pandas as pd
import numpy as np

In [33]:
#main_dir = '//SA-MODAT-MTO-PR/AKABAR/'
main_dir = '../Sa-modat-mpo-pr/AKABAR/'
files_path = ['SD202103.SWD', 'SD202104.SWD', 'SD202105.SWD', 'SD202106.SWD', 'SD202107.SWD']

# Raw column headers -> short names used by the rest of the notebook
column_names = {'AKABAR': 'datetime',
                'Solar Rad (wat/m2)': 'radiation',
                'Temperature (*C)' : 'temp',
                'Rainfall (mm)' : 'precip',
                'Wind Dir (Deg)' : 'wind_dir',
                'Wind Speed (km/h)' : 'speed'}

# Read every file into a list and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copies the accumulated frame on each iteration.
data_parts = []
for file_path in files_path:
    data_part = pd.read_csv(main_dir + file_path, low_memory=False,
                            delimiter='\t', quotechar='"', decimal=',')
    # drop 2 rows (contains unit)
    data_part = data_part.loc[2:, :]

    data_part = data_part.rename(columns=column_names)
    data_parts.append(data_part)

# ignore_index=True yields the same fresh 0..n-1 index as reset_index(drop=True)
data = pd.concat(data_parts, ignore_index=True)

# Parse the decimal-comma text columns into floats, then keep only the
# columns used downstream (in this order)
numeric_columns = ['wind_dir', 'speed', 'temp', 'precip', 'radiation']
kept_columns = ['datetime', 'speed', 'wind_dir', 'temp', 'radiation', 'precip']

parsed_columns = {name: data[name].map(comma_to_float) for name in numeric_columns}
data = data.assign(**parsed_columns)[kept_columns]

# Work on a freshly indexed copy so `data` keeps the raw timestamp strings
measurement = data.reset_index(drop=True)

# Timestamps are completed with ':00' (seconds) before parsing
timestamp_text = measurement['datetime'] + ':00'
measurement['datetime'] = pd.to_datetime(timestamp_text, format='%Y-%m-%d %H:%M:%S')

# Skip incomplete hours: keep a row only when its clock hour
# ('YYYY-MM-DD HH') holds at least 40 timestamped rows
hour_key = measurement['datetime'].map(lambda stamp: str(stamp)[0:13])
rows_in_hour = measurement.groupby(hour_key)['datetime'].transform('count')
complete_hours = measurement.loc[rows_in_hour >= 40].reset_index(drop=True)

# Index by timestamp, keep the measured variables, and drop rows where
# every one of them is missing
value_columns = ['speed', 'temp', 'radiation', 'precip', 'wind_dir']
measurement = complete_hours.set_index('datetime')[value_columns]
measurement = measurement.dropna(axis=0, how='all')

# Smooth wind direction: replace the angle by its cosine / sine components
wind_angle = 2 * np.pi * measurement['wind_dir'] / 360
measurement['cos_wind_dir'] = np.cos(wind_angle)
measurement['sin_wind_dir'] = np.sin(wind_angle)

# Init output measurement data
measurement_out = pd.DataFrame()

# Hourly sum of wind speed: the common denominator of both weighted means
hourly_speed_sum = measurement['speed'].resample('H', label='right').sum()

# Speed weighted hourly mean of each component (cos first, then sin)
for component in ('cos_wind_dir', 'sin_wind_dir'):
    speed_weighted = measurement[component] * measurement['speed']
    measurement_out[component] = speed_weighted.resample('H', label='right').sum() \
                                 / hourly_speed_sum


# Plain hourly mean for speed, temperature, radiation and precipitation
hourly_mean_columns = ['speed', 'temp', 'radiation', 'precip']
for col in hourly_mean_columns:
    as_float = measurement[col].map(float)
    measurement_out[col] = as_float.resample('1H', label='right').mean()

# Add categorical features
# (season is encoded as an ordinal 1..4, not categorical, for linear models)
month_of_hour = measurement_out.index.month
measurement_out['season'] = month_of_hour.map(get_season)

# Turn the datetime index back into a column and fix the column order
output_columns = ['datetime', 'speed', 'cos_wind_dir', 'sin_wind_dir',
                  'temp', 'radiation', 'precip', 'season']
measurement_out = measurement_out.reset_index()[output_columns]

# Build a gap-free hourly date index spanning the first to the last
# aggregated timestamp (positional access, independent of index labels)
first_hour = measurement_out['datetime'].iloc[0]
last_hour = measurement_out['datetime'].iloc[-1]
Idx_Measurement = pd.DataFrame(pd.date_range(first_hour, last_hour, freq='H'),
                               columns=['datetime'])

# Left-join the hourly values onto the full index, then forward-fill the gaps.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated
# since pandas 2.1.
measurement_out = Idx_Measurement.merge(measurement_out, on='datetime', how='left').ffill()

In [34]:
# Save file (the row index is not written)
output_csv = '../data/processed/last_measurement_akabar.csv'
measurement_out.to_csv(output_csv, index=False)

# Preview the first rows of what was saved
measurement_out.head(5)

Unnamed: 0,datetime,speed,cos_wind_dir,sin_wind_dir,temp,radiation,precip,season
0,2021-03-16 14:00:00,10.466667,0.362045,-0.66705,19.8,989.016667,0.0,2
1,2021-03-16 15:00:00,14.216667,0.483021,-0.831387,20.376667,959.983333,0.0,2
2,2021-03-16 16:00:00,14.366667,0.432837,-0.822992,20.033333,863.5,0.0,2
3,2021-03-16 17:00:00,16.183333,0.460274,-0.771549,19.495,700.116667,0.0,2
4,2021-03-16 18:00:00,10.216667,0.42592,-0.86041,19.88,489.983333,0.0,2
