In [1]:
######################
#### Dependencies ####
######################
import os
import datetime
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
from utils.util_functions import *
from subprocess import Popen, PIPE

In [5]:
# Base location of the raw exports (UNC-style network path).
main_dir = '//SA-MODAT-MTO-PR/Data-Safi/'
#main_dir = '../Sa-modat-mpo-pr/Data-Safi/'

# Run the helper batch script from ../utils/ and capture what it prints.
# Its stdout is used below as the path, relative to main_dir, of the file to load
# (presumably the most recent export, judging by the script name -- TODO confirm).
p = Popen("last_file_gp2.bat", shell=True, stdout=PIPE,cwd='../utils/')
# Only stdout is piped, so the second element returned here is always None.
stdout, stderr = p.communicate()
file_path = stdout.decode('utf-8').rstrip()

# Fail early with an explicit message: with an empty path, read_csv would be
# pointed at the bare directory and raise a much less helpful error.
if not file_path:
    raise RuntimeError(
        "last_file_gp2.bat printed no file path (exit code {}); "
        "cannot locate the measurement file under {}".format(p.returncode, main_dir))

# Tab-separated file with double-quoted fields and decimal commas.
data = pd.read_csv(main_dir + file_path,low_memory=False,
               delimiter='\t',quotechar='"',decimal=',')

In [6]:
# Preview both ends of the raw table: first five rows via display(),
# last five rows as the cell's own output.
display(data.head(n=5))
data.tail(n=5)

Unnamed: 0.1,Unnamed: 0,Power,RH,AirTemp,Rad'n,Speed@1m,Dir,Rain@1m,Pressure
0,,V,%,deg C,W.m-2,m.s-1,deg,mm,hPa
1,23/09/2021 00:00:00,110,#+INF,#+INF,0,10,931,00,10138
2,23/09/2021 00:01:00,110,#+INF,#+INF,0,11,929,00,10138
3,23/09/2021 00:02:00,110,#+INF,#+INF,0,11,1112,00,10150
4,23/09/2021 00:03:00,110,#+INF,#+INF,0,13,1092,00,10137


Unnamed: 0.1,Unnamed: 0,Power,RH,AirTemp,Rad'n,Speed@1m,Dir,Rain@1m,Pressure
40518,21/10/2021 03:17:00,110,#+INF,-190,0,19,89,0,10133
40519,21/10/2021 03:18:00,110,#+INF,#-INF,0,17,790,0,10134
40520,21/10/2021 03:19:00,111,#+INF,#-INF,0,19,504,0,10135
40521,21/10/2021 03:20:00,110,#+INF,#-INF,0,20,723,0,10134
40522,21/10/2021 03:21:00,110,#+INF,-199,0,19,647,0,10133


In [7]:
# Map the raw headers to short names; the @1m and @5m variants of the
# speed and rain columns share the same target name.
column_aliases = {
    'Unnamed: 0': 'datetime',
    'Speed@1m': 'speed',
    'Speed@5m': 'speed',
    'Rain@1m': 'precip',
    'Rain@5m': 'precip',
    'Dir': 'wind_dir',
    'AirTemp': 'temp',
    "Rad'n": 'radiation',
}
data = data.rename(columns=column_aliases)

# Convert these columns element-wise with the project helper comma_to_float
for column in ('wind_dir', 'speed', 'temp', 'precip'):
    data[column] = data[column].map(comma_to_float)

# Radiation cells holding the '#-INF' / '#+INF' markers are set to 0
inf_markers = ['#-INF', '#+INF']
data.loc[data['radiation'].isin(inf_markers), 'radiation'] = 0

# Keep only the columns used by the rest of the pipeline
kept_columns = ['datetime','speed','wind_dir', 'temp', 'radiation', 'precip']
data = data[kept_columns]

# Work on a copy with a fresh 0..n-1 index
measurement = data.reset_index(drop=True)

# Parse the day-first timestamp strings into datetimes
measurement['datetime'] = pd.to_datetime(measurement['datetime'],format='%d/%m/%Y %H:%M:%S')

# Skip incomplete hours: a row is kept only if its clock hour contains at
# least MIN_RECORDS_PER_HOUR timestamped rows. The hour key and the per-hour
# counts are computed with vectorised pandas operations (no per-row Python
# lambda, no self-merge).
MIN_RECORDS_PER_HOUR = 40
hour_key = measurement['datetime'].dt.strftime('%Y-%m-%d %H')
records_per_hour = hour_key.map(hour_key.value_counts())
# Rows without a valid timestamp never belong to a complete hour.
is_complete_hour = measurement['datetime'].notna() & (records_per_hour >= MIN_RECORDS_PER_HOUR)
measurement = measurement.loc[is_complete_hour].reset_index(drop=True)

# Index by time, keep the measured variables only, and drop rows in which
# every one of them is missing
measurement = measurement.set_index('datetime') \
              [['speed','temp', 'radiation', 'precip','wind_dir']] \
              .dropna(axis=0, how='all')

# Wind direction is given in degrees: convert it to an angle in radians and
# split it into cosine / sine components so it can be averaged.
wind_angle = 2 * np.pi * measurement['wind_dir'] / 360
measurement['cos_wind_dir'] = np.cos(wind_angle)
measurement['sin_wind_dir'] = np.sin(wind_angle)

# Hourly output table; each hourly bin is labelled with its right edge
measurement_out = pd.DataFrame()

# Speed-weighted hourly mean of each direction component:
# sum(component * speed) / sum(speed) within the hour
hourly_speed_total = measurement['speed'].resample('H', label='right').sum()
for component in ('cos_wind_dir', 'sin_wind_dir'):
    weighted_component = measurement[component] * measurement['speed']
    measurement_out[component] = weighted_component.resample('H', label='right').sum() \
                                 / hourly_speed_total

# Plain hourly mean for speed, temperature, radiation and precipitation
for variable in ('speed','temp','radiation','precip'):
    hourly_mean = measurement[variable].map(float).resample('1H', label='right').mean()
    measurement_out[variable] = hourly_mean

# Add the season feature derived from the month of each hourly label
# (ordinal rather than categorical, for linear models)
measurement_out['season'] = measurement_out.index.month.map(get_season)

# Turn the time index back into a column and fix the column order
measurement_out = measurement_out.reset_index()
output_columns = ['datetime','speed','cos_wind_dir','sin_wind_dir','temp','radiation','precip','season']
measurement_out = measurement_out[output_columns]

# Build a gap-free hourly timeline from the first to the last aggregated hour
Idx_Measurement = pd.DataFrame(pd.date_range(measurement_out.datetime.iloc[0],
                                             measurement_out.datetime.iloc[-1],
                                             freq='H'),
                                             columns=['datetime'])

# Align the aggregates on that timeline and forward-fill missing values.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated in
# recent pandas releases; the result is the same.
measurement_out = Idx_Measurement.merge(measurement_out, how='left', on='datetime').ffill()

# Save file
# NOTE(review): the date in the file name is hard-coded -- confirm it matches
# the export being processed before re-running.
measurement_out.to_csv('../data/processed/last_measurement_21102021.csv',index=False)