In [6]:
import sys
sys.path.append('../')
import os
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from datetime import timedelta
import pickle
import datetime

from utils import utils_scenario as utils, data_preparation as prep, data_process as proc


### Dependencies

In [2]:
def comma_to_float(x):
    """Parse a decimal-comma string (e.g. ``'3,14'``) into a float.

    Parameters
    ----------
    x : str
        Text whose decimal separator is a comma.

    Returns
    -------
    float
        The parsed value, or ``np.nan`` when ``x`` has no usable
        ``.replace`` method (e.g. ``None`` or a number) or when the
        resulting text does not spell a number.
    """
    try:
        return float(x.replace(',', '.'))
    # Only parsing failures are swallowed: a bare ``except`` would also trap
    # KeyboardInterrupt / SystemExit and make a long-running cell unstoppable.
    except (AttributeError, TypeError, ValueError):
        return np.nan
    
def get_season(month):
    """Map a month number to an ordinal season code.

    1 = December-February, 2 = March-May, 3 = June-August,
    4 = September-November. Any other value yields ``None``.
    """
    months_by_season = ((12, 1, 2), (3, 4, 5), (6, 7, 8), (9, 10, 11))
    for season_code, season_months in enumerate(months_by_season, start=1):
        if month in season_months:
            return season_code
    return None

def get_am(hour):
    """Return 1 for a morning hour (0 through 11) and 0 for anything else."""
    return 1 if hour in range(12) else 0
    
def DAY_format(DAY):
    """Turn a ``'DD-MM-YY'`` day string into ``'20YY-MM-DD'``.

    The pieces are taken by position: characters 0-1 are the day,
    3-4 the month and the last two the year.
    """
    day_part = DAY[0:2]
    month_part = DAY[3:5]
    year_part = '20' + DAY[-2:]
    return '-'.join([year_part, month_part, day_part])


### Load one file

In [3]:
def get_one_measurement(file_path,DAY):
    """Load one raw measurement file and keep only the rows of one day.

    Parameters
    ----------
    file_path : str
        File name, appended to the module-level ``main_dir``.
    DAY : str
        Day to keep, in the ``'DD-MM-YY'`` form understood by ``DAY_format``.

    Returns
    -------
    pandas.DataFrame
        Columns ``datetime, speed, wind_dir, temp, radiation, precip`` with a
        fresh 0..n-1 index. ``datetime`` is shifted by +1 hour; ``speed``,
        ``wind_dir``, ``temp`` and ``precip`` are parsed with
        ``comma_to_float``; ``radiation`` only has its ``'#-INF'`` markers
        replaced by 0.
    """
    data = pd.read_csv(main_dir + file_path, low_memory=False,
                       delimiter='\t', quotechar='"', decimal=',').dropna()
    # drop first row (contains unit)
    data = data.loc[1:, :]
    # rename columns
    data = data.rename(columns={'Unnamed: 0' : 'datetime',
                                'Speed@1m': 'speed',
                                'Dir': 'wind_dir',
                                'AirTemp' : 'temp',
                                "Rad'n" : 'radiation',
                                'Rain@1m' : 'precip'})
    # convert date from gmt to gmt+1
    data['datetime'] = pd.to_datetime(data['datetime'], format='%d/%m/%Y %H:%M:%S') + datetime.timedelta(hours=1)

    # files can contain also old data so we filter on day to avoid duplicates and slow processing
    on_requested_day = data['datetime'].dt.strftime('%Y-%m-%d') == DAY_format(DAY)
    # Explicit copy: the column assignments below must write to an independent
    # frame, not to a slice of the unfiltered data (SettingWithCopyWarning).
    data = data.loc[on_requested_day].copy()

    # convert str to float
    for col in ['wind_dir', 'speed', 'temp', 'precip']:
        data[col] = data[col].map(comma_to_float)

    # Cynthia: wind direction smoothing (prep.smooth_wind_dir) is not applied
    # here; it is done later, on the concatenated data.

    # replace #-INF by 0
    data.loc[data['radiation'] == '#-INF', 'radiation'] = 0
    # select columns
    data = data[['datetime', 'speed', 'wind_dir', 'temp', 'radiation', 'precip']]
    return data.reset_index(drop=True)

### Init path and day

In [7]:
# Folder holding the raw measurement files, and folder for processed outputs
main_dir = './data/raw/measurement/'
save_dir = './data/processed/'

### System date is meant to be supplied by a batch file
### Data is collected every hour for the system date
# This data seems to be in gmt not gmt+1
sys_date = '01/07/2020'
_date_parts = sys_date.split('/')
sys_day = _date_parts[0]
sys_month = _date_parts[1]
# keep only the last two digits of the year
sys_year = _date_parts[2][-2:]
# day tag in DD-MM-YY form
DAY = '-'.join((sys_day, sys_month, sys_year))
print(DAY)

01-07-20


### Load one day of measurement

In [9]:
def get_one_day_measurement(DAY):
    """Gather every raw file of one day into a single de-duplicated frame.

    Parameters
    ----------
    DAY : str
        Day tag in ``'DD-MM-YY'`` form; a file belongs to the day when
        characters 4 to 11 of its name equal this tag.

    Returns
    -------
    pandas.DataFrame
        Rows returned by ``get_one_measurement`` for all matching files in
        ``main_dir``, exact duplicate rows removed and index reset. An empty
        frame when no file matches.
    """
    # Sorted listing: os.listdir order is arbitrary, sorting keeps the row
    # order reproducible from one run to the next.
    daily_frames = [
        get_one_measurement(file_path, DAY)
        for file_path in sorted(os.listdir(main_dir))
        if file_path[4:12] == DAY
    ]
    # pd.concat refuses an empty list; keep returning an empty frame instead.
    if not daily_frames:
        return pd.DataFrame()
    # One concat at the end replaces DataFrame.append inside the loop, which
    # was removed in pandas 2.0 and re-copied the frame at every iteration.
    return pd.concat(daily_frames).drop_duplicates().reset_index(drop=True)

# Quick check: parse the first file listed in main_dir and keep a five-row preview
available_files = os.listdir(main_dir)
file_path = available_files[0]
df = get_one_measurement(file_path, DAY).head()

### Concatenate 15 days data
- Save one file per day in ./data/raw/daily_measurement/

In [22]:
# Write one CSV per day of July 2020, built from the raw measurement files.
# NOTE(review): range(1, 15) covers days 01..14, i.e. 14 days, while the
# section title says 15 days — confirm which one is intended.
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs('./data/raw/daily_measurement/', exist_ok=True)
for i in range(1,15):
    DAY = str(i).zfill(2) + '-07-20'
    data = get_one_day_measurement(DAY)
    # A day without any file gives an empty frame; writing it would leave a
    # column-less CSV in the folder that the processing step reads in full.
    if data.empty:
        continue
    data.to_csv('./data/raw/daily_measurement/measurement_' + DAY + '.csv',index=False)

### Process measurement
- Load all files in ./data/raw/daily_measurement/
- Aggregate by hour (weighted by speed for cos & sin)

In [23]:
# Load all files available
# NOTE: this rebinds the module-level main_dir that get_one_measurement and
# get_one_day_measurement read; re-run the "Init path and day" cell before
# calling them again.
main_dir = './data/raw/daily_measurement/'
# Read every daily file, then concatenate once: DataFrame.append was removed
# in pandas 2.0 and re-copied the accumulated frame at each iteration.
# Sorted listing keeps the row order reproducible (os.listdir order is arbitrary).
daily_frames = []
for file_path in sorted(os.listdir(main_dir)):
    daily_frames.append(pd.read_csv(main_dir + file_path))
measurement = pd.concat(daily_frames).reset_index(drop=True)

# Date format
measurement['datetime'] = pd.to_datetime(measurement['datetime'],format='%Y-%m-%d %H:%M:%S')

# Drop rows where every measured value is missing
measurement = measurement.set_index('datetime') \
              [['speed','temp', 'radiation', 'precip','wind_dir']] \
              .dropna(axis=0, how='all')
# Cynthia: smooth wind direction
# (the lines below read the cos_wind_dir / sin_wind_dir columns from its result)
measurement = prep.smooth_wind_dir(measurement)

# Init output measurement data
measurement_out = pd.DataFrame()
# Hourly sum of speed: the shared denominator of both weighted means.
# Hours whose speed sum is 0 (or that hold no rows) give 0/0, i.e. NaN.
hourly_speed_sum = measurement['speed'].resample('H', label='right').sum()
# Speed weighted hourly mean for cos
measurement_out['cos_wind_dir'] = (measurement['cos_wind_dir'] * measurement['speed']).resample('H', label='right').sum() \
                                                   / hourly_speed_sum
# Speed weighted hourly mean for sin
measurement_out['sin_wind_dir'] = (measurement['sin_wind_dir'] * measurement['speed']).resample('H', label='right').sum() \
                                                   / hourly_speed_sum

# Hourly mean for speed, temperature, radiation and precipitation
for col in ['speed','temp','radiation','precip']:
    measurement_out[col] = measurement[col].resample('1H', label='right').mean()

# add categorical features
measurement_out['season'] = measurement_out.index.month.map(get_season) # ordinal not categorical for linear models
measurement_out['am'] = measurement_out.index.hour.map(get_am)

measurement_out = measurement_out.reset_index()
# label='right' already stamps each hourly bin with its end time (data between
# 1am and 2am is labelled 2am), so the extra shift below stays disabled:
#measurement_out['datetime'] = measurement_out['datetime'] + datetime.timedelta(hours=1)

# Select columns
measurement_out = measurement_out[['datetime','speed','cos_wind_dir','sin_wind_dir','temp','radiation','precip','season']] #'am' feature is not currently in the model
measurement_out.head()

Unnamed: 0,datetime,speed,cos_wind_dir,sin_wind_dir,temp,radiation,precip,season
0,2020-07-01 01:00:00,3.523333,0.419934,0.878093,24.59,0.0,0.0,3
1,2020-07-01 02:00:00,3.715,0.451142,0.845504,24.34,0.0,0.0,3
2,2020-07-01 03:00:00,2.5,0.758071,0.584625,23.993333,0.0,0.0,3
3,2020-07-01 04:00:00,3.043333,0.760389,0.600481,23.605,0.0,0.0,3
4,2020-07-01 05:00:00,4.423333,0.595837,0.763091,23.58,0.0,0.0,3


## Cynthia: check if any column is missing 

In [49]:
# To (re)create the reference column list, run once:
# #save columns into a pickle file 
# measurement_cols = measurement_out.columns
# import pickle
# with open('measurement_cols.pkl', 'wb') as f:
#     pickle.dump(measurement_cols, f)

# Read the pickled reference columns. The context manager closes the file
# handle, which the previous bare open() call left open.
# NOTE: pickle.load can execute arbitrary code from the file — only load a
# measurement_cols.pkl that was produced by this project.
with open('measurement_cols.pkl', 'rb') as cols_file:
    measurement_cols = pickle.load(cols_file)
# Reference columns that are absent from measurement_out
print('checking missing columns are: ', (measurement_cols).difference(measurement_out.columns))


checking missing columns are:  Index([], dtype='object')


### Save file
- last_measurement in ./data/processed/

In [47]:
# Persist the hourly measurement table, without the index column
processed_path = './data/processed/last_measurement.csv'
measurement_out.to_csv(processed_path, index=False)

### Notes
- Should we add 1 hour to datetime because of the hourly mean?
- cos and sin are computed before the hourly mean here; should they be computed after it instead?