### Dependencies

In [6]:
import os 
import pandas as pd
import numpy as np
import datetime
import pickle
from utils import data_preparation as prep

### Init path

In [2]:
# Root directory that holds one sub-directory per forecast run.
main_dir = './data/raw/forecast/'
# os.listdir returns entries in arbitrary order and also lists stray files
# (e.g. hidden OS files). Keep directories only, sorted by name, so that
# sub_dir[0] and the concatenation order are reproducible across machines.
sub_dir = sorted(
    entry for entry in os.listdir(main_dir)
    if os.path.isdir(os.path.join(main_dir, entry))
)

### Process One file

In [3]:
def get_one_forecast(DAY,HOUR):
    """Load one forecast run and return it as a single DataFrame.

    The run is read from ``main_dir/<DAY>_<HOUR>/``. Every entry of that
    directory whose name is exactly 10 characters long is treated as a
    forecasted day and must contain a ``;``-separated ``meteo.txt`` file.

    Parameters
    ----------
    DAY : str
        Day part of the run directory name; underscores are replaced by
        dashes to build ``p_date``.
    HOUR : str
        Hour part of the run directory name, used as the hour of ``p_date``.

    Returns
    -------
    pandas.DataFrame
        Columns ``p_date`` (time the forecast was issued), ``f_date``
        (time being forecast), ``speed``, ``temp``, ``rad``, ``precip``,
        ``cos_wind_dir`` and ``sin_wind_dir``. Both dates are strings
        formatted as ``<date> <HH>:00:00``. The index is the per-file row
        index, so it repeats from one forecasted day to the next.

    Raises
    ------
    FileNotFoundError
        If the run directory contains no 10-character day directory.
    """
    forecast_dir_path = os.path.join(main_dir, DAY + '_' + HOUR)

    # One frame per forecasted day, concatenated once at the end
    # (DataFrame.append was removed in pandas 2.0 and is quadratic in a loop).
    daily_frames = []
    # Hour offset of the current day relative to the first forecasted day
    period_root = 0
    # os.listdir order is arbitrary: sort so the day offsets and the row order
    # are deterministic (and chronological for zero-padded date names).
    for dir_day in sorted(os.listdir(forecast_dir_path)):
        # Only 10-character names are day directories
        if len(dir_day) != 10:
            continue
        data_per_day = pd.read_csv(
            os.path.join(forecast_dir_path, dir_day, 'meteo.txt'),
            delimiter=";",
        )
        data_per_day['date'] = dir_day.replace('_','-')
        data_per_day['f_period'] = period_root + data_per_day['heure']
        daily_frames.append(data_per_day)
        # Next day starts 24 hours later
        period_root += 24

    if not daily_frames:
        raise FileNotFoundError(
            'No forecast day directory found in ' + forecast_dir_path
        )

    data = pd.concat(daily_frames)

    # Forecast date: "<date> <HH>:00:00" with a zero-padded hour
    data['f_date'] = (
        data['date'] + ' ' + data['heure'].astype(str).str.zfill(2) + ':00:00'
    )
    # Present date: when the forecast was issued
    data['p_date'] = DAY.replace('_','-') + ' ' + HOUR + ':00:00'

    # Translate the French column names used in the raw files
    data = data.rename(columns={'vitesse' : 'speed',
                                'temperature' : 'temp',
                                'rayonnement' : 'rad',
                                'direction' : 'wind_dir'})

    # Presumably adds cos_wind_dir / sin_wind_dir from wind_dir - see
    # utils.data_preparation.smooth_wind_dir for the exact behaviour.
    data = prep.smooth_wind_dir(data)

    # Keep only the columns used downstream (drops date, heure, f_period, ...)
    data = data[['p_date','f_date','speed','temp','rad','precip','cos_wind_dir','sin_wind_dir']]
    return data

#### Get one file
# Preview a single run: the first 10 characters of the directory name are the
# day, characters 11-12 are the hour (character 10 is the separator).
file_path = sub_dir[0]
DAY, HOUR = file_path[:10], file_path[11:13]
get_one_forecast(DAY, HOUR).head()

Unnamed: 0,p_date,f_date,speed,temp,rad,precip,cos_wind_dir,sin_wind_dir
0,2020-07-01 00:00:00,2020-07-01 00:00:00,4.0,20.8,0,0,0.731354,0.681998
1,2020-07-01 00:00:00,2020-07-01 01:00:00,3.8,20.2,0,0,0.694658,0.71934
2,2020-07-01 00:00:00,2020-07-01 02:00:00,3.8,19.9,0,0,0.669131,0.743145
3,2020-07-01 00:00:00,2020-07-01 03:00:00,4.0,19.8,0,0,0.615661,0.788011
4,2020-07-01 00:00:00,2020-07-01 04:00:00,4.1,19.6,0,0,0.559193,0.829038


### Process 15 days of data
- Concatenate all files 
- Save dataframe in ./data/processed/

In [18]:
# Load every forecast run, then concatenate once.
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on
# each call, so the frames are collected in a list instead.
forecast_frames = []
for file_path in sub_dir:
    # Directory name layout: characters 0-9 are the day, 11-12 the hour
    DAY = file_path[0:10]
    HOUR = file_path[11:13]
    forecast_frames.append(get_one_forecast(DAY,HOUR))
# ignore_index=True renumbers rows 0..n-1 (same as reset_index(drop=True)).
# With no run at all, fall back to an empty frame as before.
forecast = pd.concat(forecast_frames, ignore_index=True) if forecast_frames else pd.DataFrame()
display(forecast.head())


Unnamed: 0,p_date,f_date,speed,temp,rad,precip,cos_wind_dir,sin_wind_dir
0,2020-07-01 00:00:00,2020-07-01 00:00:00,4.0,20.8,0,0,0.731354,0.681998
1,2020-07-01 00:00:00,2020-07-01 01:00:00,3.8,20.2,0,0,0.694658,0.71934
2,2020-07-01 00:00:00,2020-07-01 02:00:00,3.8,19.9,0,0,0.669131,0.743145
3,2020-07-01 00:00:00,2020-07-01 03:00:00,4.0,19.8,0,0,0.615661,0.788011
4,2020-07-01 00:00:00,2020-07-01 04:00:00,4.1,19.6,0,0,0.559193,0.829038


## Cynthia: crop out rows with f_period <= 6 hours, then select the last forecast

In [19]:
# Parse both timestamp columns from strings into datetimes
forecast = forecast.assign(
    f_date=pd.to_datetime(forecast['f_date'], format='%Y-%m-%d %H:%M:%S'),
    p_date=pd.to_datetime(forecast['p_date'], format='%Y-%m-%d %H:%M:%S'),
)

# Forecast period (f_period): hours between the issue time (p_date) and the
# forecast time (f_date), built from the day and hour timedelta components
forecast = forecast.assign(
    f_period=lambda df: (df['f_date'] - df['p_date']).dt.components.pipe(
        lambda parts: parts['hours'] + parts['days'] * 24
    )
)

# Keep only rows whose f_period is at least 6 hours.
# NOTE(review): the section heading says periods "<= 6" are cropped, but this
# comparison keeps f_period == 6 - confirm which boundary is intended.
forecast = forecast[forecast['f_period'] >= 6]
# Helper from utils.data_preparation; by its name it keeps the most recent
# forecast for each forecast time - see its definition for the exact rule.
forecast = prep.keep_last_forecast(forecast)
# Move the current index back into the columns
forecast = forecast.reset_index()

## Cynthia: check whether any columns are missing

In [20]:
# Load the expected column names from a pickle file.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open('forecast_cols.pkl', 'rb') as cols_file:
    forecast_cols = pickle.load(cols_file)
# Expected columns that are absent from the processed frame (empty means OK)
print('checking missing columns are: ', (forecast_cols).difference(forecast.columns))

checking missing columns are:  Index([], dtype='object')


### Save file

In [22]:
# Create the output directory if needed, so the save also works on a fresh checkout
os.makedirs('./data/processed', exist_ok=True)
forecast.to_csv('./data/processed/last_forecast.csv',index=False)

### Remarks
- Will we also need the p_date and f_period columns here?
- The way I'm computing f_period (period_root in the function get_one_forecast above) gives values in [0, 47], not [0, 48].
- Precipitation is always equal to 0.


- Cynthia: we now have one forecast per f_period, but since we can't use those with f_period <= 6 hours, we need to fall back on older forecasts. Perhaps the best way is to take all forecasts and then do the following (this is done):
    - 1) crop out f_period <= 6,
    - 2) select the last forecast