# Wind Energy Forecasting: Inferential Statistics
The dataset chosen for this exercise is provided by the Institute of Electrical and Electronics Engineers (IEEE) Power & Energy Society, and was retrieved through Kaggle (https://www.kaggle.com/c/GEF2012-wind-forecasting). It is a time series dataset with historical power generation, wind speeds and wind directions for the period from July 2009 to December 2010.

#### Importing Packages and Defining Custom Functions for Data Cleaning

In [2]:
# Import packages for data visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Custom Functions
def convert_to_iso(date_col):
    """
    Convert dates, originally in format YYYYMMDDHH,
    to ISO 8601 DateTimes (https://en.wikipedia.org/wiki/ISO_8601)
    
    Input: a pandas Series of dates in YYYYMMDDHH format (integers or strings)
    Output: a pandas Series of DateTimes, one per input value, 
            with minutes and seconds set to zero
    Raises: ValueError if a value cannot be parsed as YYYYMMDDHH
    """
    # Parse with an explicit format rather than slicing the text into
    # year/month/day/hour pieces and gluing them back together: the layout
    # is stated once, and pandas does not have to infer it.
    return pd.to_datetime(date_col.astype(str), format='%Y%m%d%H')

def add_forecast_cat(wfn):
    """
    Label every row of the Wind Farm data with a forecast category,
    based on how many hours ahead the forecast looks:
    Forecast Category 1:  1-12 hour forecasts
    Forecast Category 2: 13-24 hour forecasts
    Forecast Category 3: 25-36 hour forecasts
    Forecast Category 4: 37-48 hour forecasts
    
    Input: A DataFrame of Wind Farm data with column 'hors' containing hour-ahead forecasts 
    Output: The same DataFrame (modified in place) with an added column, 'forecast_cat', 
            containing the forecast category; rows whose 'hors' is outside 1-48 keep None
    """
    
    wfn['forecast_cat'] = None
    horizon = wfn['hors']

    # Each category covers a 12-hour window starting at hour 1, 13, 25, 37
    for category, first_hour in enumerate(range(1, 49, 12), start=1):
        last_hour = first_hour + 11
        in_window = horizon.between(first_hour, last_hour)  # inclusive on both ends
        wfn.loc[in_window, 'forecast_cat'] = category

    return wfn

def wfn_by_fc(wfn, forecast_cat):
    """
    Select the rows of a windfarm DataFrame that belong 
    to a single forecast category
    
    Input: A DataFrame of Wind Farm data with a 'forecast_cat' column, 
           and the forecast category to keep
    Output: A DataFrame with the same columns, holding only the rows 
            whose 'forecast_cat' equals the requested forecast category
    """

    in_category = wfn['forecast_cat'] == forecast_cat  # boolean row mask
    return wfn.loc[in_category]

#### Importing Wind Speed and Direction Data

In [4]:
# Load the wind speed and wind direction forecasts ("wind farm data"),
# one CSV file per wind farm, keyed by wind farm
wf_dict = {
    farm: pd.read_csv(csv_file)
    for farm, csv_file in (
        ('wf1', 'windforecasts_wf1.csv'),  # Wind Farm 1
        ('wf2', 'windforecasts_wf2.csv'),  # Wind Farm 2
        ('wf3', 'windforecasts_wf3.csv'),  # Wind Farm 3
        ('wf4', 'windforecasts_wf4.csv'),  # Wind Farm 4
        ('wf5', 'windforecasts_wf5.csv'),  # Wind Farm 5
        ('wf6', 'windforecasts_wf6.csv'),  # Wind Farm 6
        ('wf7', 'windforecasts_wf7.csv'),  # Wind Farm 7
    )
}

#### Importing Power Data

Note: we include only 2009-2010 data because these are the years for which there is complete data (i.e. wind speed, wind direction, and wind power data for every DateTime).

In [5]:
# Load the wind power data
power = pd.read_csv('train.csv')

# Standardize the raw YYYYMMDDHH dates as ISO 8601 DateTimes
power['date'] = convert_to_iso(power['date'])

# Keep only the 2009-2010 wind power data (both bounds inclusive) and
# index it by DateTime.
# NOTE(review): a bare date string is compared as midnight, so readings
# later than 00:00 on 2010-12-31 are excluded -- confirm this is intended.
power = (
    power.loc[power['date'].between('2009-07-01', '2010-12-31')]
         .set_index('date')
)

In [6]:
# Lookup from each wind farm key ('wf1'...'wf7') to the name of the
# column holding that farm's wind power ('wp1'...'wp7')
wp_lookup = {f'wf{n}': f'wp{n}' for n in range(1, 8)}

#### Cleaning Wind Speed & Direction Data, Merging with Wind Power Data
Note: we include only 2009-2010 data because these are the years for which there is complete data (i.e. wind speed, wind direction, and wind power data for every DateTime).

In [7]:
for farm in wf_dict:
    forecasts = wf_dict[farm]

    # Standardize the raw YYYYMMDDHH dates as ISO 8601 DateTimes
    forecasts['date'] = convert_to_iso(forecasts['date'])
    # mod_date is the DateTime being forecast: 'date' shifted by 'hors' hours
    hours_ahead = pd.to_timedelta(forecasts['hors'], unit='h')
    forecasts['mod_date'] = forecasts['date'] + hours_ahead
    # Label each row with its forecast category
    forecasts = add_forecast_cat(forecasts)

    # Keep only the 2009-2010 wind speed/direction data (both bounds inclusive).
    # NOTE(review): a bare date string is compared as midnight, so rows with
    # mod_date later than 00:00 on 2010-12-31 are excluded -- confirm intended.
    in_window = forecasts['mod_date'].between('2009-07-01', '2010-12-31')
    # Index the remaining rows by the DateTime being forecast
    forecasts = forecasts.loc[in_window].set_index('mod_date')

    # Attach this farm's wind power column, matching on the DateTime index;
    # the left join keeps every wind speed/direction row
    farm_power = power[[wp_lookup[farm]]]
    wf_dict[farm] = forecasts.merge(farm_power,
                                    how='left',
                                    left_index=True,
                                    right_index=True)

In [8]:
# Preview the first 15 rows of the cleaned Wind Farm 1 data
print(wf_dict['wf1'].iloc[:15])

                                   date  hors     u     v    ws      wd  \
2009-07-01 01:00:00 2009-07-01 00:00:00     1  2.34 -0.79  2.47  108.68   
2009-07-01 02:00:00 2009-07-01 00:00:00     2  2.18 -0.99  2.40  114.31   
2009-07-01 03:00:00 2009-07-01 00:00:00     3  2.20 -1.21  2.51  118.71   
2009-07-01 04:00:00 2009-07-01 00:00:00     4  2.35 -1.40  2.73  120.86   
2009-07-01 05:00:00 2009-07-01 00:00:00     5  2.53 -1.47  2.93  120.13   
2009-07-01 06:00:00 2009-07-01 00:00:00     6  2.66 -1.29  2.96  115.79   
2009-07-01 07:00:00 2009-07-01 00:00:00     7  2.69 -0.81  2.81  106.71   
2009-07-01 08:00:00 2009-07-01 00:00:00     8  2.72 -0.26  2.73   95.39   
2009-07-01 09:00:00 2009-07-01 00:00:00     9  2.87  0.08  2.87   88.50   
2009-07-01 10:00:00 2009-07-01 00:00:00    10  3.23 -0.01  3.23   90.19   
2009-07-01 11:00:00 2009-07-01 00:00:00    11  3.65 -0.33  3.66   95.15   
2009-07-01 12:00:00 2009-07-01 00:00:00    12  3.89 -0.60  3.94   98.71   
2009-07-01 13:00:00 2009-

NOTE: All data is unitless.
**Description:** The index ('mod_date') is the DateTime being forecast, i.e. 'date' plus 'hors' hours. 'date' is the DateTime at which the forecast was issued. 'hors' is the forecast horizon: the number of hours ahead being forecast. 'u' is the x-axis component of the wind velocity vector. 'v' is the y-axis component of the wind velocity vector. 'ws' is the wind speed (the magnitude of the wind velocity vector). 'wd' is the angle of the wind direction. 'forecast_cat' is the forecast category, ranging from 1-4.
'wp(n)' column represents wind power data where n=1...7 represent each of the seven wind farms.

### Inferential Statistics

There are four forecast categories-- Forecast Category 1: 1-12 hours ahead. Forecast Category 2: 13-24 hours ahead. Forecast Category 3: 25-36 hours ahead. Forecast Category 4: 37-48 hours ahead.

We are interested in whether the wind speeds of one forecast category differ from the wind speeds of the other forecast categories.



As an example, we could take a look at Wind Farm 1, and compare the mean wind speed of Forecast Category 1 ($\mu$<sub>1</sub>) to the mean wind speed of Forecast Category 2-4 ($\mu$<sub>234</sub>). The null hypothesis in this case would be that the mean wind speed of Forecast Category 1 is equal to the mean wind speed of Forecast Category 2-4: 

$\mu$<sub>1</sub> = $\mu$<sub>234</sub>

The alternate hypothesis would then be that the mean wind speed of Forecast Category 1 is not equal to the mean wind speed of the other Forecast Categories: 

$\mu$<sub>1</sub> $\neq$ $\mu$<sub>234</sub>

Similarly, we could repeat the comparison for:

$\mu$<sub>2</sub> = $\mu$<sub>34</sub>, comparing forecast category 2 to longer forecast categories (3, 4)

$\mu$<sub>3</sub> = $\mu$<sub>4</sub>, comparing forecast category 3 to longer forecast categories (4)

$\mu$<sub>4</sub> = $\mu$<sub>123</sub>, comparing forecast category 4 to shorter forecast categories (1, 2, 3)

$\mu$<sub>3</sub> = $\mu$<sub>12</sub>, comparing forecast category 3 to shorter forecast categories (1, 2)

$\mu$<sub>2</sub> = $\mu$<sub>1</sub>, comparing forecast category 2 to the shorter forecast category (1)

In [10]:
# Forecast comparisons to run at every wind farm, as
# (label, single forecast category, forecast categories pooled against it).
# The first three compare a category to all longer forecasts,
# the last three compare a category to all shorter forecasts.
FC_COMPARISONS = [
    ('1v234', 1, (2, 3, 4)),
    ('2v34', 2, (3, 4)),
    ('3v4', 3, (4,)),
    ('4v123', 4, (1, 2, 3)),
    ('3v12', 3, (1, 2)),
    ('2v1', 2, (1,)),
]


def _two_sample_z(sample_a, sample_b):
    """
    Z score for the difference between the means of two samples
    
    Input: two pandas Series of observations
    Output: (mean_a - mean_b) / sqrt(var_a / n_a + var_b / n_b)
    """
    # ddof=1 gives the *sample* variance; np.std / np.var default to
    # ddof=0, which is the population formula.
    var_a = sample_a.var(ddof=1)
    var_b = sample_b.var(ddof=1)
    # Standard error of the difference between the two sample means
    std_err = np.sqrt(var_a / len(sample_a) + var_b / len(sample_b))
    return (sample_a.mean() - sample_b.mean()) / std_err


# Z-scores for each wind farm: one list per farm, ordered like FC_COMPARISONS
z_by_farm = {}
for farm, wf in wf_dict.items():
    categories = wf['forecast_cat']
    z_by_farm[farm] = [
        _two_sample_z(wf.loc[categories == single, 'ws'],      # single Forecast Category
                      wf.loc[categories.isin(others), 'ws'])   # all other Forecast Categories
        for _, single, others in FC_COMPARISONS
    ]

# One row per forecast comparison, one column per wind farm
comparison_labels = [label for label, _, _ in FC_COMPARISONS]
z_scores = pd.DataFrame(z_by_farm,
                        index=pd.Index(comparison_labels, name='fc_comparison'))
# Average Z score of each comparison across all wind farms
z_scores['avg_z'] = z_scores.mean(axis=1)

print(z_scores)

                    wf1       wf2       wf3       wf4       wf5       wf6  \
fc_comparison                                                               
1v234          1.798917  2.972729  2.087631  2.845424  2.910819  2.321831   
2v34           0.625090  1.699720  0.458644  1.400497  0.789813  0.058644   
3v4            0.136969  0.310422  0.448495  0.896691  0.410644  0.639480   
4v123         -1.012907 -2.076900 -1.284078 -2.351041 -1.683877 -1.330527   
3v12          -1.190827 -2.378989 -1.027465 -1.773526 -1.667149 -0.776293   
2v1           -1.116845 -1.463473 -1.434667 -1.519789 -1.932011 -1.859611   

                    wf7     avg_z  
fc_comparison                      
1v234          2.015196  2.421793  
2v34          -0.110330  0.703154  
3v4            0.628765  0.495924  
4v123         -1.138319 -1.553950  
3v12          -0.521537 -1.333684  
2v1           -1.706717 -1.576159  


### Conclusion

Comparing forecast categories to each other, it was found that Forecast Category 1 (1-12hrs ahead) tended to be significantly different from longer Forecast Categories (13-48hrs ahead), with 95% confidence. Z-scores for this comparison were in excess of 1.96 (95% confidence threshold) for all wind farms except Wind Farm 1, which had a Z-score of 1.798917, which still corresponds to 92.8% confidence.

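Forecast categories 2, 3, and 4 were found to be not significantly different from each other: the Z-scores for the 2v34 and 3v4 comparisons were below 1.96 in magnitude at every wind farm.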

Another conclusion is that Z-scores seem to vary across all wind farms, with no apparent pattern. This may be due to geological features not accounted for in the data set.