In [2]:
import requests
from bs4 import BeautifulSoup
import datetime
import os
import logging

In [6]:

__author__ = 'Jan'
"""
To download the xls for Denmark, just call the download_xls_file() in the notebook.
TODO: Sometimes the server returns a malformed table! Investigate if the code/request is wrong or the server code
    is not done properly.
"""
import requests
from bs4 import BeautifulSoup
import datetime
import os
import logging

log = logging.getLogger('download_denmark')
log.setLevel(logging.INFO)

_TARGET_URL = r'http://www.energinet.dk/en/el/engrosmarked/udtraek-af-markedsdata/Sider/default.aspx?language=en'
_REQUEST_URL = r'http://www.energinet.dk/_layouts/Markedsdata/Framework/Integrations/MarkedsdataExcelOutput.aspx'
_POST_PARAMETER_FILE = 'post_parameter.txt'


def _extract_dotNet_variables():
    log.debug('1 Extracting dotNet variables')
    s = requests.Session()
    r = s.get(_TARGET_URL)
    if r.status_code != 200:
        raise requests.ConnectionError

    soup = BeautifulSoup(r.content, 'lxml')
    # extract viewstate
    view_state = soup.select('#__VIEWSTATE')[0]['value']
    # extract event validation
    event_validation = soup.select('#__EVENTVALIDATION')[0]['value']
    return view_state, event_validation, s


def _construct_parameter(view_state, event_validation, file_path):
    log.debug('2 Constructing post parameter')
    parameter = {}
    with open(file_path, 'r') as para_file:
        lines = para_file.read().splitlines()
        # last element is endDate. It has to be changed to current date
        current_time = datetime.datetime.now()
        current_date = current_time.strftime("%d-%m-%Y")
        lines[-1] = 'endDate=' + current_date
        # split lines into dictionary
        for key_value in lines:
            try:
                key, value = key_value.split('=')
                parameter[key] = value
            except:
                log.warning('Error: ' + key_value)

        # add view_state and event_validation
        parameter['__VIEWSTATE'] = view_state
        parameter['__EVENTVALIDATION'] = event_validation
        log.debug(parameter['startDate'])
        log.debug(parameter['endDate'])

        return parameter


def _download_excel(parameter, session, output_path):
    """
    This function downloads the xls file from the website. To download, you have to send a post request with parameters
    that specify the data you want first. After that you request the xls/html file.
    Parameters
    ----------
    parameter : Specifies the data (ASP.NET Web Form)
    session : The session that was used for extracting the variables
    output_path : Defines the location for the downloaded xls file
    Returns
    -------
    """
    log.info('3 Downloading xls file.')
    header = {
        'referer': r'http://www.energinet.dk/_layouts/Markedsdata/framework/integrations/markedsdatatemplate.aspx?language=en',
        'content-type': 'application/vnd.ms-excel; charset=utf-8',
        'accept-language': 'de,en-US;q=0.7,en;q=0.3',
        'accept-encoding': 'gzip, deflate',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0',
        'connection': 'keep-alive'
    }

    # Specifies the data you want.
    p = session.post(_REQUEST_URL, data=parameter, headers=header)
    log.debug('4 Post request headers: '+ str(p.request.headers))
    log.debug('5 post response headers: ' + str(p.headers))

    # Gets the data you want.
    r = session.get(_REQUEST_URL, stream=True, headers=header)
    log.debug('6 get request headers: ' + str(r.request.headers))
    log.debug('7 get response headers: ' + str(r.headers))
    with open(output_path, 'wb') as out_file:
        for chunk in r.iter_content(chunk_size=1024):
            out_file.write(chunk)

    log.info('8 Download completed')


def download_xls_file(output_directory='', output_file_name='danish2.xls'):
    """
    Prepares the parameters and downloads the xls file.
    Parameters
    ----------
    output_directory : Directory where the xls file is saved.
    output_file_name : Name of the downloaded file.
    Returns
    -------
    """
    #loc = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
    para_path = 'post_parameter.txt' #os.path.join(loc, _POST_PARAMETER_FILE)
    output_path = os.path.join(output_directory, output_file_name)
    view, event, session = _extract_dotNet_variables()
    parameter_dict = _construct_parameter(view, event, para_path)
    _download_excel(parameter_dict, session, output_path)


if __name__ == '__main__':
    log.addHandler(logging.StreamHandler())
    log.setLevel(logging.DEBUG)
download_xls_file()

1 Extracting dotNet variables
1 Extracting dotNet variables
1 Extracting dotNet variables
DEBUG:download_denmark:1 Extracting dotNet variables
2 Constructing post parameter
2 Constructing post parameter
2 Constructing post parameter
DEBUG:download_denmark:2 Constructing post parameter
01-01-2000
01-01-2000
01-01-2000
DEBUG:download_denmark:01-01-2000
07-08-2016
07-08-2016
07-08-2016
DEBUG:download_denmark:07-08-2016
3 Downloading xls file.
3 Downloading xls file.
3 Downloading xls file.
INFO:download_denmark:3 Downloading xls file.
4 Post request headers: {'accept-language': 'de,en-US;q=0.7,en;q=0.3', 'content-type': 'application/vnd.ms-excel; charset=utf-8', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'connection': 'keep-alive', 'Content-Length': '47249', 'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0', 'referer': 'http://www.energinet.dk/_layouts/Markedsdata/framework/integrations/markedsdatatemplate.aspx?langu

In [3]:
os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

NameError: name '__file__' is not defined

In [9]:
from pytz import timezone as tz

In [7]:
start = datetime.datetime.now()

In [12]:
start2 = (
    tz('Europe/Brussels')
    .localize(datetime.datetime.combine(start, datetime.time()))
    .astimezone(tz('UTC'))
)

In [20]:
datetime.datetime.today()#.date()

datetime.datetime(2016, 8, 7, 11, 54, 40, 806777)

In [25]:
(tz('Europe/Brussels')
 .localize(datetime.datetime.combine(start, datetime.time()))
 .astimezone(tz('UTC')))

datetime.datetime(2016, 8, 6, 22, 0, tzinfo=<UTC>)

In [26]:
start

datetime.datetime(2016, 8, 7, 11, 15, 53, 172608)

In [32]:
tz('Europe/Brussels').localize(datetime.datetime.now()).astimezone(tz('UTC'))

datetime.datetime(2016, 8, 7, 10, 29, 7, 103477, tzinfo=<UTC>)

In [33]:
datetime.datetime.now()

datetime.datetime(2016, 8, 7, 12, 29, 29, 325460)

In [36]:
url = 'URL'
source = 'Energinet.dk'
colmap = {
    'DK-West': 
        ('price', 'DKw', 'Elspot', source, url),
    'DK-East': 
        ('price', 'DKw', 'Elspot', source, url),
    'Norway': 
        ('price', 'NO', 'Elspot', source, url),
    'Sweden (SE)': 
        ('price', 'SE', 'Elspot', source, url),
    'Sweden (SE3)': 
        ('price', 'SE3', 'Elspot', source, url),
    'Sweden (SE4)': 
        ('price', 'SE4', 'Elspot', source, url),
    'DE European Power Exchange': 
        ('price', 'DE', 'EPEX', source, url),
    'DK-West: Wind power production': 
        ('wind', 'DKw', 'generation', source, url),
    'DK-West: Solar cell production (estimated)': 
        ('solar', 'Dke', 'generation', source, url),
    'DK-East: Wind power production': 
        ('wind', 'DKe', 'generation', source, url),
    'DK-East: Solar cell production (estimated)': 
        ('solar', 'DKe', 'generation', source, url),
    'DK: Wind power production (onshore)': 
        ('wind', 'DK', 'onshore', source, url),
    'DK: Wind power production (offshore)': 
        ('wind', 'DK', 'offshore', source, url)
}
colmap

{'DE European Power Exchange': ('price', 'DE', 'EPEX', 'Energinet.dk', 'URL'),
 'DK-East': ('price', 'DKw', 'Elspot', 'Energinet.dk', 'URL'),
 'DK-East: Solar cell production (estimated)': ('solar',
  'DKe',
  'generation',
  'Energinet.dk',
  'URL'),
 'DK-East: Wind power production': ('wind',
  'DKe',
  'generation',
  'Energinet.dk',
  'URL'),
 'DK-West': ('price', 'DKw', 'Elspot', 'Energinet.dk', 'URL'),
 'DK-West: Solar cell production (estimated)': ('solar',
  'Dke',
  'generation',
  'Energinet.dk',
  'URL'),
 'DK-West: Wind power production': ('wind',
  'DKw',
  'generation',
  'Energinet.dk',
  'URL'),
 'DK: Wind power production (offshore)': ('wind',
  'DK',
  'offshore',
  'Energinet.dk',
  'URL'),
 'DK: Wind power production (onshore)': ('wind',
  'DK',
  'onshore',
  'Energinet.dk',
  'URL'),
 'Norway': ('price', 'NO', 'Elspot', 'Energinet.dk', 'URL'),
 'Sweden (SE)': ('price', 'SE', 'Elspot', 'Energinet.dk', 'URL'),
 'Sweden (SE3)': ('price', 'SE3', 'Elspot', 'Energinet.d

In [None]:
        df = df[~((df['date'] == '2012-03-25') & ((df['pos'] == 8) | (df['pos'] == 10)))]

In [51]:
print('what\thello\tagain\t\tstill')

what	hello	again		still


In [58]:
tech_attribute = 'wind_generation_with-offshore'
x, y = tech_attribute.split('_')

ValueError: too many values to unpack (expected 2)

In [62]:
colmap = {
    'DK-West': 
        ('price', 'DKw', 'Elspot', source, url),
    'DK-East': 
        ('price', 'DKw', 'Elspot', source, url),
    'Norway': 
        ('price', 'NO', 'Elspot', source, url),
    'Sweden (SE)': 
        ('price', 'SE', 'Elspot', source, url),
    'Sweden (SE3)': 
        ('price', 'SE3', 'Elspot', source, url),
    'Sweden (SE4)': 
        ('price', 'SE4', 'Elspot', source, url),
    'DE European Power Exchange': 
        ('price', 'DE', 'EPEX', source, url),
    'DK-West: Wind power production': 
        ('wind-total', 'DKwest', 'generation', source, url),
    'DK-West: Solar cell production (estimated)': 
        ('solar', 'Dkwest', 'generation', source, url),
    'DK-East: Wind power production': 
        ('wind-total', 'DKeast', 'generation', source, url),
    'DK-East: Solar cell production (estimated)': 
        ('solar', 'DKeast', 'generation', source, url),
    'DK: Wind power production (onshore)': 
        ('wind-onshore', 'DK', 'generation', source, url),
    'DK: Wind power production (offshore)': 
        ('wind-offshore', 'DK', 'generation', source, url)
}

list(colmap.keys())

['DE European Power Exchange',
 'Sweden (SE4)',
 'Norway',
 'DK-West: Solar cell production (estimated)',
 'DK: Wind power production (offshore)',
 'DK-West: Wind power production',
 'DK-East: Wind power production',
 'DK: Wind power production (onshore)',
 'Sweden (SE3)',
 'DK-East: Solar cell production (estimated)',
 'DK-East',
 'Sweden (SE)',
 'DK-West']