Commit

Implement list-species, list-phases and list-stations for data source DWD

amotl committed Mar 12, 2018
1 parent c604347 commit 5047c4d
Showing 3 changed files with 76 additions and 11 deletions.
19 changes: 12 additions & 7 deletions phenodata/command.py
@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 # (c) 2018 Andreas Motl <andreas@hiveeyes.org>
-import json
 import logging
 from docopt import docopt, DocoptExit
+from tabulate import tabulate
 from phenodata import __version__
 from phenodata.dwd import DwdDataAcquisition
-from phenodata.util import boot_logging, normalize_options, read_list
+from phenodata.util import boot_logging, normalize_options
 
 """
 phenodata is a data acquisition and manipulation toolkit for open access phenology data.
@@ -51,11 +51,16 @@ def run():
 
     # Dispatch command
     data = None
-    if 'list-species' in options:
+    if options['list-species']:
         data = client.get_species()
-    elif 'list-phases' in options:
+    elif options['list-phases']:
         data = client.get_phases()
-    elif 'list-stations' in options:
-        data = client.get_stations()
+    elif options['list-stations']:
+        data = client.get_stations(dataset=options['dataset'])
 
-    print(json.dumps(data))
+    # TODO: Do either this or that
+    #print data.to_string()
+    #print data.to_json(orient='index')
+
+    # TODO: How to make "tabulate" print index column name
+    print tabulate(data, headers=data.columns, showindex=True, tablefmt='psql')
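
Note on the TODO above: one way to make tabulate print the name of the index column is to move the index into a regular column with DataFrame.reset_index() before tabulating, so the index label becomes an ordinary header. A minimal sketch, not part of the commit; the station names and IDs below are made up:

    # Sketch only: show the DataFrame index under its own column header.
    import pandas as pd
    from tabulate import tabulate

    # Hypothetical stand-in for a DataFrame returned by the DWD client.
    data = pd.DataFrame(
        {'Stationsname': ['Berlin-Dahlem', 'Potsdam']},
        index=pd.Index([7389, 7390], name='Stations_id'))

    # reset_index() turns the named index into a regular column, so 'Stations_id'
    # appears in the header row instead of an empty cell.
    print(tabulate(data.reset_index(), headers='keys', showindex=False, tablefmt='psql'))
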
66 changes: 62 additions & 4 deletions phenodata/dwd.py
@@ -1,20 +1,78 @@
 # -*- coding: utf-8 -*-
 # (c) 2018 Andreas Motl <andreas@hiveeyes.org>
+import re
 import attr
 import logging
+import requests
+import requests_ftp
+import pandas as pd
+from six import StringIO
 
 logger = logging.getLogger(__name__)
 
+# Monkeypatch Requests Sessions to provide all the helper methods needed for use with FTP
+requests_ftp.monkeypatch_session()
+
 @attr.s
 class DwdDataAcquisition(object):
 
+    baseurl = 'ftp://ftp-cdc.dwd.de/pub/CDC'
+
     dataset = attr.ib()
 
+    def __attrs_post_init__(self):
+        self.dwdftp = requests.Session()
+
+    def read_ftp_csv(self, url):
+
+        # Retrieve CSV file
+        response = self.dwdftp.retr(url)
+
+        # TODO: Honor status code
+        #print 'status:', resp.status_code
+
+        # Acquire file content
+        content = response.content
+
+        # Fix CSV formatting
+        content = content.replace('\r\n', '')
+        content = re.sub(';eor;\s*', ';eor;\n', content)
+
+        # Debugging
+        #print 'content:\n', response.content.decode('Windows-1252').encode('utf8'); return
+
+        # Read CSV into Pandas DataFrame
+        # https://pandas.pydata.org/pandas-docs/stable/io.html
+        df = pd.read_csv(
+            StringIO(content), engine='c', encoding='Windows-1252',
+            delimiter=';', skipinitialspace=True, skip_blank_lines=True,
+            index_col=0)
+
+        # Remove empty rows
+        df.dropna(subset=['eor'], inplace=True)
+
+        # Remove trailing nonsense columns
+        last_column = len(df.columns) - 1
+        df.drop(df.columns[[last_column]], axis=1, inplace=True)
+        df.drop('eor', axis=1, inplace=True)
+
+        return df
+
     def get_species(self):
-        pass
+        """Return Pandas DataFrame containing complete species information"""
+        return self.read_ftp_csv(self.baseurl + '/help/PH_Beschreibung_Pflanze.txt')
 
     def get_phases(self):
-        pass
+        """Return Pandas DataFrame containing complete phases information"""
+        return self.read_ftp_csv(self.baseurl + '/help/PH_Beschreibung_Phase.txt')
 
-    def get_stations(self):
-        pass
+    def get_stations(self, dataset):
+        """Return Pandas DataFrame containing complete stations information"""
+        if dataset == 'immediate':
+            filename = 'PH_Beschreibung_Phaenologie_Stationen_Sofortmelder.txt'
+        elif dataset == 'annual':
+            filename = 'PH_Beschreibung_Phaenologie_Stationen_Jahresmelder.txt'
+        else:
+            raise KeyError('Unknown dataset "{}"'.format(dataset))
+
+        return self.read_ftp_csv(self.baseurl + '/help/' + filename)
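
A note on the ';eor;' handling in read_ftp_csv() above: the DWD help files end every record with an ';eor;' marker, so the code first joins the raw payload into one long string by stripping CRLF line breaks and then re-inserts a newline after each marker, giving pandas exactly one record per line. A minimal sketch of the effect on a made-up payload (not actual DWD data):

    # Sketch only: the record normalization from read_ftp_csv() on a toy payload.
    import re

    raw = 'Stations_id;Stationsname;eor;  7389;Berlin-\r\nDahlem;eor;  7390;Potsdam;eor;'

    # Join everything into a single physical line, then start a new line
    # after every ';eor;' end-of-record marker.
    content = raw.replace('\r\n', '')
    content = re.sub(r';eor;\s*', ';eor;\n', content)

    print(content)
    # Stations_id;Stationsname;eor;
    # 7389;Berlin-Dahlem;eor;
    # 7390;Potsdam;eor;
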
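Assuming the package is installed and the DWD FTP server at ftp-cdc.dwd.de is reachable, the new listing methods can be exercised directly from Python roughly as follows; a sketch, not part of the commit:

    # Sketch only: exercise the new DwdDataAcquisition methods (needs network access).
    from phenodata.dwd import DwdDataAcquisition

    client = DwdDataAcquisition(dataset='annual')

    species = client.get_species()    # from PH_Beschreibung_Pflanze.txt
    phases = client.get_phases()      # from PH_Beschreibung_Phase.txt
    stations = client.get_stations(dataset='annual')  # Jahresmelder station list

    print(stations.head())
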
2 changes: 2 additions & 0 deletions setup.py
@@ -10,6 +10,8 @@
     'requests-ftp==0.3.1',
     'docopt==0.6.2',
     'attrs==17.4.0',
+    'pandas>=0.18.1,<=0.22.0',
+    'tabulate==0.8.2',
 ]
 
 test_requires = [
