Commit

Implement list-species, list-phases and list-stations for data source DWD

amotl committed Mar 12, 2018
1 parent c604347 commit 5047c4d
Showing 3 changed files with 76 additions and 11 deletions.
19 changes: 12 additions & 7 deletions phenodata/command.py
@@ -1,11 +1,11 @@
 # -*- coding: utf-8 -*-
 # (c) 2018 Andreas Motl <andreas@hiveeyes.org>
-import json
 import logging
 from docopt import docopt, DocoptExit
+from tabulate import tabulate
 from phenodata import __version__
 from phenodata.dwd import DwdDataAcquisition
-from phenodata.util import boot_logging, normalize_options, read_list
+from phenodata.util import boot_logging, normalize_options
 
 """
 phenodata is a data acquisition and manipulation toolkit for open access phenology data.
@@ -51,11 +51,16 @@ def run():
 
     # Dispatch command
     data = None
-    if 'list-species' in options:
+    if options['list-species']:
         data = client.get_species()
-    elif 'list-phases' in options:
+    elif options['list-phases']:
         data = client.get_phases()
-    elif 'list-stations' in options:
-        data = client.get_stations()
+    elif options['list-stations']:
+        data = client.get_stations(dataset=options['dataset'])
 
-    print(json.dumps(data))
+    # TODO: Do either this or that
+    #print data.to_string()
+    #print data.to_json(orient='index')
+
+    # TODO: How to make "tabulate" print index column name
+    print tabulate(data, headers=data.columns, showindex=True, tablefmt='psql')
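
Note on the TODO above: one way to make tabulate print the name of the index column is to move the index into a regular column with DataFrame.reset_index() before tabulating, so the index label becomes an ordinary header. A minimal sketch, not part of the commit; the station names and IDs below are made up:

    # Sketch only: show the DataFrame index under its own column header.
    import pandas as pd
    from tabulate import tabulate

    # Hypothetical stand-in for a DataFrame returned by the DWD client.
    data = pd.DataFrame(
        {'Stationsname': ['Berlin-Dahlem', 'Potsdam']},
        index=pd.Index([7389, 7390], name='Stations_id'))

    # reset_index() turns the named index into a regular column, so 'Stations_id'
    # appears in the header row instead of an empty cell.
    print(tabulate(data.reset_index(), headers='keys', showindex=False, tablefmt='psql'))
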
66 changes: 62 additions & 4 deletions phenodata/dwd.py
@@ -1,20 +1,78 @@
 # -*- coding: utf-8 -*-
 # (c) 2018 Andreas Motl <andreas@hiveeyes.org>
+import re
 import attr
 import logging
+import requests
+import requests_ftp
+import pandas as pd
+from six import StringIO
 
 logger = logging.getLogger(__name__)
 
+# Monkeypatch Requests Sessions to provide all the helper methods needed for use with FTP
+requests_ftp.monkeypatch_session()
+
 @attr.s
 class DwdDataAcquisition(object):
 
+    baseurl = 'ftp://ftp-cdc.dwd.de/pub/CDC'
+
     dataset = attr.ib()
 
+    def __attrs_post_init__(self):
+        self.dwdftp = requests.Session()
+
+    def read_ftp_csv(self, url):
+
+        # Retrieve CSV file
+        response = self.dwdftp.retr(url)
+
+        # TODO: Honor status code
+        #print 'status:', resp.status_code
+
+        # Acquire file content
+        content = response.content
+
+        # Fix CSV formatting
+        content = content.replace('\r\n', '')
+        content = re.sub(';eor;\s*', ';eor;\n', content)
+
+        # Debugging
+        #print 'content:\n', response.content.decode('Windows-1252').encode('utf8'); return
+
+        # Read CSV into Pandas DataFrame
+        # https://pandas.pydata.org/pandas-docs/stable/io.html
+        df = pd.read_csv(
+            StringIO(content), engine='c', encoding='Windows-1252',
+            delimiter=';', skipinitialspace=True, skip_blank_lines=True,
+            index_col=0)
+
+        # Remove empty rows
+        df.dropna(subset=['eor'], inplace=True)
+
+        # Remove trailing nonsense columns
+        last_column = len(df.columns) - 1
+        df.drop(df.columns[[last_column]], axis=1, inplace=True)
+        df.drop('eor', axis=1, inplace=True)
+
+        return df
+
     def get_species(self):
-        pass
+        """Return Pandas DataFrame containing complete species information"""
+        return self.read_ftp_csv(self.baseurl + '/help/PH_Beschreibung_Pflanze.txt')
 
     def get_phases(self):
-        pass
+        """Return Pandas DataFrame containing complete phases information"""
+        return self.read_ftp_csv(self.baseurl + '/help/PH_Beschreibung_Phase.txt')
 
-    def get_stations(self):
-        pass
+    def get_stations(self, dataset):
+        """Return Pandas DataFrame containing complete stations information"""
+        if dataset == 'immediate':
+            filename = 'PH_Beschreibung_Phaenologie_Stationen_Sofortmelder.txt'
+        elif dataset == 'annual':
+            filename = 'PH_Beschreibung_Phaenologie_Stationen_Jahresmelder.txt'
+        else:
+            raise KeyError('Unknown dataset "{}"'.format(dataset))
+
+        return self.read_ftp_csv(self.baseurl + '/help/' + filename)
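
A note on the ';eor;' handling in read_ftp_csv() above: the DWD help files end every record with an ';eor;' marker, so the code first joins the raw payload into one long string by stripping CRLF line breaks and then re-inserts a newline after each marker, giving pandas exactly one record per line. A minimal sketch of the effect on a made-up payload (not actual DWD data):

    # Sketch only: the record normalization from read_ftp_csv() on a toy payload.
    import re

    raw = 'Stations_id;Stationsname;eor;  7389;Berlin-\r\nDahlem;eor;  7390;Potsdam;eor;'

    # Join everything into a single physical line, then start a new line
    # after every ';eor;' end-of-record marker.
    content = raw.replace('\r\n', '')
    content = re.sub(r';eor;\s*', ';eor;\n', content)

    print(content)
    # Stations_id;Stationsname;eor;
    # 7389;Berlin-Dahlem;eor;
    # 7390;Potsdam;eor;
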
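Assuming the package is installed and the DWD FTP server at ftp-cdc.dwd.de is reachable, the new listing methods can be exercised directly from Python roughly as follows; a sketch, not part of the commit:

    # Sketch only: exercise the new DwdDataAcquisition methods (needs network access).
    from phenodata.dwd import DwdDataAcquisition

    client = DwdDataAcquisition(dataset='annual')

    species = client.get_species()    # from PH_Beschreibung_Pflanze.txt
    phases = client.get_phases()      # from PH_Beschreibung_Phase.txt
    stations = client.get_stations(dataset='annual')  # Jahresmelder station list

    print(stations.head())
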
2 changes: 2 additions & 0 deletions setup.py
@@ -10,6 +10,8 @@
     'requests-ftp==0.3.1',
     'docopt==0.6.2',
     'attrs==17.4.0',
+    'pandas>=0.18.1,<=0.22.0',
+    'tabulate==0.8.2',
 ]
 
 test_requires = [
