
Commit

Norukh committed Dec 3, 2023
2 parents c21a2e4 + 089400f commit 8a1217c
Showing 6 changed files with 206 additions and 27 deletions.
22 changes: 18 additions & 4 deletions download-data.sh
@@ -1,8 +1,22 @@
#!/bin/bash

url='https://www.daten.stadt.sg.ch/api/explore/v2.1/catalog/datasets/fullstandssensoren-sammelstellen-stadt-stgallen/exports/csv?lang=en&timezone=Europe%2FZurich&use_labels=true&delimiter=%3B'
output_file='data/fill-levels-2.csv'
url_w='https://www.daten.stadt.sg.ch/api/explore/v2.1/catalog/datasets/fuellstandsensoren-glassammelstellen-weissglas/exports/csv?lang=de&timezone=Europe%2FZurich&use_labels=true&delimiter=%3B'
output_file_w='data/data_w.csv'

curl -Lo "$output_file" "$url"
curl -Lo "$output_file_w" "$url_w"

echo "Download complete. File saved to $output_file."
echo "Download (w) complete. File saved to $output_file_w."

url_b='https://www.daten.stadt.sg.ch/api/explore/v2.1/catalog/datasets/fuellstandsensoren-glassammelstellen-braunglas/exports/csv?lang=de&timezone=Europe%2FZurich&use_labels=true&delimiter=%3B'
output_file_b='data/data_b.csv'

curl -Lo "$output_file_b" "$url_b"

echo "Download (w) complete. File saved to $output_file_b."

url_g='https://www.daten.stadt.sg.ch/api/explore/v2.1/catalog/datasets/fuellstandsensoren-glassammelstellen-gruenglas/exports/csv?lang=de&timezone=Europe%2FZurich&use_labels=true&delimiter=%3B'
output_file_g='data/data_g.csv'

curl -Lo "$output_file_g" "$url_g"

echo "Download (w) complete. File saved to $output_file_g."
58 changes: 47 additions & 11 deletions lib/preprocessing.py
@@ -3,29 +3,59 @@
from numpy import array
import pandas as pd


MAX_SENSOR_INPUT = 1600


def read_data(from_path: str, use_coordinates: bool = False):
def read_data(from_path_w: str, from_path_g: str, from_path_b: str, use_coordinates: bool = True):
    columns = [
        'Gemessen am',
        'Tags',
        'Füllstandsdistanz',
        'Sensorname',
        'measured_at',
        'data_distance',
        'name',
    ]

    if use_coordinates:
        columns.append('geo_point_2d')
        # TODO: needs to be implemented in later functions (e.g. _merge_days) if really needed

    raw_data_w = pd.read_csv(from_path_w, delimiter=';', usecols=columns)
    raw_data_w = raw_data_w.rename(columns={'measured_at': 'date', 'data_distance': 'level', 'name': 'sensor_id'})

    # add a type column identifying the white-glass dataset
    glasstype = "Weissglas"
    raw_data_w['type'] = glasstype
    filtered_data_w = raw_data_w[raw_data_w['level'].notna()]  # filter out all rows with invalid sensor data
    days_merged_w = _merge_days(filtered_data_w, use_coordinates)
    days_merged_w['level'] = days_merged_w['level'].apply(lambda level: normalize_data(level))  # normalise data

    raw_data_g = pd.read_csv(from_path_g, delimiter=';', usecols=columns)
    raw_data_g = raw_data_g.rename(columns={'measured_at': 'date', 'data_distance': 'level', 'name': 'sensor_id'})

    # add a type column identifying the green-glass dataset
    glasstype = "Grünglas"
    raw_data_g['type'] = glasstype
    filtered_data_g = raw_data_g[raw_data_g['level'].notna()]  # filter out all rows with invalid sensor data
    days_merged_g = _merge_days(filtered_data_g, use_coordinates)
    days_merged_g['level'] = days_merged_g['level'].apply(lambda level: normalize_data(level))  # normalise data

    raw_data_b = pd.read_csv(from_path_b, delimiter=';', usecols=columns)
    raw_data_b = raw_data_b.rename(columns={'measured_at': 'date', 'data_distance': 'level', 'name': 'sensor_id'})

    # add a type column identifying the brown-glass dataset
    glasstype = "Braunglas"
    raw_data_b['type'] = glasstype
    filtered_data_b = raw_data_b[raw_data_b['level'].notna()]  # filter out all rows with invalid sensor data
    days_merged_b = _merge_days(filtered_data_b, use_coordinates)
    days_merged_b['level'] = days_merged_b['level'].apply(lambda level: normalize_data(level))  # normalise data

    raw_data = pd.read_csv(from_path, delimiter=';', usecols=columns)
    raw_data = raw_data.rename(columns={'Gemessen am': 'date', 'Tags': 'type', 'Füllstandsdistanz': 'level', 'Sensorname': 'sensor_id'})

    filtered_data = raw_data[raw_data['level'].notna()]  # filter out all rows with invalid sensor data
    days_merged = _merge_days(filtered_data, use_coordinates)
    days_merged['level'] = days_merged['level'].apply(lambda level: normalize_data(level))  # normalise data

    days_merged = pd.concat([days_merged_b, days_merged_g, days_merged_w]).sort_values(by="date")

    return days_merged


def data_split(data: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]:
    train_data = data[0:int(len(data)*0.9)]
    validate_data = data[int(len(data)*0.9):]
@@ -76,3 +106,9 @@ def check_if_array_is_ascending(array: list[float]) -> bool:
def split_data(data: pd.DataFrame, ratio: float) -> tuple[pd.DataFrame, pd.DataFrame]:
    total_len = len(data)
    return data[0:round(total_len * ratio)], data[round(total_len * ratio):]


if __name__ == '__main__':
    file_path = "data/days_merged.csv"
    days_merged = read_data("data/data_w.csv", "data/data_g.csv", "data/data_b.csv")
    days_merged.to_csv(file_path, sep=',', index=False)
8 changes: 6 additions & 2 deletions lib/sensor_api.py
@@ -1,12 +1,16 @@
# sensor_api.py
from flask import Blueprint, jsonify
from preprocessing import read_data
import pandas as pd

sensor_api = Blueprint('sensor_api', __name__)

data_file = 'data/fill-level.csv'
data_file = 'data/days_merged.csv'

sensor_data = read_data(data_file, use_coordinates=True)
columns = [
    'sensor_id', 'date', 'geo_point_2d', 'level', 'type'
]
sensor_data = pd.read_csv(data_file, delimiter=',', usecols=columns)
sensor_data = sensor_data.loc[sensor_data.groupby('sensor_id').date.idxmax()]  # keep only the most recent measurement per sensor_id


2 changes: 1 addition & 1 deletion predictor-app/src/api.js
@@ -1,7 +1,7 @@
/* Use the exported functions to call the API.
* If necessary, adjust the backend address below:
*/
const backend = "http://localhost:5000";
const backend = "http://127.0.0.1:5000";

export function getPath(params) {
return getJson("/path", params).then(parseJSON);
27 changes: 18 additions & 9 deletions readme.md
@@ -1,7 +1,7 @@
# Open Data Hack 2023

**Goal:** Use an LSTM AI model to predict the fill levels of recycling stations in St. Gallen
and create a pathfinding algorithm that finds the ideal routes for city employees, for the next week.
and create a pathfinding algorithm that finds the ideal routes for city employees, for the next work week.

### Components

@@ -10,21 +10,30 @@ and create a pathfinding algorithm that finds the ideal routes for city employee
3) Visualise optimal route with a web app

### Setup
Create a conda environment with `conda create --name <env> --file requirements.txt`. We recommend using Python version 3.9.
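
For example (a minimal sketch; the environment name `odh23` is just a placeholder, and the pinned list in `requirements.txt` targets osx-arm64):

```bash
# create the environment from the pinned package list, then activate it
conda create --name odh23 --file requirements.txt
conda activate odh23
```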

Download the file [here](https://www.daten.stadt.sg.ch/explore/dataset/fullstandssensoren-sammelstellen-stadt-stgallen/export/?disjunctive.name&disjunctive.tags&sort=measured_at)
and save it at `data/fill-level.csv`, or use the `download-data.sh` script.
Create a Google Maps API key and save it for later steps (you can easily find the documentation online).

#### Update
The API has changed: it has been split into three parts, one per glass type (see the dataset links under Alternative Setup below).
Run the bash script `download-data.sh`, which downloads the data into the `data/` folder. Afterwards, run `preprocessing.py` directly, which creates a single CSV file from all three datasets; see the sketch below.
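
Put together, the two steps might look like this (a sketch assuming both commands are run from the project root, so the relative `data/` paths in the scripts resolve):

```bash
# fetch the three per-glass-type CSV exports into data/
bash download-data.sh

# merge them into a single file, data/days_merged.csv
python lib/preprocessing.py
```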

##### Alternative Setup:
Download the files below into the `data/` folder:
- [Füllstandsensoren Glassammelstellen (Weissglas)](https://www.daten.stadt.sg.ch/explore/dataset/fuellstandsensoren-glassammelstellen-weissglas/table/?disjunctive.device_id&disjunctive.name)
- [Füllstandsensoren Glassammelstellen (Grünglas)](https://www.daten.stadt.sg.ch/explore/dataset/fuellstandsensoren-glassammelstellen-gruenglas/table/?disjunctive.device_id&disjunctive.name)
- [Füllstandsensoren Glassammelstellen (Braunglas)](https://www.daten.stadt.sg.ch/explore/dataset/fuellstandsensoren-glassammelstellen-braunglas/table/?disjunctive.device_id&disjunctive.name)

### API
#### Train the model
To train the model, the datasets must be available and merged (as done in the Setup step), so that the file exists under `data/days_merged.csv`. Afterwards, you can execute `main.py` in the project root, which will train the model and save a snapshot to the `trained-models/` folder.
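
For example (a sketch; assumes `data/days_merged.csv` exists from the Setup step):

```bash
# trains the model and saves a snapshot to the trained-models/ folder
python main.py
```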

#### API
The Google Maps API key must be saved as `MAPS_KEY=<key>` in a `.env` file in the project root.

In order to use the model, the datasets must be available and merged (as done in the Setup step), and the model must be trained. Afterwards, the command `flask --app lib/api run` can be executed from the project root, as sketched below.
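
A minimal sequence might look like this (a sketch; the `.env` line mirrors the format given above):

```bash
# one-time: store the Google Maps key in .env at the project root
echo 'MAPS_KEY=<key>' >> .env

# start the backend; Flask serves on http://127.0.0.1:5000 by default,
# matching the address configured in predictor-app/src/api.js
flask --app lib/api run
```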

#### UI
The Google Maps API key must be saved in a second location as `REACT_APP_GOOGLE_MAPS_API_KEY=<key>` in a `.env` file in the `predictor-app/` folder.

Use the generate_dummy_data method to receive random sensor values. The model should return the prediction the same way.
The method takes a list of the requested sensor names as argument.
The UI is located in the `predictor-app/` folder. First run `npm install` in that folder, then start it using `npm run start`; see the sketch below.
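
Put together (a sketch; the `.env` line mirrors the format given above):

```bash
cd predictor-app
echo 'REACT_APP_GOOGLE_MAPS_API_KEY=<key>' >> .env

npm install    # one-time dependency install
npm run start  # serves the development build locally
```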

### Ideas
Ideas that can be explored if there is time:
116 changes: 116 additions & 0 deletions requirements.txt
@@ -0,0 +1,116 @@
# This file may be used to create an environment using:
# $ conda create --name <env> --file <this file>
# platform: osx-arm64
absl-py=2.0.0=pypi_0
astunparse=1.6.3=pypi_0
blinker=1.7.0=pyhd8ed1ab_0
brotli=1.1.0=hb547adb_1
brotli-bin=1.1.0=hb547adb_1
bzip2=1.0.8=h93a5062_5
ca-certificates=2023.11.17=hf0a4a13_0
cachetools=5.3.2=pypi_0
certifi=2023.11.17=pyhd8ed1ab_0
charset-normalizer=3.3.2=pypi_0
click=8.1.7=unix_pyh707e725_0
contourpy=1.2.0=py39he9de807_0
cycler=0.12.1=pyhd8ed1ab_0
flask=3.0.0=pyhd8ed1ab_0
flatbuffers=23.5.26=pypi_0
fonttools=4.45.1=py39h17cfd9d_0
freetype=2.12.1=hadb7bae_2
gast=0.5.4=pypi_0
google-auth=2.24.0=pypi_0
google-auth-oauthlib=1.1.0=pypi_0
google-pasta=0.2.0=pypi_0
googlemaps=4.10.0=pypi_0
grpcio=1.59.3=pypi_0
h5py=3.10.0=pypi_0
idna=3.6=pypi_0
importlib-metadata=6.9.0=pyha770c72_0
importlib-resources=6.1.1=pyhd8ed1ab_0
importlib_resources=6.1.1=pyhd8ed1ab_0
itsdangerous=2.1.2=pyhd8ed1ab_0
jinja2=3.1.2=pyhd8ed1ab_1
joblib=1.3.2=pyhd8ed1ab_0
keras=2.15.0=pypi_0
kiwisolver=1.4.5=py39hbd775c9_1
lcms2=2.15=hf2736f0_3
lerc=4.0.0=h9a09cb3_0
libblas=3.9.0=20_osxarm64_openblas
libbrotlicommon=1.1.0=hb547adb_1
libbrotlidec=1.1.0=hb547adb_1
libbrotlienc=1.1.0=hb547adb_1
libcblas=3.9.0=20_osxarm64_openblas
libclang=16.0.6=pypi_0
libcxx=16.0.6=h4653b0c_0
libdeflate=1.19=hb547adb_0
libffi=3.4.2=h3422bc3_5
libgfortran=5.0.0=13_2_0_hd922786_1
libgfortran5=13.2.0=hf226fd6_1
libjpeg-turbo=3.0.0=hb547adb_1
liblapack=3.9.0=20_osxarm64_openblas
libopenblas=0.3.25=openmp_h6c19121_0
libpng=1.6.39=h76d750c_0
libsqlite=3.44.2=h091b4b1_0
libtiff=4.6.0=ha8a6c65_2
libwebp-base=1.3.2=hb547adb_0
libxcb=1.15=hf346824_0
libzlib=1.2.13=h53f4e23_5
llvm-openmp=17.0.6=hcd81f8e_0
markdown=3.5.1=pypi_0
markupsafe=2.1.3=py39h0f82c59_1
matplotlib=3.8.2=py39hdf13c20_0
matplotlib-base=3.8.2=py39h1a09f3e_0
ml-dtypes=0.2.0=pypi_0
munkres=1.1.4=pyh9f0ad1d_0
ncurses=6.4=h463b476_2
numpy=1.26.2=py39heee92a0_0
oauthlib=3.2.2=pypi_0
openjpeg=2.5.0=h4c1507b_3
openssl=3.2.0=h0d3ecfb_1
opt-einsum=3.3.0=pypi_0
packaging=23.2=pyhd8ed1ab_0
pandas=2.1.3=py39hf8cecc8_0
pillow=10.1.0=py39h755f0b7_0
pip=23.3.1=pyhd8ed1ab_0
protobuf=4.23.4=pypi_0
pthread-stubs=0.4=h27ca646_1001
pyasn1=0.5.1=pypi_0
pyasn1-modules=0.3.0=pypi_0
pyparsing=3.1.1=pyhd8ed1ab_0
python=3.9.18=hfa1ae8a_0_cpython
python-dateutil=2.8.2=pyhd8ed1ab_0
python-tzdata=2023.3=pyhd8ed1ab_0
python_abi=3.9=4_cp39
pytz=2023.3.post1=pyhd8ed1ab_0
readline=8.2=h92ec313_1
requests=2.31.0=pypi_0
requests-oauthlib=1.3.1=pypi_0
rsa=4.9=pypi_0
scikit-learn=1.3.2=py39h172c841_1
scipy=1.11.4=py39h36c428d_0
setuptools=68.2.2=pyhd8ed1ab_0
six=1.16.0=pyh6c4a22f_0
tensorboard=2.15.1=pypi_0
tensorboard-data-server=0.7.2=pypi_0
tensorflow=2.15.0=pypi_0
tensorflow-estimator=2.15.0=pypi_0
tensorflow-io-gcs-filesystem=0.34.0=pypi_0
tensorflow-macos=2.15.0=pypi_0
tensorflow-metal=1.1.0=pypi_0
termcolor=2.4.0=pypi_0
threadpoolctl=3.2.0=pyha21a80b_0
tk=8.6.13=h5083fa2_1
tornado=6.3.3=py39h0f82c59_1
typing-extensions=4.8.0=pypi_0
tzdata=2023c=h71feb2d_0
unicodedata2=15.1.0=py39h0f82c59_0
urllib3=2.1.0=pypi_0
werkzeug=3.0.1=pyhd8ed1ab_0
wheel=0.42.0=pyhd8ed1ab_0
wrapt=1.14.1=pypi_0
xorg-libxau=1.0.11=hb547adb_0
xorg-libxdmcp=1.1.3=h27ca646_0
xz=5.2.6=h57fd34a_0
zipp=3.17.0=pyhd8ed1ab_0
zstd=1.5.5=h4f39d0f_0
