In [None]:
import requests
import os
import pandas as pd
import glob
from datetime import datetime



# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# First month to fetch, formatted as yyyy-mm.
MONTH_START = "2022-03"
# Number of months for which sensor data will be downloaded.
MONTH_COUNT = 1

# Base URL of the download archive and local folder that holds the csv files.
ROOT_URL = "https://archive.sensor.community/"
ROOT_DIR = "../data/SensorCommunity/"

# NOTE(review): not referenced in the visible code; presumably a (min, max)
# pause in seconds between two downloads -- confirm against the other cells.
WAIT_BETWEEN_DOWNLOADS = (0, 1)

# Sensor types to process; un-comment an entry to include it.
SENSORS = [
    # "bme280",
    # "bmp180",
    # "bmp280",
    # "dht22",
    # "ds18b20",
    # "hpm",
    # "htu21d",
    # "pms1003",
    # "pms3003",
    # "pms5003",
    # "pms6003",
    # "pms7003",
    # "ppd42ns",
    "sds011",
]

# Bounding boxes used to filter the readings by coordinates. Entry i of
# LAT_RANGE belongs together with entry i of LON_RANGE; each entry is a
# (lower, upper) pair and both bounds are treated as exclusive.
LAT_RANGE = [
    (53.013, 53.1456),  # Bremen
    (50.030681, 50.205692),  # Frankfurt a. M.
]
LON_RANGE = [
    (8.67, 8.9334),  # Bremen
    (8.430634, 8.919868),  # Frankfurt a. M.
]


# Incrementally update the per-sensor csv files:
#   1. find the first day that is not yet covered by the existing files,
#   2. download the daily archive files for the known sensor IDs,
#   3. filter them by the configured coordinate boxes and append the result
#      to the "<yyyy>-<mm>_<sensor>_daily.csv" file of the current month.

# get all csv files in root directory and the current date
all_files = glob.glob(ROOT_DIR + "/*.csv")
today = datetime.today()

# folder that holds the raw daily downloads
daily_dir = os.path.join(ROOT_DIR, "daily")

# loop over all defined sensors
for sensor in SENSORS:
    # store the corresponding files in a list, newest first
    # (match on the file name only, so the directory path cannot interfere)
    sensor_files = sorted(
        (path for path in all_files if sensor in os.path.basename(path)),
        reverse=True,
    )
    # without any previous file neither the start date nor the sensor IDs
    # can be determined, so there is nothing to update for this sensor
    if not sensor_files:
        print(f"No existing csv file found for sensor '{sensor}' in {ROOT_DIR}, skipping")
        continue
    latest_file = sensor_files[0]
    latest_name = os.path.basename(latest_file)

    # get the first date that is not already in the monthly csv
    start_date = None
    # If the latest file includes 'daily', it could be incomplete.
    # Then get the date from the timestamps.
    if 'daily' in latest_name:
        df_daily_last = pd.read_csv(latest_file)
        start_date = pd.to_datetime(df_daily_last['timestamp']).dt.date.max()
        start_date += pd.Timedelta(1, unit='days')
    # else: get the date from the filename ("<yyyy>-<mm>_..."), next month
    else:
        start_year = int(latest_name.split('-')[0][-4:])
        start_month = int(latest_name.split('_')[0][-2:]) + 1
        if start_month > 12:
            start_month = 1
            start_year += 1
        # zero-pad the month; a hard-coded leading "0" breaks for Oct-Dec
        start_date = pd.to_datetime(f"{start_year}-{start_month:02d}-01").date()
    # make a date range of the dates between start_date and today
    dates = pd.date_range(start_date, today, freq='D')

    # get the sensor IDs from (at most) the 12 most recent files
    sensor_ids = pd.concat(
        [pd.read_csv(old_file)['sensor_id'] for old_file in sensor_files[:12]]
    ).unique()

    # make possible download URLs from date, sensor and sensor_id
    possible_urls = {}
    for date in dates:
        day = date.strftime('%Y-%m-%d')
        for sensor_id in sensor_ids:
            file_name = f"{day}_{sensor}_sensor_{sensor_id}.csv"
            possible_urls[file_name] = ROOT_URL + day + '/' + file_name

    # make a daily folder if it doesn't exist
    os.makedirs(daily_dir, exist_ok=True)

    # loop through possible URLs
    for name, url in possible_urls.items():
        csv_path = os.path.join(daily_dir, name)
        # skip files that were already downloaded
        if os.path.isfile(csv_path):
            continue
        # Request errors are intentionally not swallowed: a silently skipped
        # day would never be retried, because later runs start after the
        # newest timestamp that made it into the merged file.
        response = requests.get(url, timeout=50)
        # only store successful responses that are not the "not found" page
        if response.ok and b'The requested URL was not found on this server.' not in response.content:
            with open(csv_path, 'wb') as f:
                f.write(response.content)
            print(f"{name} downloaded")

    # get all daily files
    all_daily_files = glob.glob(os.path.join(daily_dir, "*.csv"))

    # load the latest file
    df_daily_last = pd.read_csv(latest_file)
    # get the last date and add 1 day to get the start_date
    last_date = pd.to_datetime(df_daily_last['timestamp']).dt.date.max()
    start_date = last_date + pd.Timedelta(1, unit='days')

    # make the file name (month always zero-padded, always below ROOT_DIR)
    month_prefix = f"{start_date.year}-{start_date.month:02d}"
    file = os.path.join(ROOT_DIR, f"{month_prefix}_{sensor}_daily.csv")

    # make_new_file decides whether the header has to be written:
    # if we switched to the next month we need a new file, and a target that
    # does not exist yet must not be created header-less by appending
    make_new_file = last_date.month != start_date.month or not os.path.isfile(file)

    # loop through all daily files in chronological (file name) order
    for daily_file in sorted(all_daily_files):
        daily_name = os.path.basename(daily_file)
        # only files of the relevant month, year and sensor are read
        if not daily_name.startswith(month_prefix) or sensor not in daily_name:
            continue
        # daily files are named "<yyyy-mm-dd>_<sensor>_sensor_<id>.csv"
        try:
            file_date = datetime.strptime(daily_name[:10], '%Y-%m-%d').date()
        except ValueError:
            continue
        # days up to last_date are already part of the merged data;
        # appending them again would duplicate rows
        if file_date <= last_date:
            continue

        df_daily = pd.read_csv(daily_file, sep=';')
        for (lat_min, lat_max), (lon_min, lon_max) in zip(LAT_RANGE, LON_RANGE):
            # filter for the coordinates defined above (bounds are exclusive)
            df_temp = df_daily[
                (df_daily['lat'] > lat_min)
                & (df_daily['lat'] < lat_max)
                & (df_daily['lon'] > lon_min)
                & (df_daily['lon'] < lon_max)
            ]
            # if we need a new file make one, otherwise append to an existing
            if make_new_file:
                df_temp.to_csv(file, header=True, index=False)
                make_new_file = False
            else:
                df_temp.to_csv(file, mode='a', header=False, index=False)
