# Automated .csv download
Automated download of sensor data from [Sensor Community Archive](https://archive.sensor.community/csv_per_month/).</br>
Define `MONTH_START`, `MONTH_COUNT` and `SENSORS` for specifying the files that should be downloaded.</br>
Define `WAIT_BETWEEN_DOWNLOADS` to set the pause, in minutes, between finishing one file and starting the next download. A random whole number between the two values is used.</br>

In [None]:
import requests
from bs4 import BeautifulSoup as bs
import time
import os
import zipfile
import numpy as np
import pandas as pd

# --- what to download ---
MONTH_START = "2020-01"  # first month to fetch, written as yyyy-mm
MONTH_COUNT = 1  # number of consecutive months fetched, beginning with MONTH_START
URL = "https://archive.sensor.community/csv_per_month/"
# Sensor types whose archives are fetched; uncomment an entry to include it.
SENSORS = [
    'bme280',
    # 'bmp180',
    'bmp280',
    'dht22',
    # 'ds18b20',
    # 'hpm',
    # 'htu21d',
    # 'pms1003',
    # 'pms3003',
    # 'pms5003',
    # 'pms6003',
    # 'pms7003',
    # 'ppd42ns',
    'sds011',
]

# --- where to store it and how long to pause ---
ROOT_DIR = os.path.join(os.curdir, "../data", "")
WAIT_BETWEEN_DOWNLOADS = (0, 5)  # bounds of the random pause, in minutes

# --- geographic filter ---
# One row per region: (label, latitude bounds, longitude bounds).
# LAT_RANGE and LON_RANGE are derived from this table, so entry j of one
# always belongs to the same region as entry j of the other.
_REGIONS = (
    ("Bremen", (53.013, 53.1456), (8.67, 8.9334)),
    ("Frankfurt", (50.030681, 50.205692), (8.430634, 8.919868)),
)
LAT_RANGE = [lat_bounds for _, lat_bounds, _ in _REGIONS]
LON_RANGE = [lon_bounds for _, _, lon_bounds in _REGIONS]

def write_to_log(log_file, *args):
    """Append the given text fragments to a log file.

    Args:
        log_file: path of the log file; it is opened in append mode, so it
            is created when missing and existing content is kept
        *args: strings written one after another, with no separator added
    """
    with open(log_file, mode='a') as handle:
        handle.writelines(args)


# remember when the run began so the total duration can be reported at the end
script_start = time.time()

# make log file if it doesn't exist (one log file per calendar day)
date = time.strftime('%Y_%m_%d')
log_file_name = date + "_download_log.txt"
# pass the parts separately so the path is also correct when ROOT_DIR has
# no trailing separator
log_file_dir = os.path.join(ROOT_DIR, log_file_name)
print(log_file_dir)
# makedirs creates missing parent folders and accepts an existing folder
os.makedirs(ROOT_DIR, exist_ok=True)
if os.path.isfile(log_file_dir):
    print('log file already exists.')
    print('New entries will be appended.')
else:
    # create an empty log file; the context manager closes the handle
    with open(log_file_dir, "w"):
        pass

write_to_log(log_file_dir, "Session started at " + time.strftime('%Y_%m_%d-%H_%M_%S') + '\n')

# make list of relevant months
# The months are counted as a zero-based index from the start month, so the
# December -> January rollover is plain integer arithmetic. Every entry is
# re-formatted as zero-padded yyyy-mm, and MONTH_COUNT = 0 gives an empty list.
# A MONTH_START that is not "<year>-<month>" raises ValueError here.
start_year, start_month = (int(part) for part in MONTH_START.split('-'))
months = []
for offset in range(MONTH_COUNT):
    year_shift, month_index = divmod(start_month - 1 + offset, 12)
    months.append(f"{start_year + year_shift:04d}-{month_index + 1:02d}")

write_to_log(log_file_dir, f"Months: {months}\n")

# get download links for relevant months and sensors
#
# For every month: read the archive listing, pick the .zip files of the
# configured sensors, download and unzip each one, keep only the rows inside
# the configured latitude/longitude boxes and delete the raw data afterwards.

# The target folders are the same for every file, so create them once.
# makedirs creates missing parents and accepts folders that already exist.
directory = ROOT_DIR
processed_dir = os.path.join(ROOT_DIR, "processed", "")
os.makedirs(directory, exist_ok=True)
os.makedirs(processed_dir, exist_ok=True)

# define the chunk size (rows) that is read from .csv
chunksize = 10 ** 6

for month in months:
    # get url
    url_curr = URL + month + '/'
    print(url_curr)
    write_to_log(log_file_dir, f"URL: {url_curr}\n")

    # load the listing; a timeout avoids hanging forever and a failed request
    # is reported instead of being parsed as if it were a listing
    try:
        r = requests.get(url_curr, timeout=50)
        r.raise_for_status()
    except requests.RequestException as err:
        print(f"Could not load {url_curr}: {err}")
        write_to_log(log_file_dir, f"\tCould not load {url_curr}: {err}\n")
        continue

    # find download links according to the sensors list
    # (explicit parser: the result no longer depends on which optional
    # parsers happen to be installed)
    soup = bs(r.text, "html.parser")
    names = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href and '.zip' in href and any(sensor in href for sensor in SENSORS):
            names.append(href)
    print("Files to download:")
    for file_name in names:
        print(file_name)
    write_to_log(log_file_dir, f"\tFiles: {names}\n")

    # download files
    for name in names:
        url = url_curr + name

        # paths of the downloaded .zip, the extracted .csv and the filtered .csv
        name_csv = name.split('.')[0] + ".csv"
        full_path = os.path.join(directory, name)
        csv_full = os.path.join(directory, name_csv)
        csv_processed_dir = os.path.join(processed_dir, name_csv)

        # if the processed .csv file already exists nothing is left to do
        if os.path.isfile(csv_processed_dir):
            write_to_log(log_file_dir, f"\t\t{csv_processed_dir} already exists... download and processing {name} gets skipped.\n")
            continue

        if os.path.isfile(csv_full):
            write_to_log(log_file_dir, f"\t\t{csv_full} already exists... download of {name} gets skipped.\n")
        elif os.path.isfile(full_path):
            write_to_log(log_file_dir, f"\t\t{full_path} already exists... download of {name} gets skipped.\n")
        else:
            # download .zip file if it doesn't exist yet
            print(f"Start downloading {name}.")
            start = time.time()
            # Stream into a ".part" file and rename it when complete: the
            # archive is never held in memory as a whole, and an aborted
            # download cannot be mistaken for a finished .zip on the next run.
            partial_path = full_path + ".part"
            try:
                with requests.get(url, timeout=50, stream=True) as response:
                    response.raise_for_status()
                    with open(partial_path, 'wb') as f:
                        for block in response.iter_content(chunk_size=1024 * 1024):
                            f.write(block)
            except requests.RequestException as err:
                if os.path.isfile(partial_path):
                    os.remove(partial_path)
                print(f"Download of {name} failed: {err}")
                write_to_log(log_file_dir, f"\t\t{name}\n", f"\t\t\tDownload failed: {err}\n")
                continue
            os.replace(partial_path, full_path)
            end = time.time()
            print(f"The download took {round((end - start) / 60, 1)} minutes.")
            write_to_log(log_file_dir, f"\t\t{name}\n", f"\t\t\tDownload successfully finished after {(end - start) / 60} minutes.\n")

        if os.path.isfile(full_path):
            # unzip file
            print("Unzip file...")
            with zipfile.ZipFile(full_path, 'r') as zip_ref:
                zip_ref.extractall(directory)
            print("Unzipping finished")
            write_to_log(log_file_dir, f"\t\t\t{name} unzipped\n")

            # delete .zip
            os.remove(full_path)
            print(".zip file deleted")
            write_to_log(log_file_dir, f"\t\t\t.zip file deleted\n")

        # The filtered rows go to a ".part" file that is renamed at the end,
        # so an interrupted run leaves no half-written file that later runs
        # would skip as "already exists".
        partial_csv = csv_processed_dir + ".part"
        first_write = True

        # read .csv chunkwise
        with pd.read_csv(csv_full, sep=";", chunksize=chunksize) as reader:
            write_to_log(log_file_dir, f"\t\t\tprocessing {csv_full}\n")
            print(f"processing {csv_full}\n")
            for i, chunk in enumerate(reader):
                # filter data by desired longitude and latitude
                for j, (lat, lon) in enumerate(zip(LAT_RANGE, LON_RANGE)):
                    in_region = (
                        (chunk['lat'] > lat[0]) & (chunk['lat'] < lat[1])
                        & (chunk['lon'] > lon[0]) & (chunk['lon'] < lon[1])
                    )
                    df_temp = chunk[in_region]
                    # make a new file (with header) for the very first write
                    # and append all subsequent ones
                    df_temp.to_csv(
                        partial_csv,
                        mode='w' if first_write else 'a',
                        header=first_write,
                        index=False,
                    )
                    first_write = False
                    write_to_log(log_file_dir, f"\t\t\t\twrote chunk #{i} for region #{j}\n")

        # publish the result only when something was written
        if not first_write:
            os.replace(partial_csv, csv_processed_dir)

        #delete original .csv file
        os.remove(csv_full)
        write_to_log(log_file_dir, f"\t\t\t\t{csv_full} deleted\n")
        print(f"{csv_full} deleted")

        # wait before next download starts
        # randint excludes its upper bound, hence the + 1: both configured
        # limits can be drawn and equal limits (e.g. (0, 0)) are accepted
        wait = np.random.randint(WAIT_BETWEEN_DOWNLOADS[0], WAIT_BETWEEN_DOWNLOADS[1] + 1)
        print(f"Wait for {wait} minutes")
        write_to_log(log_file_dir, f"\t\t\twait for {wait} minutes\n\n")
        time.sleep(wait * 60)
        print()

# report the total run time on the console and in the log
script_end = time.time()
elapsed_minutes = round((script_end - script_start) / 60, 1)
print(f"Finished script after {elapsed_minutes} minutes")
# The log is appended to across sessions; the trailing newline keeps the next
# session's first entry from being glued onto this line.
write_to_log(log_file_dir, f"Finished script after {elapsed_minutes} minutes\n")
