# Donwload the files to the cluster
---

In [slurm/01_download_rhone.py](../slurm/01_download_rhone.py) there is a pipeline to download all files and folders to the cluster.
Without access to the nextcloud client it is not possible to download single files. So here we showcase how to get one folder and unpack it.

If you struggle setting it up, please refer to the instructions in the [condaenv folder's](../condaenv/) README.md.

In [1]:
### import libraries
import requests
import os
from tqdm import tqdm
import multiprocessing as mp
from datetime import datetime, timedelta
from dotenv import load_dotenv

# get the base path of the git repository
repo_dir = os.popen('git rev-parse --show-toplevel').read().strip()
###load the .env file
load_dotenv(dotenv_path=f"{repo_dir}/.env")

#get the environment vairables
base=os.getenv("BASE_FOLDER")

Set the URL and credentials

In [2]:
######## set the URL and credentials ########

# URL for the whole dataset:
base_url = os.getenv("NEXTCLOUD_BASE")
# set the credentials
username = os.getenv("NEXTCLOUD_USERNAME")
password = os.getenv("NEXTCLOUD_PW")

print(f"Cloudbase: {base_url}")

######## set the storage path ########
# Create the directory if it does not exist
storage_path = repo_dir #base

Cloudbase: https://cloud.scadsai.uni-leipzig.de/index.php/s/gozxE5r9YdwGL8w/download


As the folder names are not requesteable, we have to define them ourselves. In the slurm pipeline this is downloading the whole folder DAS_2020.zip and then unzips it. 

In [3]:
############# Get the folder names #############
# Define the start and end dates
start_date = datetime(2020, 7, 4)
end_date = datetime(2020, 8, 4)

# Initialize vars
current_date = start_date
folder_names = []

# Loop through each day from the start date to the end date
while current_date <= end_date:
    date_str = current_date.strftime("%Y%m%d")
    date_str_2 = date_str+"_2"
    folder_names.append([date_str, date_str_2])

    # Move to the next day
    current_date += timedelta(days=1)

def flatten(xss):
    return [x for xs in xss for x in xs]
    
folder_names=flatten(folder_names)  

Define a function to download a folder and unzip it

In [4]:
############# download the files #############
def request_folder(folder):
    
    #request the folder
    url= f"{base_url}?path=/&files={folder}"
    # url= f"{base_url}?path=%2F{folder}#"
    path = os.path.join(storage_path, f"{folder}.zip")
    
    if not os.path.exists(path.split('.zip')[0]):
        print("url:", url)
        response = requests.get(url, auth=(username, password), stream=True)
        print(response)
    
        if response.status_code == 200:
            print("File downloading...")
            
            size = int(response.headers.get("Content-Length", 0))
            
            progress = tqdm(response.iter_content(1024), f"Downloading {path.split('.zip')[0]}", total=size, unit="B", unit_scale=True, unit_divisor=1024)
            
            with open(path, 'wb') as file:
                for data in progress.iterable:
                    file.write(data)
                    progress.update(len(data))      
                    
            print("File saved successfully at", path)  
        else:
            print(f"File {path} not found. Status code:", response.status_code)
    else:
        print(f"File {path} already exists.")

Request the folder to the repository 
> Warning: This is around 170GB and will take some minutes!

In [None]:
request_folder(folder_names[:1][0]) 

url: https://cloud.scadsai.uni-leipzig.de/index.php/s/gozxE5r9YdwGL8w/download?path=/&files=20200704
<Response [200]>
File downloading...


Downloading /home/sc.uni-leipzig.de/ju554xqou/big-data-praktikum/20200704: 20.0GB [01:51, 195MB/s] 

Unzip the folder

In [6]:
# unzip the files
def unzip_file(file):
    os.chdir(storage_path)
    print("Unzipping", file)
    os.system(f"unzip {file} -d {storage_path}")
    print("Unzipping done for", file)
    
unzip_file(folder_names[:1][0]+".zip")

Unzipping 20200704.zip
Archive:  20200704.zip
Unzipping done for 20200704.zip


  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of 20200704.zip or
        20200704.zip.zip, and cannot find 20200704.zip.ZIP, period.


In [None]:
##### remove all files #####
# for folder in folder_names:
#     path = os.path.join(storage_path, f"{folder}.h5")
#     if os.path.exists(path):
#         os.remove(path)
#         print(f"File {path} removed.")
#     else:
#         print(f"File {path} does not exist.")