In [0]:
%pip install -r requirements.txt
# Restart the Python process after the install (presumably so the newly
# installed packages are the ones picked up by later imports — confirm
# against the Databricks docs). Keep this as the first cell of the notebook.
dbutils.library.restartPython()

In [0]:
import yaml 
import os 
import requests
import zipfile
import io


In [0]:
# Parse the workspace YAML configuration into a plain dict
with open('ws_config.yaml', mode='r') as config_file:
    config = yaml.safe_load(config_file)

# Split out the two sections consumed by the rest of the notebook:
# environment variable settings and Unity Catalog names
env_vars, unity_catalog_vars = (
    config['environment_variables'],
    config['unity_catalog'],
)

In [0]:
# Set ENV vars using yaml config.
# os.environ only accepts str values, while YAML parses unquoted scalars such
# as `True` or `1` into bool/int, so every value is passed through str() first
# (a no-op for values that are already strings).
for env_key in ('CUDA_LAUNCH_BLOCKING', 'DATABRICKS_HOST', 'DATABRICKS_WORKSPACE_ID', 'DEBUG'):
    os.environ[env_key] = str(env_vars[env_key])

# Get UC vars from config: the catalog / schema / volume used by the cells below
catalog_name = unity_catalog_vars['catalog']
schema_name = unity_catalog_vars['schema']
volume_name = unity_catalog_vars['volume']

# Get Databricks secrets information. The config does not hold the token itself,
# only the secret scope and key under which it is stored.
databricks_token_scope = env_vars['DATABRICKS_TOKEN']['scope']
databricks_token_key = env_vars['DATABRICKS_TOKEN']['key']

# Set Databricks token using Databricks secrets; it is kept both in `db_token`
# and in the DATABRICKS_TOKEN environment variable.
db_token = dbutils.secrets.get(scope=databricks_token_scope, key=databricks_token_key)
os.environ['DATABRICKS_TOKEN'] = db_token

In [0]:
# Create the Unity Catalog assets (catalog, schema, volume) if they do not exist

try:
    spark.sql(f"CREATE CATALOG IF NOT EXISTS {catalog_name}")
except Exception as e:
    # Only a quota-related failure is tolerated here; any other error is
    # re-raised unchanged (bare `raise` keeps the original traceback).
    if "quota" not in str(e).lower():
        raise
    print("Quota limit reached for catalog creation.")
    print(f"Full error: {str(e)} \n")

# Quota limits affecting the catalog, but schema and volume should work fine.
# Errors from these statements are intentionally not caught: they propagate
# and stop the notebook.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema_name}")
spark.sql(f"CREATE VOLUME IF NOT EXISTS {catalog_name}.{schema_name}.{volume_name}")
print(f"Successfully created schema: {schema_name}, and volume: {volume_name} in catalog: {catalog_name}")

In [0]:
# Use dbutils.fs.mkdirs instead of os.makedirs to create the directories
# (here under a Unity Catalog volume path) due to limitations with
# Databricks Connect; otherwise may receive:
# PermissionError: [Errno 13] Permission denied

# Config project structure directory
project_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}"

# Create project structure. All sub-directory paths end with a trailing slash;
# later cells concatenate onto `data_path` directly.
training_path = f"{project_path}/training_runs/"  # fixed: a stray ' used to be part of this path
result_path = f"{project_path}/training_results/"
data_path = f'{project_path}/data/'
raw_model_path = f'{project_path}/raw_model/'

# for cache related to ultralytics
os.environ['ULTRALYTICS_CACHE_DIR'] = raw_model_path

for project_dir in (training_path, result_path, data_path, raw_model_path):
    dbutils.fs.mkdirs(project_dir)

# Show the resulting layout as the cell output
dbutils.fs.ls(project_path)

### Optional DBFS-based code:

##### A more "traditional" way: set up the project folder under DBFS.
##### dbfs_project_location = '/dbfs/FileStore/cv_project_location/yolo/'
```python
dbfs_project_location = '/dbfs/tmp/cv_project_location/yolo/'
os.makedirs(dbfs_project_location, exist_ok=True)
```

##### Ephemeral /tmp/ project location on the VM
```python
tmp_project_location = "/tmp/training_results/"
os.makedirs(tmp_project_location, exist_ok=True)
```

In [0]:
%sh
# -p: do not fail when the directory already exists, so the cell can be re-run
mkdir -p coco_files
# -f: exit non-zero on an HTTP error instead of saving the error page as coco8.yaml
# -L: follow redirects
curl -fL https://github.com/ultralytics/ultralytics/raw/main/ultralytics/cfg/datasets/coco8.yaml -o coco_files/coco8.yaml

In [0]:
# Point the downloaded dataset config at the Volumes data directory by
# rewriting the dataset path inside the YAML text.
local_coco_path = 'coco_files/coco8.yaml'
default_coco_path = '../datasets/coco8'

with open(local_coco_path, 'r') as file:
    data = file.read()

# str.replace is silently a no-op when the placeholder is missing (e.g. the
# downloaded file changed its default path). Warn in that case, unless the
# file already contains the Volumes path from a previous run of this cell.
if default_coco_path not in data and data_path not in data:
    print(
        f"WARNING: '{default_coco_path}' not found in {local_coco_path}; "
        "the dataset path was NOT rewritten to the Volumes path."
    )

# Replace the default path with the Volumes path
data = data.replace(default_coco_path, data_path)

with open(local_coco_path, 'w') as file:
    file.write(data)

In [0]:
# Parse the rewritten dataset config into a dict. This is done unconditionally:
# the download cell that follows indexes `data['download']` / `data['path']`,
# which fails on the raw YAML string if parsing only happens in debug mode.
with open(local_coco_path, 'r') as file:
    data = yaml.safe_load(file)

# Only the inspection output is gated on the DEBUG flag
if os.environ.get('DEBUG') == 'True':
    display(data)
    display(data['path'])

In [0]:
# `data` is raw YAML text until it has been run through yaml.safe_load; make
# sure we hold the parsed mapping before indexing into it.
if not isinstance(data, dict):
    with open(local_coco_path, 'r') as file:
        data = yaml.safe_load(file)

# Download the dataset archive. Fail loudly on an HTTP error instead of handing
# an error page to zipfile, and do not hang forever on a stalled connection.
download_response = requests.get(data['download'], timeout=60)
download_response.raise_for_status()

# Unpack the in-memory archive into the dataset path from the config;
# the context manager closes the archive afterwards.
with zipfile.ZipFile(io.BytesIO(download_response.content)) as raw_coco_data:
    raw_coco_data.extractall(data['path'])

In [0]:
# List the extracted dataset folder to confirm the files landed in the volume
dbutils.fs.ls(f"{data_path}coco8/")


## Now the coco8 data is in the UC Volume ready for training and evaluation! 

#### What's Next: [01-data-transformation](https://adb-984752964297111.11.azuredatabricks.net/editor/notebooks/4389956761274528?o=984752964297111#command/4389956761274529)

In [0]:
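# Optional alternative (disabled): download and unzip the dataset with shell
# tools instead of requests/zipfile. The %python and %sh parts below would
# need to live in separate cells.
# %python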
# os.environ["DOWNLOAD_PATH"] = data['download']
# os.environ["PERSIST_PATH"] = data['path']

# %sh

# mkdir -p coco_tmp
# curl -L $DOWNLOAD_PATH -o coco_tmp/coco8.zip
# unzip -o coco_tmp/coco8.zip -d coco_tmp