d
# Dataset-Mounts-Test
The purpose of this notebook is to facilitate testing of our systems.

In [0]:
spark.conf.set("com.databricks.training.module-name", "dataset-mounts-test")

In [0]:
%run ./Dataset-Mounts

In [0]:
%scala

// Wall-clock start of the whole run (epoch milliseconds); the final summary cell subtracts it
// from the current time to report the total duration.
val testStart: Long = System.currentTimeMillis()
// Prefix shared by every temporary mount point created by testRegion.
val mountPointBase: String = "/mnt/training-test"

/** Best-effort unmount of `mountPoint`.
  *
  * Any `Exception` thrown by `dbutils.fs.unmount` is swallowed and reported on stdout as
  * "Not mounted", so calling this for a path that is not mounted does not fail the caller.
  *
  * @param mountPoint the mount point to remove
  */
def unmount(mountPoint:String):Unit = {
  try dbutils.fs.unmount(mountPoint)
  catch {
    // The exception itself is not inspected; every failure is reported the same way.
    case _: Exception => println(s"Not mounted: $mountPoint")
  }
}

/** Mounts the dataset source of a single region and validates its contents.
  *
  * The mount point is `mountPointBase`, the lower-cased `regionType` and `regionName`
  * joined with dashes. Any exception raised while mounting or validating propagates to
  * the caller; the mount is not removed here.
  *
  * @param regionType label of the cloud being tested (the driver cells pass "AWS" / "Azure")
  * @param regionName region identifier; handed to `mapper` and used in the mount point name
  * @param mapper     resolves a region name to the pair (source, extra mount configs)
  */
def testRegion(regionType:String, regionName:String, mapper: (String) => (String,Map[String,String])):Unit = {
  val startedAt = System.currentTimeMillis()

  val (sourceUri, mountConfigs) = mapper(regionName)
  val regionMount = Seq(mountPointBase, regionType.toLowerCase, regionName).mkString("-")
  println(s"""\nTesting the $regionType region $regionName ($regionMount)""")

  // NOTE(review): the meaning of the two boolean flags is defined by mountSource, which is
  // not part of this notebook (presumably pulled in via %run) - confirm there.
  mountSource(true, false, regionMount, sourceUri, mountConfigs)
  // Pause for five seconds between mounting and listing the mount.
  Thread.sleep(5000L)
  validateDatasets(regionMount)

  val elapsedSeconds = (System.currentTimeMillis() - startedAt) / 1000.0
  println(f"...all tests passed in $elapsedSeconds%1.2f seconds!")
}

/** Compares the listing of `target` under `mountPoint` with the listing of the same
  * `target` under the reference mount `/mnt/training`.
  *
  * Entries are matched by name and compared by size. Three kinds of discrepancy are reported:
  *  - "Missing file": present under `/mnt/training` but absent under `mountPoint`
  *  - "Extra file": present under `mountPoint` only (names ending in `_$folder$` are ignored)
  *  - "Size mismatch": present on both sides with different sizes
  *
  * All discrepancies are printed, sorted by name, before the exception is thrown.
  *
  * @param mountPoint root of the mount under test
  * @param target     path relative to the mount root; may be empty, a directory or a file
  * @throws IllegalStateException if at least one discrepancy was found
  */
def validateDataset(mountPoint:String, target:String):Unit = {
  val path = s"$mountPoint/$target"

  // name -> size for each side; the reference listing is read first.
  val expected: Map[String, Long] =
    dbutils.fs.ls(s"/mnt/training/$target").map(f => f.name -> f.size).toMap
  val actual: Map[String, Long] =
    dbutils.fs.ls(path).map(f => f.name -> f.size).toMap

  // Builds a readable location for an entry without gluing the name straight onto `path`.
  def locate(name: String): String =
    if (path.endsWith("/")) s"$path$name"
    // Presumably `target` is a single file and the listing returned the file itself - TODO confirm.
    else if (path.endsWith(s"/$name")) path
    else s"$path/$name"

  // Sorted so that the report is stable from run to run.
  val names = (expected.keySet ++ actual.keySet).toSeq.sorted

  val errors: Seq[String] = names.flatMap { name =>
    (expected.get(name), actual.get(name)) match {
      case (Some(want), Some(got)) if want == got =>
        None // Everything matches up... no issue here.
      case (Some(want), Some(got)) =>
        Some(s"Size mismatch: ${locate(name)} (expected $want bytes, found $got bytes)")
      case (None, Some(_)) =>
        if (name.endsWith("_$folder$")) None
        else Some(s"Extra file: ${locate(name)}")
      case (Some(_), None) =>
        Some(s"Missing file: ${locate(name)}")
      case (None, None) =>
        None // Cannot happen: every name comes from one of the two listings.
    }
  }

  if (errors.nonEmpty) {
    println(errors.mkString("\n"))
    throw new IllegalStateException(s"Errors were found while processing $path")
  }
}


/** Validates every known dataset path under `mountPoint`.
  *
  * Each entry below is passed to `validateDataset` in list order; the first entry (the
  * empty string) checks the root of the mount. Validation stops at the first path for
  * which `validateDataset` throws.
  *
  * @param mountPoint root of the mount under test
  */
def validateDatasets(mountPoint:String): Unit = {
  val datasetPaths = List(
    "",
    "301/",
    "Chicago-Crimes-2018.csv",
    "City-Data.parquet/",
    "EDGAR-Log-20170329/",
    "UbiqLog4UCI/",
    "_META/",
    "adventure-works/",
    "airbnb/",
    "airbnb-sf-listings.csv",
    "asa/",
    "auto-mpg.csv",
    "bigrams/",
    "bikeSharing/",
    "bostonhousing/",
    "cancer/",
    "countries/",
    "crime-data-2016/",
    "data/",
    "data-cleansing/",
    "databricks-blog.json",
    "databricks-datasets/",
    "dataframes/",
    "day-of-week/",
    "definitive-guide/",
    "dl/",
    "gaming_data/",
    "global-sales/",
    "graphx-demo/",
    "initech/",
    "ip-geocode.parquet/",
    "iris/",
    "mini_newsgroups/",
    "mnist/",
    "movie-reviews/",
    "movielens/",
    "movies/",
    "online_retail/",
    "philadelphia-crime-data-2015-ytd.csv",
    "purchases.txt",
    "sensor-data/",
    "ssn/",
    "stopwords",
    "structured-streaming/",
    "test.log",
    "tom-sawyer/",
    "tweets.txt",
    "twitter/",
    "wash_dc_crime_incidents_2013.csv",
    "wash_dc_crime_incidents_2015-10-03-to-2016-10-02.csv",
    "weather/",
    "wikipedia/",
    "wine.parquet/",
    "word-game-dict.txt",
    "zip3state.csv",
    "zips.json"
  )
  datasetPaths.foreach(validateDataset(mountPoint, _))
}

In [0]:
%scala
// AWS regions exercised by the AWS driver cell; names are normalized to lower case.
val awsRegions: List[String] = List(
  "us-west-2",
  "ap-northeast-1",
  "ap-northeast-2",
  "ap-south-1",
  "ap-southeast-1",
  "ap-southeast-2",
  "ca-central-1",
  "eu-central-1",
  "eu-west-1",
  "eu-west-2",
  "eu-west-3",
  "sa-east-1",
  "us-east-1",
  "us-east-2"
).map(name => name.toLowerCase())

// Azure regions exercised by the Azure driver cell; names are normalized to lower case.
// Each region is listed exactly once so that no region is mounted and validated twice.
val azureRegions: List[String] = List(
 "AustraliaCentral",
 "AustraliaCentral2",
 "AustraliaEast",
 "AustraliaSoutheast",
 "CanadaCentral",
 "CanadaEast",
 "CentralIndia",
 "CentralUS",
 "EastAsia",
 "EastUS",
 "EastUS2",
 "JapanEast",
 "JapanWest",
 "NorthCentralUS",
 "NorthEurope",
 "SouthCentralUS",
 "SouthIndia",
 "SoutheastAsia",
 "UKSouth",
 "UKWest",
 "WestCentralUS",  // Azure Databricks isn't available in region, but we historically copied data here anyway.
 "WestEurope",
 "WestIndia",
 "WestUS",
 "WestUS2"
).map(_.toLowerCase())

In [0]:
%scala
// Mount and validate each AWS region in list order; an exception from any region aborts the cell.
awsRegions.foreach { awsRegion =>
  testRegion("AWS", awsRegion, getAwsMapping _)
}

In [0]:
%python
# Remove every mount whose mount point contains "training-test".
test_mount_points = [info[0] for info in dbutils.fs.mounts() if "training-test" in info[0]]
for mount_point in test_mount_points:
  dbutils.fs.unmount(mount_point)

In [0]:
%scala
// Mount and validate each Azure region in list order; an exception from any region aborts the cell.
azureRegions.foreach { azureRegion =>
  testRegion("Azure", azureRegion, getAzureMapping _)
}

In [0]:
%scala
// Total elapsed time since testStart was captured, converted from milliseconds to minutes.
val totalMinutes = (System.currentTimeMillis() - testStart) / 1000.0 / 60.0
println(f"...all tests passed in $totalMinutes%1.2f minutes!")

In [0]:
%python
# Final cleanup: unmount everything whose mount point contains "training-test".
leftover_mount_points = [entry[0] for entry in dbutils.fs.mounts() if "training-test" in entry[0]]
for leftover in leftover_mount_points:
  dbutils.fs.unmount(leftover)
