// Databricks notebook source exported at Sun, 21 Feb 2016 05:12:57 UTC
 #### This notebook downloads the latest Pagecounts file from the Wikimedia Foundation to a staging folder in S3.

In [None]:
import scala.io.Source
import java.text.SimpleDateFormat
import java.util.Calendar
import sys.process._
import scala.language.postfixOps
import java.net.URL
import java.io.File
import scala.util.matching.Regex
import scala.collection.mutable

 #### Decide what the latest hourly pagecounts file is:

 The function in the next cell will:
* Check the current year and month locally
* Go to wikimedia.org to download the webpage (html file) for the current month's file dumps to the local Driver container
* Parse the downloaded webpage and find the latest file to download
* Return the URL for the latest file to download

In [None]:
/**
 * Figures out which hourly pagecounts file is the newest one published for the current month.
 *
 * Steps:
 *  1. Build the URL of this month's pagecounts-raw index page from the Driver's current date.
 *  2. Download that index page (HTML) to a local file on the Driver.
 *  3. Collect the text of every `<a>` element that is a pagecounts file name.
 *  4. Pick the file name carrying the largest timestamp.
 *
 * @return the full URL of the newest `pagecounts-YYYYMMDD-HHMMSS.gz` file
 * @throws IllegalStateException if the index page does not list any pagecounts file
 */
def decideLatestFile():String = {

  // Construct the URL of this month's pagecounts index page at WMF.
  // NOTE(review): the date is formatted in the Driver's default time zone; the dump folders are
  // presumably organised by UTC, which matters around a month boundary -- confirm.
  val today = new java.util.Date
  val year = new SimpleDateFormat("yyyy").format(today)
  val month = new SimpleDateFormat("MM").format(today)
  val todaysURL = s"https://dumps.wikimedia.org/other/pagecounts-raw/$year/$year-$month"

  // Download the index page to a local file on the Driver
  val indexFile = new File("/home/ubuntu/databricks/driver/thisMonthsPagecountFiles.txt")
  (new URL(todaysURL) #> indexFile).!!

  // Read the local file into a String, always releasing the file handle
  val source = scala.io.Source.fromFile(indexFile)
  val currentPagecountsWebpage = try source.mkString finally source.close()

  // Pull the link text out of every anchor, e.g. for
  //   <li><a href="pagecounts-20120501-010000.gz">pagecounts-20120501-010000.gz</a>, size 67M</li>
  // the captured group is "pagecounts-20120501-010000.gz".
  val anchorText = "<a[^>]*>([^<]+)</a>".r

  // Only complete pagecounts file names are of interest; other links on the page are ignored
  // instead of being fed into a number parser.
  val pagecountsName = """pagecounts-\d{8}-\d{6}\.gz""".r
  val pagecountNames = anchorText
    .findAllMatchIn(currentPagecountsWebpage)
    .map(_.group(1))
    .filter(name => pagecountsName.pattern.matcher(name).matches)
    .toList

  // Fail with a meaningful message rather than "empty.reduceLeft" when nothing is listed
  if (pagecountNames.isEmpty) {
    throw new IllegalStateException(s"No pagecounts files are listed at $todaysURL")
  }

  // Every name has the same fixed-width layout, so the lexicographically largest name is the
  // one with the newest timestamp.
  s"$todaysURL/${pagecountNames.max}"
}

 Call the decideLatestFile() function and store the returned URL string in value 'url':

In [None]:
// Resolve the newest pagecounts file once; `url` is passed to downloadLatestFile() further down
val url = decideLatestFile()

In [None]:
%sh ls -lh /home/ubuntu/databricks/driver/

In [None]:
%sh cat thisMonthsPagecountFiles.txt

 #### Download the latest pagecounts file to a shared S3 staging folder:

 First, check which hour's pagecount file is currently in the staging folder:

In [None]:
%fs ls /mnt/wikipedia-readwrite/pagecounts/staging

In [None]:
/**
 * Downloads the file at `url` to the Driver's local file system and then publishes it as the
 * only file in the shared staging folder on DBFS.
 *
 * @param url full URL of the file to download, e.g. the value returned by decideLatestFile()
 */
def downloadLatestFile(url:String): Unit = {
  // The file name is whatever follows the last '/', independent of how long the URL prefix is
  val baseFile = url.substring(url.lastIndexOf('/') + 1)
  require(baseFile.nonEmpty, s"URL does not end in a file name: $url")

  val temp = s"/home/ubuntu/databricks/driver/$baseFile"
  val targetDir = "dbfs:/mnt/wikipedia-readwrite/pagecounts/staging"

  // Download the file to the Driver's local file system first, so that a failed download
  // leaves the current contents of the staging folder untouched
  (new URL(url) #> new File(temp)).!!

  // Clear target directory/bucket
  try {
    dbutils.fs.ls(targetDir).foreach(f => dbutils.fs.rm(f.path, recurse=false))
  }
  catch {
    case _: java.io.FileNotFoundException => // nothing to clear if the folder does not exist yet
  }

  // Copy the file from the Driver's file system to S3
  dbutils.fs.cp(s"file://$temp", s"$targetDir/$baseFile")

  // Local cleanup is currently disabled, so the downloaded file stays on the Driver:
  //s"rm $temp" !!

  println(s"Sucessfully downloaded: $baseFile")
}

 This download should take about 1-2 minutes to complete:

In [None]:
// Fetch the file selected by decideLatestFile() into the staging folder; prints the file name when done
downloadLatestFile(url)

In [None]:
%sh ls -lh /home/ubuntu/databricks/driver/