In [1]:
%%configure -f { "jars": ["wasb:///example/jars/sqljdbc41.jar"] }

Endpoint:
	http://hn0-spark1.drd1ytvzbg4upmyutmhhfsvgmd.jx.internal.cloudapp.net:8998

Current session ID number:
	None

Session configs:
	{u'jars': [u'wasb:///example/jars/sqljdbc41.jar'], 'kind': 'spark'}

Info for endpoint:
    Sessions:




In [2]:
/**
 * A named rectangle described by its two side lengths.
 *
 * @param name   label identifying the rectangle
 * @param width  width of the rectangle
 * @param height height of the rectangle
 */
case class Rectangle(
  name: String,
  width: Double,
  height: Double
)

Creating SparkContext as 'sc'
Creating SqlContext as 'sqlContext'
Creating HiveContext as 'hiveContext'
defined class Rectangle

In [3]:
/** Helpers for building JDBC connection strings for a SQL Server / Azure SQL database. */
object DatabaseUtilities {

  /** TCP port written into every generated connection string. */
  private val SqlServerPort = "1433"

  /**
   * Builds a SQL Server JDBC connection string with encryption enabled.
   *
   * The login is rendered as `user@server`, where `server` is the leading DNS label of
   * `sqlServerFQDN`, and `hostNameInCertificate` is the FQDN with that leading label replaced
   * by `*`.
   *
   * Values are inserted verbatim; no escaping of `;` or braces is performed.
   *
   * @param sqlServerFQDN    fully qualified host name of the SQL server
   * @param sqlDatabaseName  name of the database to connect to
   * @param databaseUsername login name, without the `@server` suffix
   * @param databasePassword password for `databaseUsername`
   * @return the assembled `jdbc:sqlserver://` connection string
   */
  def getSqlJdbcConnectionString(sqlServerFQDN: String, sqlDatabaseName: String,
                                 databaseUsername: String, databasePassword: String): String = {

    // Leading DNS label. takeWhile (unlike split('.')(0)) cannot throw on inputs made only of dots.
    val serverName = sqlServerFQDN.takeWhile(_ != '.')

    // Wildcard only the leading label. A plain String.replace would also rewrite later labels
    // that happen to equal the server name (e.g. "db.db.windows.net" -> "*.*.windows.net").
    val certificateHostname = "*" + sqlServerFQDN.drop(serverName.length)

    s"jdbc:sqlserver://$sqlServerFQDN:$SqlServerPort;database=$sqlDatabaseName;" +
      s"user=$databaseUsername@$serverName;password=$databasePassword;" +
      s"encrypt=true;hostNameInCertificate=$certificateHostname;loginTimeout=30;"
  }
}

defined module DatabaseUtilities

In [12]:
// Connection settings. The angle-bracket values are placeholders to be replaced with real values.
val sqlServerFQDN = "<SQL Sever FQDN>"
val sqlDatabaseName = "<SQL Database Name>"
val databaseUsername = "<SQL Database User Name>"
val databasePassword = "<SQL Database User Password>"

// Name of the table that the later cells create and populate.
val databaseTableName = "RectangleDetails"

// Assemble the JDBC connection string from the settings above.
val sqlDatabaseConnectionString: String =
  DatabaseUtilities.getSqlJdbcConnectionString(
    sqlServerFQDN,
    sqlDatabaseName,
    databaseUsername,
    databasePassword
  )

// Evaluate the value so the notebook echoes it as the cell result.
sqlDatabaseConnectionString


res24: String = jdbc:sqlserver://<SQL Sever FQDN>:1433;database=<SQL Database Name>;user=<SQL Database User Name>@<SQL Sever FQDN>;password=<SQL Database User Password>;encrypt=true;hostNameInCertificate=*;loginTimeout=30;

In [5]:
import java.sql.{Statement, Connection, DriverManager}

// Driver-side connection used to make sure the target table and its index exist.
val sqlDriverConnection: Connection = DriverManager.getConnection(sqlDatabaseConnectionString)

// Auto-commit is switched off so both DDL batches are committed together.
// If this setup step fails, the connection is released before the error propagates.
val sqlDriverStatement: Statement =
  try {
    sqlDriverConnection.setAutoCommit(false)
    sqlDriverConnection.createStatement()
  } catch {
    case scala.util.control.NonFatal(error) =>
      sqlDriverConnection.close()
      throw error
  }

try {
  // Create the table only when no user table with that name exists yet.
  sqlDriverStatement.addBatch(s"IF NOT EXISTS(SELECT * FROM sys.objects WHERE object_id" +
    s" = OBJECT_ID(N'[dbo].[$databaseTableName]') AND type in (N'U'))" +
    s"\nCREATE TABLE $databaseTableName(Name NVARCHAR(128) NOT NULL, Width FLOAT, Height FLOAT)")

  // Create the clustered index on Name only when it is not already present.
  sqlDriverStatement.addBatch(s"IF IndexProperty(Object_Id('$databaseTableName'), 'IX_RectangleName', 'IndexId') IS NULL" +
    s"\nCREATE CLUSTERED INDEX IX_RectangleName ON $databaseTableName(Name)")

  sqlDriverStatement.executeBatch()
  sqlDriverConnection.commit()
} catch {
  case scala.util.control.NonFatal(error) =>
    // Closing a connection with an open transaction has implementation-defined results,
    // so roll back explicitly; a rollback failure is attached to the original error.
    try { sqlDriverConnection.rollback() } catch {
      case scala.util.control.NonFatal(rollbackError) => error.addSuppressed(rollbackError)
    }
    throw error
} finally {
  // Always release the statement and the connection, even when the batch fails.
  try { sqlDriverStatement.close() } finally { sqlDriverConnection.close() }
}

In [7]:
import org.apache.spark.sql.DataFrame

/**
 * Adds a `saveToAzureSql` method to Spark DataFrames through an implicit conversion.
 * Bring it into scope with `import DataFrameExtensions._`.
 */
object DataFrameExtensions {

  // Imported locally so this object does not depend on imports made elsewhere in the session.
  import java.sql.{Connection, DriverManager, PreparedStatement}
  import scala.util.control.NonFatal

  /** Wraps a DataFrame so the [[ExtendedDataFrame]] methods can be called on it directly. */
  implicit def extendedDataFrame(dataFrame: DataFrame): ExtendedDataFrame = new ExtendedDataFrame(dataFrame)

  /** DataFrame wrapper that carries the SQL export method. */
  class ExtendedDataFrame(dataFrame: DataFrame) {

    /**
     * Inserts every row of the wrapped DataFrame into `[dbo].[sqlTableName]`.
     *
     * Each partition opens its own JDBC connection and sends its rows through a parameterized
     * INSERT in groups of up to 1000 rows, committing once per group. Values are bound as
     * statement parameters rather than formatted into the SQL text, so quotes inside strings
     * and null values are passed through safely.
     *
     * The target columns are the DataFrame's column names, in DataFrame order.
     *
     * @param sqlDatabaseConnectionString JDBC connection string used on every executor
     * @param sqlTableName                name of the target table in the `dbo` schema
     */
    def saveToAzureSql(sqlDatabaseConnectionString: String, sqlTableName: String): Unit = {

      // Bracket-quote identifiers so names containing spaces or reserved words stay valid.
      val quoteIdentifier: String => String = name => "[" + name.replace("]", "]]") + "]"

      val columnList: String = dataFrame.columns.map(quoteIdentifier).mkString(",")
      val placeholders: String = Seq.fill(dataFrame.columns.length)("?").mkString(",")
      val insertSql: String =
        s"INSERT INTO [dbo].${quoteIdentifier(sqlTableName)} ($columnList) VALUES ($placeholders)"

      // Number of rows sent and committed together.
      val batchSize: Int = 1000

      // The closure below deliberately uses only the local values above and the method
      // parameters, so it does not capture this wrapper (and with it the DataFrame).
      dataFrame.foreachPartition { partition =>

        // Avoid opening a connection for partitions that hold no rows.
        if (partition.hasNext) {

          val connection: Connection = DriverManager.getConnection(sqlDatabaseConnectionString)

          try {
            // One transaction per group keeps each group all-or-nothing.
            connection.setAutoCommit(false)

            val statement: PreparedStatement = connection.prepareStatement(insertSql)

            try {
              partition.grouped(batchSize).foreach { group =>

                group.foreach { record =>
                  // JDBC parameter indexes are 1-based.
                  record.toSeq.zipWithIndex.foreach { case (value, index) =>
                    statement.setObject(index + 1, value.asInstanceOf[AnyRef])
                  }
                  statement.addBatch()
                }

                statement.executeBatch()
                connection.commit()
              }
            } catch {
              case NonFatal(error) =>
                // Undo the partially sent group; keep the original failure as the primary error.
                try { connection.rollback() } catch {
                  case NonFatal(rollbackError) => error.addSuppressed(rollbackError)
                }
                throw error
            } finally {
              statement.close()
            }
          } finally {
            connection.close()
          }
        }
      }
    }
  }
}

defined module DataFrameExtensions

In [8]:
// Three sample rectangles used as input data for the cells that follow.
val rectangleList: List[Rectangle] = List(
  Rectangle("RectangleA", 10, 20),
  Rectangle("RectangleB", 30, 40),
  Rectangle("RectangleC", 50, 60)
)

rectangleList: List[Rectangle] = List(Rectangle(RectangleA,10.0,20.0), Rectangle(RectangleB,30.0,40.0), Rectangle(RectangleC,50.0,60.0))

In [9]:
// Build a DataFrame from the sample rectangles using the session's hiveContext.
val rectangleDataFrame = hiveContext.createDataFrame(rectangleList)

rectangleDataFrame: org.apache.spark.sql.DataFrame = [name: string, width: double, height: double]

In [10]:
// Bring the implicit DataFrame -> ExtendedDataFrame conversion into scope.
import DataFrameExtensions._

// saveToAzureSql is not a DataFrame method; the call resolves through the implicit conversion imported above.
rectangleDataFrame.saveToAzureSql(sqlDatabaseConnectionString, databaseTableName)