# Big Data - Final Work

## Initial configuration for Spark + JVM

In [None]:
%%init_spark
launcher.master = "local[*]"
launcher.driver_memory = '20g'
launcher.executor_memory = '20g'
launcher.verbose = 'true'

In [None]:
%%init_spark
launcher.conf.set("spark.app.name", "scalaXgbTest")
launcher.num_executors = 3
launcher.executor_cores = 7 //launcher.conf.spark.executor.cores = 8
launcher.conf.spark.task.cpus = 6
launcher.driver_memory = '4g'
launcher.executor_memory = '4g'
launcher.conf.set("spark.executor.heartbeatInterval", "6000s")
launcher.conf.set("spark.yarn.scheduler.heartbeat.interval-ms", "10000s")
launcher.conf.set("spark.network.timeout", "10000s")
launcher.conf.set("spark.yarn.executor.memoryOverhead", "8192")
launcher.conf.set("spark.sql.catalogImplementation", "hive")
launcher.jars = ["file://some/jar.jar", "xgboost-maven-0.82/xgboost4j-spark-0.82.jar", "xgboost-maven-0.82/xgboost4j-0.82.jar"]

In [2]:
println(sc.appName)
println(sc.master)

spylon-kernel
local[*]


In [3]:
import org.apache.spark.sql.types._      // include the Spark Types to define our schema
import org.apache.spark.sql.functions._  // include the Spark helper functions
import spark.implicits._                 // For implicit conversions like converting RDDs to DataFrames
import org.apache.spark.sql.expressions._

import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import spark.implicits._
import org.apache.spark.sql.expressions._


## DB JSON schema

In [4]:
val location_schema =
    MapType(StringType,
        new StructType()
            .add("accuracy", DoubleType)
            .add("address", StringType)
            .add("altitude", DoubleType)
            .add("country", StringType)
            .add("latitude", DoubleType)
            .add("longitude", DoubleType)
            .add("provider", StringType)
            .add("timestamp", 
             new StructType()
                .add("date", LongType)
                .add("day", LongType)
                .add("hours", LongType)
                .add("month", LongType)
                .add("nanos", LongType)
                .add("seconds", LongType)
                .add("time", LongType)
                .add("timezoneOffset", LongType)
                .add("year", LongType)
            )
            .add("uid", StringType)
        )

val schema = new StructType()
    .add("locations", location_schema)
    .add("user-locations",
        MapType(StringType, location_schema)
    )
    .add("users",
        MapType(StringType,
            new StructType()
                .add("email", StringType)
                .add("username", StringType)
        )
    )

location_schema: org.apache.spark.sql.types.MapType = MapType(StringType,StructType(StructField(accuracy,DoubleType,true), StructField(address,StringType,true), StructField(altitude,DoubleType,true), StructField(country,StringType,true), StructField(latitude,DoubleType,true), StructField(longitude,DoubleType,true), StructField(provider,StringType,true), StructField(timestamp,StructType(StructField(date,LongType,true), StructField(day,LongType,true), StructField(hours,LongType,true), StructField(month,LongType,true), StructField(nanos,LongType,true), StructField(seconds,LongType,true), StructField(time,LongType,true), StructField(timezoneOffset,LongType,true), StructField(year,LongType,true)),true), StructField(uid,StringType,true)),true)
schema: org.apache.spark.sql.types.StructType = S...


In [5]:
val df = spark.read.option("multiline", true).schema(schema).json("trackme-sample-data.json")

df: org.apache.spark.sql.DataFrame = [locations: map<string,struct<accuracy:double,address:string,altitude:double,country:string,latitude:double,longitude:double,provider:string,timestamp:struct<date:bigint,day:bigint,hours:bigint,month:bigint,nanos:bigint,seconds:bigint,time:bigint,timezoneOffset:bigint,year:bigint>,uid:string>>, user-locations: map<string,map<string,struct<accuracy:double,address:string,altitude:double,country:string,latitude:double,longitude:double,provider:string,timestamp:struct<date:bigint,day:bigint,hours:bigint,month:bigint,nanos:bigint,seconds:bigint,time:bigint,timezoneOffset:bigint,year:bigint>,uid:string>>> ... 1 more field]


In [6]:
df.printSchema()

root
 |-- locations: map (nullable = true)
 |    |-- key: string
 |    |-- value: struct (valueContainsNull = true)
 |    |    |-- accuracy: double (nullable = true)
 |    |    |-- address: string (nullable = true)
 |    |    |-- altitude: double (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- latitude: double (nullable = true)
 |    |    |-- longitude: double (nullable = true)
 |    |    |-- provider: string (nullable = true)
 |    |    |-- timestamp: struct (nullable = true)
 |    |    |    |-- date: long (nullable = true)
 |    |    |    |-- day: long (nullable = true)
 |    |    |    |-- hours: long (nullable = true)
 |    |    |    |-- month: long (nullable = true)
 |    |    |    |-- nanos: long (nullable = true)
 |    |    |    |-- seconds: long (nullable = true)
 |    |    |    |-- time: long (nullable = true)
 |    |    |    |-- timezoneOffset: long (nullable = true)
 |    |    |    |-- year: long (nullable = true)
 |    |    |-- uid: string (

In [7]:
val locationsDF = df.select(explode($"locations") as Seq("timestamp_id", "value"))
val userLocationsDF = df.select(explode($"user-locations") as Seq("uid", "timestamp"))
val usersDF = df.select(explode($"users") as Seq("uid", "user_attr"))

locationsDF: org.apache.spark.sql.DataFrame = [timestamp_id: string, value: struct<accuracy: double, address: string ... 7 more fields>]
userLocationsDF: org.apache.spark.sql.DataFrame = [uid: string, timestamp: map<string,struct<accuracy:double,address:string,altitude:double,country:string,latitude:double,longitude:double,provider:string,timestamp:struct<date:bigint,day:bigint,hours:bigint,month:bigint,nanos:bigint,seconds:bigint,time:bigint,timezoneOffset:bigint,year:bigint>,uid:string>>]
usersDF: org.apache.spark.sql.DataFrame = [uid: string, user_attr: struct<email: string, username: string>]


In [8]:
usersDF.printSchema()

root
 |-- uid: string (nullable = false)
 |-- user_attr: struct (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- username: string (nullable = true)



## Function to flatten schema

In [9]:
import org.apache.spark.sql.Column

def flattenSchema(schema: StructType, prefix: String = null) : Array[Column] = {
  schema.fields.flatMap(f => {
    val colName = if (prefix == null) f.name else (prefix + "." + f.name)

    f.dataType match {
      case st: StructType => flattenSchema(st, colName)
      case _ => Array(col(colName))
    }
  })
}

import org.apache.spark.sql.Column
flattenSchema: (schema: org.apache.spark.sql.types.StructType, prefix: String)Array[org.apache.spark.sql.Column]


## Flat struct DataFrames

In [10]:
val flatLocationsDF = locationsDF.select(flattenSchema(locationsDF.schema):_*)
val flatUserLocationsDF = userLocationsDF.select(flattenSchema(userLocationsDF.schema):_*)
val flatUsersDF = usersDF.select(flattenSchema(usersDF.schema):_*)

flatLocationsDF: org.apache.spark.sql.DataFrame = [timestamp_id: string, accuracy: double ... 16 more fields]
flatUserLocationsDF: org.apache.spark.sql.DataFrame = [uid: string, timestamp: map<string,struct<accuracy:double,address:string,altitude:double,country:string,latitude:double,longitude:double,provider:string,timestamp:struct<date:bigint,day:bigint,hours:bigint,month:bigint,nanos:bigint,seconds:bigint,time:bigint,timezoneOffset:bigint,year:bigint>,uid:string>>]
flatUsersDF: org.apache.spark.sql.DataFrame = [uid: string, email: string ... 1 more field]


In [11]:
flatLocationsDF.printSchema()

root
 |-- timestamp_id: string (nullable = false)
 |-- accuracy: double (nullable = true)
 |-- address: string (nullable = true)
 |-- altitude: double (nullable = true)
 |-- country: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- provider: string (nullable = true)
 |-- date: long (nullable = true)
 |-- day: long (nullable = true)
 |-- hours: long (nullable = true)
 |-- month: long (nullable = true)
 |-- nanos: long (nullable = true)
 |-- seconds: long (nullable = true)
 |-- time: long (nullable = true)
 |-- timezoneOffset: long (nullable = true)
 |-- year: long (nullable = true)
 |-- uid: string (nullable = true)



In [12]:
flatLocationsDF.show(false)

+-------------+------------------+-----------------------------------------------------------------------+------------------+-------+-----------+-----------+--------+----+---+-----+-----+---------+-------+-------------+--------------+----+----------------------------+
|timestamp_id |accuracy          |address                                                                |altitude          |country|latitude   |longitude  |provider|date|day|hours|month|nanos    |seconds|time         |timezoneOffset|year|uid                         |
+-------------+------------------+-----------------------------------------------------------------------+------------------+-------+-----------+-----------+--------+----+---+-----+-----+---------+-------+-------------+--------------+----+----------------------------+
|1602119776824|15.666000366210938|R. Mena Barreto, 182 - Botafogo, Rio de Janeiro - RJ, 22271-100, Brazil|14.90000057220459 |Brazil |-22.9556468|-43.1880249|fused   |7   |3  |22   |9    |824000

## Calculate Distance

### Element version

In [14]:
import scala.math._

def calculate_distance_elem(lat1:Double, lon1:Double, lat2:Double, lon2:Double):Double = {   
    val earth_radius = 6371e3;           // meters
    val phi1 = lat1 * Pi/180;                  // radians
    val phi2 = lat2 * Pi/180;                  // radians
    val delta_phi = phi2 - phi1;               // radians

    val delta_lampda = (lon2 - lon1) * Pi/180; // radians

    val a = sin(delta_phi/2)*sin(delta_phi/2) + cos(phi1)*cos(phi2)*sin(delta_lampda/2)*sin(delta_lampda/2);
    val c = 2*atan2(sqrt(a), sqrt(1-a));

    val d = earth_radius*c; // meters
    
    return d
}

val calculate_distance_elem_sqlfunc = udf(calculate_distance_elem(_,_,_,_))

import scala.math._
calculate_distance_elem: (lat1: Double, lon1: Double, lat2: Double, lon2: Double)Double
calculate_distance_elem_sqlfunc: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$3429/799971909@1622c15e,DoubleType,List(Some(class[value[0]: double]), Some(class[value[0]: double]), Some(class[value[0]: double]), Some(class[value[0]: double])),None,false,true)


### Test calculate_distance_elem function

In [16]:
val lat1 = -22.9556473
val lon1 = -43.1881019

val lat2 = -23.9556473
val lon2 = -44.1881019

val dist = calculate_distance_elem(lat1, lon1, lat2, lon2)

assert (dist == 150894.75616346067)

lat1: Double = -22.9556473
lon1: Double = -43.1881019
lat2: Double = -23.9556473
lon2: Double = -44.1881019
dist: Double = 150894.75616346067


### Column version

In [15]:
//import scala.math._
import org.apache.spark.sql.functions._

def calculate_distance_col(lat1:org.apache.spark.sql.Column, lon1:org.apache.spark.sql.Column, lat2:org.apache.spark.sql.Column, lon2:org.apache.spark.sql.Column):org.apache.spark.sql.Column = {   
    val earth_radius = 6371e3;           // meters
    val pi_over_180 = lit(Pi/180);
    val phi1 = lat1 * pi_over_180;                  // radians
    val phi2 = lat2 * pi_over_180;                  // radians
    val delta_phi = phi2 - phi1;               // radians

    val delta_lampda = (lon2 - lon1) * pi_over_180; // radians

    val a = sin(delta_phi/2)*sin(delta_phi/2) + cos(phi1)*cos(phi2)*sin(delta_lampda/2)*sin(delta_lampda/2);
    val c = lit(2)*atan2(sqrt(a), sqrt(lit(1)-a));

    val d = lit(earth_radius)*c; // meters
    
    return d;
}

// val calculate_distance_sqlfunc = udf(calculate_distance(_,_,_,_))

import org.apache.spark.sql.functions._
calculate_distance_col: (lat1: org.apache.spark.sql.Column, lon1: org.apache.spark.sql.Column, lat2: org.apache.spark.sql.Column, lon2: org.apache.spark.sql.Column)org.apache.spark.sql.Column


## Calculate distance

In [43]:
import org.apache.spark.sql.expressions.Window

import org.apache.spark.sql.expressions.Window


In [25]:
val lat_col = Window.partitionBy("latitude").orderBy($"timestamp_id".asc)
val lon_col = Window.partitionBy("longitude").orderBy($"timestamp_id".asc)

val lat2 = col("latitude")
// val lat1 = lag("latitude", 1).over(lat_col)
val lat1 = when((lag("latitude", 1).over(lat_col)).isNotNull, lag("latitude", 1).over(lat_col)).otherwise($"latitude")

val lon2 = col("longitude")
// val lon1 = lag("longitude", 1).over(lat_col)
val lon1 = when((lag("longitude", 1).over(lon_col)).isNotNull, lag("longitude", 1).over(lon_col)).otherwise($"longitude")

lat_col: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@6c539996
lon_col: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@57ef6b85
lat2: org.apache.spark.sql.Column = latitude
lat1: org.apache.spark.sql.Column = lag(latitude, 1, NULL) OVER (PARTITION BY latitude ORDER BY timestamp_id ASC NULLS FIRST unspecifiedframe$())
lon2: org.apache.spark.sql.Column = longitude
lon1: org.apache.spark.sql.Column = lag(longitude, 1, NULL) OVER (PARTITION BY longitude ORDER BY timestamp_id ASC NULLS FIRST unspecifiedframe$())


In [26]:
val flatLocationsWithDistDF = flatLocationsDF.withColumn("distance", calculate_distance_col(lat1, lon1, lat2, lon2))

flatLocationsWithDistDF: org.apache.spark.sql.DataFrame = [timestamp_id: string, accuracy: double ... 17 more fields]


In [27]:
flatLocationsWithDistDF.printSchema()

root
 |-- timestamp_id: string (nullable = false)
 |-- accuracy: double (nullable = true)
 |-- address: string (nullable = true)
 |-- altitude: double (nullable = true)
 |-- country: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- provider: string (nullable = true)
 |-- date: long (nullable = true)
 |-- day: long (nullable = true)
 |-- hours: long (nullable = true)
 |-- month: long (nullable = true)
 |-- nanos: long (nullable = true)
 |-- seconds: long (nullable = true)
 |-- time: long (nullable = true)
 |-- timezoneOffset: long (nullable = true)
 |-- year: long (nullable = true)
 |-- uid: string (nullable = true)
 |-- distance: double (nullable = true)



In [28]:
flatLocationsWithDistDF
.select("distance")
    .show(40)

+--------+
|distance|
+--------+
|    null|
|     0.0|
|     0.0|
|     0.0|
|    null|
|    null|
|    null|
|     0.0|
|    null|
|    null|
|    null|
|    null|
|     0.0|
|    null|
|    null|
|    null|
|    null|
|     0.0|
|    null|
|    null|
|     0.0|
|    null|
|    null|
|     0.0|
|    null|
|    null|
|     0.0|
|    null|
|    null|
|    null|
|     0.0|
|    null|
|     0.0|
|    null|
|     0.0|
|     0.0|
|    null|
|     0.0|
|    null|
|     0.0|
+--------+
only showing top 40 rows



In [29]:
val emails = "wallace.mendes.rj@gmail.com"

val joinExpression = flatLocationsDF.col("uid") === flatUsersDF.col("uid")
var joinType = "inner"
val consultaFinal = flatLocationsDF.join(flatUsersDF, joinExpression, joinType).drop(flatUsersDF.col("uid"))//.filter($"Email" === emails)

consultaFinal.show(5,false)

+-------------+------------------+-----------------------------------------------------------------------+------------------+-------+-----------+-----------+--------+----+---+-----+-----+---------+-------+-------------+--------------+----+----------------------------+---------------------------+-----------------+
|timestamp_id |accuracy          |address                                                                |altitude          |country|latitude   |longitude  |provider|date|day|hours|month|nanos    |seconds|time         |timezoneOffset|year|uid                         |email                      |username         |
+-------------+------------------+-----------------------------------------------------------------------+------------------+-------+-----------+-----------+--------+----+---+-----+-----+---------+-------+-------------+--------------+----+----------------------------+---------------------------+-----------------+
|1602119776824|15.666000366210938|R. Mena Barreto, 182 

emails: String = wallace.mendes.rj@gmail.com
joinExpression: org.apache.spark.sql.Column = (uid = uid)
joinType: String = inner
consultaFinal: org.apache.spark.sql.DataFrame = [timestamp_id: string, accuracy: double ... 18 more fields]


In [34]:
flatLocationsWithDistDF.withColumn("data", concat($"day",lit("/"),$"month"+1,lit("/"),$"year"+1900)).show(5)

+-------------+------------------+--------------------+-----------------+-------+-----------+-----------+--------+----+---+-----+-----+---------+-------+-------------+--------------+----+--------------------+--------+---------+
| timestamp_id|          accuracy|             address|         altitude|country|   latitude|  longitude|provider|date|day|hours|month|    nanos|seconds|         time|timezoneOffset|year|                 uid|distance|     data|
+-------------+------------------+--------------------+-----------------+-------+-----------+-----------+--------+----+---+-----+-----+---------+-------+-------------+--------------+----+--------------------+--------+---------+
|1602281418819|15.708999633789062|R. Tôrres Homem, ...|             28.0| Brasil|-22.9143819|-43.2478775|   fused|   9|  5|   19|    9|819000000|     18|1602281418819|           180| 120|PgXtDvjeJQgc6FzN9...|    null|5/10/2020|
|1602281495309|15.708999633789062|R. Tôrres Homem, ...|             28.0| Brasil|-22.914