# Big Data - Final Work - Scala

## Initial configuration for Spark + JVM

In [3]:
%%init_spark
launcher.master = "local[*]"
launcher.driver_memory = '20g'
launcher.executor_memory = '20g'
launcher.verbose = 'true'

In [None]:
%%init_spark
launcher.conf.set("spark.app.name", "scalaXgbTest")
launcher.num_executors = 3
launcher.executor_cores = 7 //launcher.conf.spark.executor.cores = 8
launcher.conf.spark.task.cpus = 6
launcher.driver_memory = '4g'
launcher.executor_memory = '4g'
launcher.conf.set("spark.executor.heartbeatInterval", "6000s")
launcher.conf.set("spark.yarn.scheduler.heartbeat.interval-ms", "10000s")
launcher.conf.set("spark.network.timeout", "10000s")
launcher.conf.set("spark.yarn.executor.memoryOverhead", "8192")
launcher.conf.set("spark.sql.catalogImplementation", "hive")
launcher.jars = ["file://some/jar.jar", "xgboost-maven-0.82/xgboost4j-spark-0.82.jar", "xgboost-maven-0.82/xgboost4j-0.82.jar"]

In [4]:
println(sc.appName)
println(sc.master)

spylon-kernel
local[*]


## General imports

In [5]:
import org.apache.spark.sql.types._      // include the Spark Types to define our schema
import org.apache.spark.sql.functions._  // include the Spark helper functions
import spark.implicits._                 // For implicit conversions like converting RDDs to DataFrames
import org.apache.spark.sql.expressions._

import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import spark.implicits._
import org.apache.spark.sql.expressions._


## DB JSON schema

In [6]:
val location_schema =
    MapType(StringType,
        new StructType()
            .add("accuracy", DoubleType)
            .add("address", StringType)
            .add("altitude", DoubleType)
            .add("country", StringType)
            .add("latitude", DoubleType)
            .add("longitude", DoubleType)
            .add("provider", StringType)
            .add("timestamp", 
             new StructType()
                .add("date", LongType)
                .add("day", LongType)
                .add("hours", LongType)
                .add("month", LongType)
                .add("nanos", LongType)
                .add("seconds", LongType)
                .add("time", LongType)
                .add("timezoneOffset", LongType)
                .add("year", LongType)
            )
            .add("uid", StringType)
        )

val schema = new StructType()
    .add("locations", location_schema)
    .add("user-locations",
        MapType(StringType, location_schema)
    )
    .add("users",
        MapType(StringType,
            new StructType()
                .add("email", StringType)
                .add("username", StringType)
        )
    )

location_schema: org.apache.spark.sql.types.MapType = MapType(StringType,StructType(StructField(accuracy,DoubleType,true), StructField(address,StringType,true), StructField(altitude,DoubleType,true), StructField(country,StringType,true), StructField(latitude,DoubleType,true), StructField(longitude,DoubleType,true), StructField(provider,StringType,true), StructField(timestamp,StructType(StructField(date,LongType,true), StructField(day,LongType,true), StructField(hours,LongType,true), StructField(month,LongType,true), StructField(nanos,LongType,true), StructField(seconds,LongType,true), StructField(time,LongType,true), StructField(timezoneOffset,LongType,true), StructField(year,LongType,true)),true), StructField(uid,StringType,true)),true)
schema: org.apache.spark.sql.types.StructType = S...


## Import JSON DB

In [9]:
val df = spark.read.option("multiline", true).schema(schema).json("trackme-sample-data.json")

df: org.apache.spark.sql.DataFrame = [locations: map<string,struct<accuracy:double,address:string,altitude:double,country:string,latitude:double,longitude:double,provider:string,timestamp:struct<date:bigint,day:bigint,hours:bigint,month:bigint,nanos:bigint,seconds:bigint,time:bigint,timezoneOffset:bigint,year:bigint>,uid:string>>, user-locations: map<string,map<string,struct<accuracy:double,address:string,altitude:double,country:string,latitude:double,longitude:double,provider:string,timestamp:struct<date:bigint,day:bigint,hours:bigint,month:bigint,nanos:bigint,seconds:bigint,time:bigint,timezoneOffset:bigint,year:bigint>,uid:string>>> ... 1 more field]


In [10]:
df.printSchema()

root
 |-- locations: map (nullable = true)
 |    |-- key: string
 |    |-- value: struct (valueContainsNull = true)
 |    |    |-- accuracy: double (nullable = true)
 |    |    |-- address: string (nullable = true)
 |    |    |-- altitude: double (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- latitude: double (nullable = true)
 |    |    |-- longitude: double (nullable = true)
 |    |    |-- provider: string (nullable = true)
 |    |    |-- timestamp: struct (nullable = true)
 |    |    |    |-- date: long (nullable = true)
 |    |    |    |-- day: long (nullable = true)
 |    |    |    |-- hours: long (nullable = true)
 |    |    |    |-- month: long (nullable = true)
 |    |    |    |-- nanos: long (nullable = true)
 |    |    |    |-- seconds: long (nullable = true)
 |    |    |    |-- time: long (nullable = true)
 |    |    |    |-- timezoneOffset: long (nullable = true)
 |    |    |    |-- year: long (nullable = true)
 |    |    |-- uid: string (

## Breakdown raw DB into contextual structures

In [11]:
val locationsDF = df.select(explode($"locations") as Seq("timestamp_id", "value"))
val userLocationsDF = df.select(explode($"user-locations") as Seq("uid", "timestamp"))
val usersDF = df.select(explode($"users") as Seq("uid", "user_attr"))

locationsDF: org.apache.spark.sql.DataFrame = [timestamp_id: string, value: struct<accuracy: double, address: string ... 7 more fields>]
userLocationsDF: org.apache.spark.sql.DataFrame = [uid: string, timestamp: map<string,struct<accuracy:double,address:string,altitude:double,country:string,latitude:double,longitude:double,provider:string,timestamp:struct<date:bigint,day:bigint,hours:bigint,month:bigint,nanos:bigint,seconds:bigint,time:bigint,timezoneOffset:bigint,year:bigint>,uid:string>>]
usersDF: org.apache.spark.sql.DataFrame = [uid: string, user_attr: struct<email: string, username: string>]


In [12]:
usersDF.printSchema()

root
 |-- uid: string (nullable = false)
 |-- user_attr: struct (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- username: string (nullable = true)



## Function to flatten schema

In [13]:
import org.apache.spark.sql.Column

def flattenSchema(schema: StructType, prefix: String = null) : Array[Column] = {
  schema.fields.flatMap(f => {
    val colName = if (prefix == null) f.name else (prefix + "." + f.name)

    f.dataType match {
      case st: StructType => flattenSchema(st, colName)
      case _ => Array(col(colName))
    }
  })
}

import org.apache.spark.sql.Column
flattenSchema: (schema: org.apache.spark.sql.types.StructType, prefix: String)Array[org.apache.spark.sql.Column]


## Flat struct DataFrames

In [14]:
val flatLocationsDF = locationsDF.select(flattenSchema(locationsDF.schema):_*)
val flatUserLocationsDF = userLocationsDF.select(flattenSchema(userLocationsDF.schema):_*)
val flatUsersDF = usersDF.select(flattenSchema(usersDF.schema):_*)

flatLocationsDF: org.apache.spark.sql.DataFrame = [timestamp_id: string, accuracy: double ... 16 more fields]
flatUserLocationsDF: org.apache.spark.sql.DataFrame = [uid: string, timestamp: map<string,struct<accuracy:double,address:string,altitude:double,country:string,latitude:double,longitude:double,provider:string,timestamp:struct<date:bigint,day:bigint,hours:bigint,month:bigint,nanos:bigint,seconds:bigint,time:bigint,timezoneOffset:bigint,year:bigint>,uid:string>>]
flatUsersDF: org.apache.spark.sql.DataFrame = [uid: string, email: string ... 1 more field]


In [15]:
flatLocationsDF.printSchema()

root
 |-- timestamp_id: string (nullable = false)
 |-- accuracy: double (nullable = true)
 |-- address: string (nullable = true)
 |-- altitude: double (nullable = true)
 |-- country: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- provider: string (nullable = true)
 |-- date: long (nullable = true)
 |-- day: long (nullable = true)
 |-- hours: long (nullable = true)
 |-- month: long (nullable = true)
 |-- nanos: long (nullable = true)
 |-- seconds: long (nullable = true)
 |-- time: long (nullable = true)
 |-- timezoneOffset: long (nullable = true)
 |-- year: long (nullable = true)
 |-- uid: string (nullable = true)



In [16]:
flatLocationsDF.take(2).foreach(println)

[1602119776824,15.666000366210938,R. Mena Barreto, 182 - Botafogo, Rio de Janeiro - RJ, 22271-100, Brazil,14.90000057220459,Brazil,-22.9556468,-43.1880249,fused,7,3,22,9,824000000,16,1602119776824,180,120,H5LG3vN3jcPlcbJ2A5RGo6H4AHw2]
[1602119934507,15.967000007629395,R. Mena Barreto, 182 - Botafogo, Rio de Janeiro - RJ, 22271-100, Brazil,15.300000190734863,Brazil,-22.9556367,-43.1880211,fused,7,3,22,9,507000000,54,1602119934507,180,120,H5LG3vN3jcPlcbJ2A5RGo6H4AHw2]


In [17]:
flatUsersDF.take(2).foreach(println)

[5Jf44SGWhzZmxsZs7n6KLzrHark1,rodrigomesquita0@gmail.com,rodrigomesquita0]
[BHNpkg1LH2Sna0axjb8pFWDIycD2,vivian.lopesg@gmail.com,vivian.lopesg]


## Function for distance calculation based on lat/long

### Element version - calculate_distance_elem

In [70]:
import scala.math._

def calculate_distance_elem(lat1:Double, lon1:Double, lat2:Double, lon2:Double):Double = {   
    val earth_radius = 6371e3;           // meters
    val phi1 = lat1 * Pi/180;                  // radians
    val phi2 = lat2 * Pi/180;                  // radians
    val delta_phi = phi2 - phi1;               // radians

    val delta_lampda = (lon2 - lon1) * Pi/180; // radians

    val a = sin(delta_phi/2)*sin(delta_phi/2) + cos(phi1)*cos(phi2)*sin(delta_lampda/2)*sin(delta_lampda/2);
    val c = 2*atan2(sqrt(a), sqrt(1-a));

    val d = earth_radius*c; // meters
    
    return d
}

val calculate_distance_elem_sqlfunc = udf(calculate_distance_elem(_,_,_,_))

import scala.math._
calculate_distance_elem: (lat1: Double, lon1: Double, lat2: Double, lon2: Double)Double
calculate_distance_elem_sqlfunc: org.apache.spark.sql.expressions.UserDefinedFunction = SparkUserDefinedFunction($Lambda$4815/1659349508@6e7871ea,DoubleType,List(Some(class[value[0]: double]), Some(class[value[0]: double]), Some(class[value[0]: double]), Some(class[value[0]: double])),None,false,true)


### Test - calculate_distance_elem

In [13]:
val lat1 = -22.9556473
val lon1 = -43.1881019

val lat2 = -23.9556473
val lon2 = -44.1881019

val dist = calculate_distance_elem(lat1, lon1, lat2, lon2)

assert (dist == 150894.75616346067)

<console>: 43: error: not found: value calculate_distance_elem

### Column version

In [18]:
import scala.math.Pi
import org.apache.spark.sql.functions._

def calculate_distance_col(lat1:org.apache.spark.sql.Column, lon1:org.apache.spark.sql.Column, lat2:org.apache.spark.sql.Column, lon2:org.apache.spark.sql.Column):org.apache.spark.sql.Column = {   
    val earth_radius = 6371e3;           // meters
    val pi_over_180 = lit(Pi/180);
    val phi1 = lat1 * pi_over_180;                  // radians
    val phi2 = lat2 * pi_over_180;                  // radians
    val delta_phi = phi2 - phi1;               // radians

    val delta_lampda = (lon2 - lon1) * pi_over_180; // radians

    val a = sin(delta_phi/2)*sin(delta_phi/2) + cos(phi1)*cos(phi2)*sin(delta_lampda/2)*sin(delta_lampda/2);
    val c = lit(2)*atan2(sqrt(a), sqrt(lit(1)-a));

    val d = lit(earth_radius)*c; // meters
    
    return d;
}

// val calculate_distance_sqlfunc = udf(calculate_distance(_,_,_,_))

import scala.math.Pi
import org.apache.spark.sql.functions._
calculate_distance_col: (lat1: org.apache.spark.sql.Column, lon1: org.apache.spark.sql.Column, lat2: org.apache.spark.sql.Column, lon2: org.apache.spark.sql.Column)org.apache.spark.sql.Column


## Calculate distance

In [19]:
import org.apache.spark.sql.expressions.Window

import org.apache.spark.sql.expressions.Window


In [38]:
// val lat_col = Window.partitionBy("latitude").orderBy($"timestamp_id".asc)
// val lon_col = Window.partitionBy("longitude").orderBy($"timestamp_id".asc)
val lat_col = Window.orderBy($"timestamp_id".asc)
val lon_col = Window.orderBy($"timestamp_id".asc)

val lat2 = col("latitude")
val lat1 = lag("latitude", 1).over(lat_col)
// val lat1 = when((lag("latitude", 1).over(lat_col)).isNotNull, lag("latitude", 1).over(lat_col)).otherwise(0)

val lon2 = col("longitude")
val lon1 = lag("longitude", 1).over(lat_col)
// val lon1 = when((lag("longitude", 1).over(lon_col)).isNotNull, lag("longitude", 1).over(lon_col)).otherwise(0)

lat_col: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@93651ba
lon_col: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@74119f1f
lat2: org.apache.spark.sql.Column = latitude
lat1: org.apache.spark.sql.Column = lag(latitude, 1, NULL) OVER (ORDER BY timestamp_id ASC NULLS FIRST unspecifiedframe$())
lon2: org.apache.spark.sql.Column = longitude
lon1: org.apache.spark.sql.Column = lag(longitude, 1, NULL) OVER (ORDER BY timestamp_id ASC NULLS FIRST unspecifiedframe$())


In [39]:
val emails = "viniciusmgaspar@gmail.com"

val joinExpression = flatLocationsDF.col("uid") === flatUsersDF.col("uid")
var joinType = "inner"
val consultaFinal = flatLocationsDF.join(flatUsersDF, joinExpression, joinType).drop(flatUsersDF.col("uid")).filter($"Email" === emails).orderBy($"timestamp_id")

consultaFinal.show(5,false)

val flatLocationsWithDistDF = consultaFinal.withColumn("distance", when(calculate_distance_col(lat1, lon1, lat2, lon2).isNotNull,calculate_distance_col(lat1, lon1, lat2, lon2)).otherwise(0.0))

//val flatLocationsWithDistDF = consultaFinal.withColumn("distance", calculate_distance_col(lat1, lon1, lat2, lon2))

+-------------+------------------+--------------------------------------------------------------------------+------------------+-------+-----------+-----------+--------+----+---+-----+-----+---------+-------+-------------+--------------+----+----------------------------+-------------------------+---------------+
|timestamp_id |accuracy          |address                                                                   |altitude          |country|latitude   |longitude  |provider|date|day|hours|month|nanos    |seconds|time         |timezoneOffset|year|uid                         |email                    |username       |
+-------------+------------------+--------------------------------------------------------------------------+------------------+-------+-----------+-----------+--------+----+---+-----+-----+---------+-------+-------------+--------------+----+----------------------------+-------------------------+---------------+
|1602124371945|14.199000358581543|R. Tôrres Homem, 538 - V

emails: String = viniciusmgaspar@gmail.com
joinExpression: org.apache.spark.sql.Column = (uid = uid)
joinType: String = inner
consultaFinal: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [timestamp_id: string, accuracy: double ... 18 more fields]
flatLocationsWithDistDF: org.apache.spark.sql.DataFrame = [timestamp_id: string, accuracy: double ... 19 more fields]


In [40]:
flatLocationsWithDistDF.printSchema()

root
 |-- timestamp_id: string (nullable = false)
 |-- accuracy: double (nullable = true)
 |-- address: string (nullable = true)
 |-- altitude: double (nullable = true)
 |-- country: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- provider: string (nullable = true)
 |-- date: long (nullable = true)
 |-- day: long (nullable = true)
 |-- hours: long (nullable = true)
 |-- month: long (nullable = true)
 |-- nanos: long (nullable = true)
 |-- seconds: long (nullable = true)
 |-- time: long (nullable = true)
 |-- timezoneOffset: long (nullable = true)
 |-- year: long (nullable = true)
 |-- uid: string (nullable = true)
 |-- email: string (nullable = true)
 |-- username: string (nullable = true)
 |-- distance: double (nullable = true)



# Show walked distance

In [41]:
flatLocationsWithDistDF.withColumn("data", (col("timestamp_id")/1000).cast(TimestampType)).orderBy($"data".asc)
.select("username","data","latitude","longitude","distance")
    .show(false)

+---------------+-----------------------+-----------+-----------+------------------+
|username       |data                   |latitude   |longitude  |distance          |
+---------------+-----------------------+-----------+-----------+------------------+
|viniciusmgaspar|2020-10-07 23:32:51.945|-22.9143424|-43.2479267|0.0               |
|viniciusmgaspar|2020-10-07 23:34:51.961|-22.9143476|-43.2479225|0.720675492840136 |
|viniciusmgaspar|2020-10-07 23:36:56.689|-22.9143736|-43.2479302|2.9967018357873862|
|viniciusmgaspar|2020-10-07 23:37:57.042|-22.9143736|-43.2479302|0.0               |
|viniciusmgaspar|2020-10-07 23:38:57.116|-22.9143736|-43.2479302|0.0               |
|viniciusmgaspar|2020-10-07 23:40:02.011|-22.914351 |-43.247925 |2.5688213633650445|
|viniciusmgaspar|2020-10-07 23:41:02.089|-22.914351 |-43.247925 |0.0               |
|viniciusmgaspar|2020-10-07 23:42:09.072|-22.9143672|-43.2479284|1.8347079899956038|
|viniciusmgaspar|2020-10-07 23:43:09.149|-22.9143672|-43.2479284|

In [42]:
// flatLocationsWithDistDF.withColumn("data", date_format((col("timestamp_id")/1000).cast(TimestampType),"dd-MM-yyyy")).filter($"data" >= "08-10-2020").show(50)

// flatLocationsWithDistDF.withColumn("data", date_format((col("timestamp_id")/1000).cast(TimestampType),"dd-MM-yyyy")).filter($"data" >= "08-10-2020").filter($"data" <= "11-10-2020").orderBy($"data".desc).show(false)


//DISTANCIA
val loc =  flatLocationsWithDistDF.withColumn("data", (col("timestamp_id")/1000).cast(TimestampType)).orderBy($"data".desc)

loc.groupBy($"email",date_format(col("data"),"dd-MM-yyyy").as("data")).sum("distance").orderBy($"email".asc, $"data".asc)
.show(false)
    

// loc.select($"email",$"data",$"distance").filter($"distance" >= 1000).show(500,false)

+-------------------------+----------+------------------+
|email                    |data      |sum(distance)     |
+-------------------------+----------+------------------+
|viniciusmgaspar@gmail.com|07-10-2020|27.519468577232463|
|viniciusmgaspar@gmail.com|08-10-2020|27236.486130992485|
|viniciusmgaspar@gmail.com|09-10-2020|9797.745650086401 |
+-------------------------+----------+------------------+



loc: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [timestamp_id: string, accuracy: double ... 20 more fields]


In [43]:
flatLocationsWithDistDF.withColumn("data", date_format((col("timestamp_id")/1000).cast(TimestampType),"dd-MM-yyyy")).show(5)

+-------------+------------------+--------------------+------------------+-------+-----------+-----------+--------+----+---+-----+-----+---------+-------+-------------+--------------+----+--------------------+--------------------+---------------+------------------+----------+
| timestamp_id|          accuracy|             address|          altitude|country|   latitude|  longitude|provider|date|day|hours|month|    nanos|seconds|         time|timezoneOffset|year|                 uid|               email|       username|          distance|      data|
+-------------+------------------+--------------------+------------------+-------+-----------+-----------+--------+----+---+-----+-----+---------+-------+-------------+--------------+----+--------------------+--------------------+---------------+------------------+----------+
|1602124371945|14.199000358581543|R. Tôrres Homem, ...|19.200000762939453| Brasil|-22.9143424|-43.2479267|   fused|   7|  3|   23|    9|945000000|     51|1602124371945| 

In [79]:
// Register the DataFrame as a SQL temporary view
flatLocationsWithDistDF.createOrReplaceTempView("flatLocationSQL")

val sqlDF = spark.sql("SELECT * FROM flatLocationSQL")
sqlDF.show()

+-------------+------------------+--------------------+------------------+-------+-----------+-----------+--------+----+---+-----+-----+---------+-------+-------------+--------------+----+--------------------+------------------+
| timestamp_id|          accuracy|             address|          altitude|country|   latitude|  longitude|provider|date|day|hours|month|    nanos|seconds|         time|timezoneOffset|year|                 uid|          distance|
+-------------+------------------+--------------------+------------------+-------+-----------+-----------+--------+----+---+-----+-----+---------+-------+-------------+--------------+----+--------------------+------------------+
|1602281418819|15.708999633789062|R. Tôrres Homem, ...|              28.0| Brasil|-22.9143819|-43.2478775|   fused|   9|  5|   19|    9|819000000|     18|1602281418819|           180| 120|PgXtDvjeJQgc6FzN9...|  5322025.72753382|
|1602281495309|15.708999633789062|R. Tôrres Homem, ...|              28.0| Brasil|-2

sqlDF: org.apache.spark.sql.DataFrame = [timestamp_id: string, accuracy: double ... 17 more fields]


## Usage of reduceByKey to sum up values

In [1]:
val userReducedDistance = flatLocationsWithDistDF.reduceByKey((v1,v2) => v1 + v2)

Intitializing Scala interpreter ...

Spark Web UI available at http://cln-rio-06:4041
SparkContext available as 'sc' (version = 3.0.1, master = local[*], app id = local-1602515908080)
SparkSession available as 'spark'


<console>: 24: error: not found: value flatLocationsWithDistDF