# Big Data - Trabalho Final - Scala

## Initial configuration for Spark + JVM

In [2]:
%%init_spark
launcher.master = "local[*]"
launcher.driver_memory = '20g'
launcher.executor_memory = '20g'
launcher.verbose = 'true'

In [3]:
println(sc.appName)
println(sc.master)

spylon-kernel
local[*]


## General imports

In [4]:
import org.apache.spark.sql.types._      // include the Spark Types to define our schema
import org.apache.spark.sql.functions._  // include the Spark helper functions
import spark.implicits._                 // For implicit conversions like converting RDDs to DataFrames
import org.apache.spark.sql.expressions._

import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
import spark.implicits._
import org.apache.spark.sql.expressions._


## DB JSON schema

In [5]:
val location_schema =
    MapType(StringType,
        new StructType()
            .add("accuracy", DoubleType)
            .add("address", StringType)
            .add("altitude", DoubleType)
            .add("country", StringType)
            .add("latitude", DoubleType)
            .add("longitude", DoubleType)
            .add("provider", StringType)
            .add("timestamp", 
             new StructType()
                .add("date", LongType)
                .add("day", LongType)
                .add("hours", LongType)
                .add("minutes", LongType)
                .add("month", LongType)
                .add("nanos", LongType)
                .add("seconds", LongType)
                .add("time", LongType)
                .add("timezoneOffset", LongType)
                .add("year", LongType)
            )
            .add("uid", StringType)
        )

val schema = new StructType()
    .add("locations", location_schema)
    .add("user-locations",
        MapType(StringType, location_schema)
    )
    .add("users",
        MapType(StringType,
            new StructType()
                .add("email", StringType)
                .add("username", StringType)
        )
    )

location_schema: org.apache.spark.sql.types.MapType = MapType(StringType,StructType(StructField(accuracy,DoubleType,true), StructField(address,StringType,true), StructField(altitude,DoubleType,true), StructField(country,StringType,true), StructField(latitude,DoubleType,true), StructField(longitude,DoubleType,true), StructField(provider,StringType,true), StructField(timestamp,StructType(StructField(date,LongType,true), StructField(day,LongType,true), StructField(hours,LongType,true), StructField(minutes,LongType,true), StructField(month,LongType,true), StructField(nanos,LongType,true), StructField(seconds,LongType,true), StructField(time,LongType,true), StructField(timezoneOffset,LongType,true), StructField(year,LongType,true)),true), StructField(uid,StringType,true)),true)
schema: org.a...


## Import JSON DB

In [6]:
val df = spark.read.option("multiline", true).schema(schema).json("trackme-export.json")

df: org.apache.spark.sql.DataFrame = [locations: map<string,struct<accuracy:double,address:string,altitude:double,country:string,latitude:double,longitude:double,provider:string,timestamp:struct<date:bigint,day:bigint,hours:bigint,minutes:bigint,month:bigint,nanos:bigint,seconds:bigint,time:bigint,timezoneOffset:bigint,year:bigint>,uid:string>>, user-locations: map<string,map<string,struct<accuracy:double,address:string,altitude:double,country:string,latitude:double,longitude:double,provider:string,timestamp:struct<date:bigint,day:bigint,hours:bigint,minutes:bigint,month:bigint,nanos:bigint,seconds:bigint,time:bigint,timezoneOffset:bigint,year:bigint>,uid:string>>> ... 1 more field]


In [7]:
df.printSchema()

root
 |-- locations: map (nullable = true)
 |    |-- key: string
 |    |-- value: struct (valueContainsNull = true)
 |    |    |-- accuracy: double (nullable = true)
 |    |    |-- address: string (nullable = true)
 |    |    |-- altitude: double (nullable = true)
 |    |    |-- country: string (nullable = true)
 |    |    |-- latitude: double (nullable = true)
 |    |    |-- longitude: double (nullable = true)
 |    |    |-- provider: string (nullable = true)
 |    |    |-- timestamp: struct (nullable = true)
 |    |    |    |-- date: long (nullable = true)
 |    |    |    |-- day: long (nullable = true)
 |    |    |    |-- hours: long (nullable = true)
 |    |    |    |-- minutes: long (nullable = true)
 |    |    |    |-- month: long (nullable = true)
 |    |    |    |-- nanos: long (nullable = true)
 |    |    |    |-- seconds: long (nullable = true)
 |    |    |    |-- time: long (nullable = true)
 |    |    |    |-- timezoneOffset: long (nullable = true)
 |    |    |    |-- year:

## Separação do banco em DataFrames

In [8]:
val rawLocationsDF = df.select(explode($"locations") as Seq("timestamp_id", "value"))
val rawUserLocationsDF = df.select(explode($"user-locations") as Seq("uid", "timestamp"))
val rawUsersDF = df.select(explode($"users") as Seq("uid", "user_attr"))

rawLocationsDF: org.apache.spark.sql.DataFrame = [timestamp_id: string, value: struct<accuracy: double, address: string ... 7 more fields>]
rawUserLocationsDF: org.apache.spark.sql.DataFrame = [uid: string, timestamp: map<string,struct<accuracy:double,address:string,altitude:double,country:string,latitude:double,longitude:double,provider:string,timestamp:struct<date:bigint,day:bigint,hours:bigint,minutes:bigint,month:bigint,nanos:bigint,seconds:bigint,time:bigint,timezoneOffset:bigint,year:bigint>,uid:string>>]
rawUsersDF: org.apache.spark.sql.DataFrame = [uid: string, user_attr: struct<email: string, username: string>]


In [9]:
rawUsersDF.printSchema()

root
 |-- uid: string (nullable = false)
 |-- user_attr: struct (nullable = true)
 |    |-- email: string (nullable = true)
 |    |-- username: string (nullable = true)



## Função para "flatenizar" o schema

In [10]:
import org.apache.spark.sql.Column

def flattenSchema(schema: StructType, prefix: String = null) : Array[Column] = {
  schema.fields.flatMap(f => {
    val colName = if (prefix == null) f.name else (prefix + "." + f.name)

    f.dataType match {
      case st: StructType => flattenSchema(st, colName)
      case _ => Array(col(colName))
    }
  })
}

import org.apache.spark.sql.Column
flattenSchema: (schema: org.apache.spark.sql.types.StructType, prefix: String)Array[org.apache.spark.sql.Column]


## Uso da função nos DataFrames

In [11]:
val locationsDF = rawLocationsDF.select(flattenSchema(rawLocationsDF.schema):_*)
val userLocationsDF = rawUserLocationsDF.select(flattenSchema(rawUserLocationsDF.schema):_*)
val usersDF = rawUsersDF.select(flattenSchema(rawUsersDF.schema):_*)

locationsDF: org.apache.spark.sql.DataFrame = [timestamp_id: string, accuracy: double ... 17 more fields]
userLocationsDF: org.apache.spark.sql.DataFrame = [uid: string, timestamp: map<string,struct<accuracy:double,address:string,altitude:double,country:string,latitude:double,longitude:double,provider:string,timestamp:struct<date:bigint,day:bigint,hours:bigint,minutes:bigint,month:bigint,nanos:bigint,seconds:bigint,time:bigint,timezoneOffset:bigint,year:bigint>,uid:string>>]
usersDF: org.apache.spark.sql.DataFrame = [uid: string, email: string ... 1 more field]


In [12]:
locationsDF.printSchema()

root
 |-- timestamp_id: string (nullable = false)
 |-- accuracy: double (nullable = true)
 |-- address: string (nullable = true)
 |-- altitude: double (nullable = true)
 |-- country: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- provider: string (nullable = true)
 |-- date: long (nullable = true)
 |-- day: long (nullable = true)
 |-- hours: long (nullable = true)
 |-- minutes: long (nullable = true)
 |-- month: long (nullable = true)
 |-- nanos: long (nullable = true)
 |-- seconds: long (nullable = true)
 |-- time: long (nullable = true)
 |-- timezoneOffset: long (nullable = true)
 |-- year: long (nullable = true)
 |-- uid: string (nullable = true)



In [13]:
usersDF.take(2).foreach(println)

[5Jf44SGWhzZmxsZs7n6KLzrHark1,rodrigomesquita0@gmail.com,rodrigomesquita0]
[BHNpkg1LH2Sna0axjb8pFWDIycD2,vivian.lopesg@gmail.com,vivian.lopesg]


## Criar coluna com tipo Timestamp

In [14]:
val locationsWithDateDF = locationsDF.withColumn("ts_date", (col("timestamp_id")/1000).cast(TimestampType))

locationsWithDateDF: org.apache.spark.sql.DataFrame = [timestamp_id: string, accuracy: double ... 18 more fields]


## Join com Users DB

In [15]:
val joinExpression = locationsWithDateDF.col("uid") === usersDF.col("uid")
var joinType = "inner"
val locWithDateJoinUserDF = locationsWithDateDF.join(usersDF, joinExpression, joinType).drop(usersDF.col("uid")).orderBy($"timestamp_id")

joinExpression: org.apache.spark.sql.Column = (uid = uid)
joinType: String = inner
locWithDateJoinUserDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [timestamp_id: string, accuracy: double ... 20 more fields]


In [16]:
locWithDateJoinUserDF.select("email", "latitude", "longitude", "ts_date").sample(false, 0.04).take(5).foreach(println)

[henrique.mageste@gmail.com,-22.9555958,-43.1880177,2020-10-07 22:52:37.435]
[wallace.mendes.rj@gmail.com,-22.8381043,-43.2622822,2020-10-07 23:10:06.52]
[viniciusmgaspar@gmail.com,-22.914351,-43.247925,2020-10-07 23:40:02.011]
[wallace.mendes.rj@gmail.com,-22.8381148,-43.2623173,2020-10-08 00:21:00.085]
[wallace.mendes.rj@gmail.com,-22.8381131,-43.2623054,2020-10-08 00:42:46.199]


## Função para calcular distância

In [17]:
import scala.math.Pi
import org.apache.spark.sql.functions._

def calculate_distance_col(lat1:org.apache.spark.sql.Column, lon1:org.apache.spark.sql.Column, lat2:org.apache.spark.sql.Column, lon2:org.apache.spark.sql.Column):org.apache.spark.sql.Column = {   
    val earth_radius = 6371e3;           // meters
    val pi_over_180 = lit(Pi/180);
    val phi1 = lat1 * pi_over_180;                  // radians
    val phi2 = lat2 * pi_over_180;                  // radians
    val delta_phi = phi2 - phi1;               // radians

    val delta_lampda = (lon2 - lon1) * pi_over_180; // radians

    val a = sin(delta_phi/2)*sin(delta_phi/2) + cos(phi1)*cos(phi2)*sin(delta_lampda/2)*sin(delta_lampda/2);
    val c = lit(2)*atan2(sqrt(a), sqrt(lit(1)-a));

    val d = lit(earth_radius)*c; // meters
    
    return d;
}

// val calculate_distance_sqlfunc = udf(calculate_distance(_,_,_,_))

import scala.math.Pi
import org.apache.spark.sql.functions._
calculate_distance_col: (lat1: org.apache.spark.sql.Column, lon1: org.apache.spark.sql.Column, lat2: org.apache.spark.sql.Column, lon2: org.apache.spark.sql.Column)org.apache.spark.sql.Column


## Cálculo Distância

In [18]:
import org.apache.spark.sql.expressions.Window

import org.apache.spark.sql.expressions.Window


In [19]:
// val lat_col = Window.partitionBy("latitude").orderBy($"timestamp_id".asc)
// val lon_col = Window.partitionBy("longitude").orderBy($"timestamp_id".asc)
val lat_col = Window.orderBy($"timestamp_id".asc)
val lon_col = Window.orderBy($"timestamp_id".asc)

val lat2 = col("latitude")
val lat1 = lag("latitude", 1).over(lat_col)
// val lat1 = when((lag("latitude", 1).over(lat_col)).isNotNull, lag("latitude", 1).over(lat_col)).otherwise(0)

val lon2 = col("longitude")
val lon1 = lag("longitude", 1).over(lat_col)
// val lon1 = when((lag("longitude", 1).over(lon_col)).isNotNull, lag("longitude", 1).over(lon_col)).otherwise(0)

lat_col: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@3f3135d8
lon_col: org.apache.spark.sql.expressions.WindowSpec = org.apache.spark.sql.expressions.WindowSpec@5b45f3bf
lat2: org.apache.spark.sql.Column = latitude
lat1: org.apache.spark.sql.Column = lag(latitude, 1, NULL) OVER (ORDER BY timestamp_id ASC NULLS FIRST unspecifiedframe$())
lon2: org.apache.spark.sql.Column = longitude
lon1: org.apache.spark.sql.Column = lag(longitude, 1, NULL) OVER (ORDER BY timestamp_id ASC NULLS FIRST unspecifiedframe$())


In [20]:
val emails = "viniciusmgaspar@gmail.com"

val joinExpression = locationsWithDateDF.col("uid") === usersDF.col("uid")
var joinType = "inner"
val locationsWithDatePerUserDF = locationsWithDateDF.join(usersDF, joinExpression, joinType).drop(usersDF.col("uid")).filter($"Email" === emails).orderBy($"timestamp_id")

val locDatePerUserDistDF = locationsWithDatePerUserDF.withColumn("distance", when(calculate_distance_col(lat1, lon1, lat2, lon2).isNotNull,calculate_distance_col(lat1, lon1, lat2, lon2)).otherwise(0.0))

emails: String = viniciusmgaspar@gmail.com
joinExpression: org.apache.spark.sql.Column = (uid = uid)
joinType: String = inner
locationsWithDatePerUserDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [timestamp_id: string, accuracy: double ... 20 more fields]
locDatePerUserDistDF: org.apache.spark.sql.DataFrame = [timestamp_id: string, accuracy: double ... 21 more fields]


# Mostrar distância

In [21]:
locDatePerUserDistDF.orderBy($"ts_date".asc)
.select("username","ts_date","latitude","longitude","distance")
    .show(false)

+---------------+-----------------------+-----------+-----------+------------------+
|username       |ts_date                |latitude   |longitude  |distance          |
+---------------+-----------------------+-----------+-----------+------------------+
|viniciusmgaspar|2020-10-07 23:32:51.945|-22.9143424|-43.2479267|0.0               |
|viniciusmgaspar|2020-10-07 23:34:51.961|-22.9143476|-43.2479225|0.720675492840136 |
|viniciusmgaspar|2020-10-07 23:36:56.689|-22.9143736|-43.2479302|2.9967018357873862|
|viniciusmgaspar|2020-10-07 23:37:57.042|-22.9143736|-43.2479302|0.0               |
|viniciusmgaspar|2020-10-07 23:38:57.116|-22.9143736|-43.2479302|0.0               |
|viniciusmgaspar|2020-10-07 23:40:02.011|-22.914351 |-43.247925 |2.5688213633650445|
|viniciusmgaspar|2020-10-07 23:41:02.089|-22.914351 |-43.247925 |0.0               |
|viniciusmgaspar|2020-10-07 23:42:09.072|-22.9143672|-43.2479284|1.8347079899956038|
|viniciusmgaspar|2020-10-07 23:43:09.149|-22.9143672|-43.2479284|

## Distância por dia

In [22]:
locDatePerUserDistDF.groupBy($"email",date_format(col("ts_date"),"dd-MM-yyyy").as("date")).sum("distance").orderBy($"email".asc, $"date".asc)
.show(false)

+-------------------------+----------+------------------+
|email                    |date      |sum(distance)     |
+-------------------------+----------+------------------+
|viniciusmgaspar@gmail.com|07-10-2020|27.519468577232455|
|viniciusmgaspar@gmail.com|08-10-2020|27236.48613099252 |
|viniciusmgaspar@gmail.com|09-10-2020|9797.745650086414 |
|viniciusmgaspar@gmail.com|11-10-2020|617.8502728067592 |
|viniciusmgaspar@gmail.com|12-10-2020|2232.8849177640095|
+-------------------------+----------+------------------+



## Distância entre duas pessoas

In [24]:
val user1 = "henrique.mageste@gmail.com"
val user2 = "wallace.mendes.rj@gmail.com"

val user1_DF = locWithDateJoinUserDF.withColumn("formatted_date", date_format($"ts_date","dd-MM-yyyy HH:mm")).filter($"email" === user1).as("user1")
val user2_DF = locWithDateJoinUserDF.withColumn("formatted_date", date_format($"ts_date","dd-MM-yyyy HH:mm")).filter($"email" === user2).as("user2")

val joinExpression = ( user1_DF.col("formatted_date") === user2_DF.col("formatted_date"))
val joinType = "inner"
val c = user1_DF.join(user2_DF, joinExpression, joinType)
    
val c2= c.select(
    col("user1.formatted_date")
    ,col("user2.formatted_date")
    ,col("user1.email")
    ,col("user2.email")
    ,col("user1.longitude")
    ,col("user2.longitude")
    ,col("user1.latitude")
    ,col("user2.latitude")
    ,col("user1.address")
    ,col("user2.address")
    )
    .withColumn("distance_between", calculate_distance_col(col("user1.latitude"), col("user1.longitude"), col("user2.latitude"), col("user2.longitude")) )

user1: String = henrique.mageste@gmail.com
user2: String = wallace.mendes.rj@gmail.com
user1_DF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [timestamp_id: string, accuracy: double ... 21 more fields]
user2_DF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [timestamp_id: string, accuracy: double ... 21 more fields]
joinExpression: org.apache.spark.sql.Column = (formatted_date = formatted_date)
joinType: String = inner
c: org.apache.spark.sql.DataFrame = [timestamp_id: string, accuracy: double ... 44 more fields]
c2: org.apache.spark.sql.DataFrame = [formatted_date: string, formatted_date: string ... 9 more fields]


In [25]:
// AGREGADO POR DATA
c2.groupBy(col("user1.formatted_date"),col("user2.formatted_date")).agg(min("distance_between"),max("distance_between")).orderBy($"user1.formatted_date").show(false)

// AGREGADO POR USUÁRIO    
c2.groupBy(col("user1.email"),col("user2.email")).agg(min("distance_between"),max("distance_between")).show(false)

+----------------+----------------+---------------------+---------------------+
|formatted_date  |formatted_date  |min(distance_between)|max(distance_between)|
+----------------+----------------+---------------------+---------------------+
|07-10-2020 22:49|07-10-2020 22:49|15138.752694453176   |15138.752694453176   |
|07-10-2020 22:50|07-10-2020 22:50|15367.886012191113   |15367.886012191113   |
|07-10-2020 22:52|07-10-2020 22:52|15111.625491525283   |15111.625491525283   |
|07-10-2020 22:54|07-10-2020 22:54|15117.047889628033   |15119.267698828335   |
|07-10-2020 22:57|07-10-2020 22:57|15118.077374728518   |15118.077374728518   |
|07-10-2020 22:59|07-10-2020 22:59|15118.90405992265    |15121.055502987601   |
|07-10-2020 23:00|07-10-2020 23:00|15122.087961512667   |15122.087961512667   |
|07-10-2020 23:02|07-10-2020 23:02|15122.191299320195   |15122.191299320195   |
|07-10-2020 23:03|07-10-2020 23:03|15122.539946497187   |15122.539946497187   |
|07-10-2020 23:04|07-10-2020 23:04|15122