### Location related modules:

    This module contains a few applications for location related use cases.
    Since location data sources are often huge, this module is designed to work on spark dataframe.
    
    Including use cases:
    
    1. Approximate lat long points into bins using geohash.
    2. Find co-location based on geohash.
    3. Find stationary location points, to infer special location.

In [2]:
import os
import random
import pandas as pd
import numpy as np
import findspark
#spark path using default value
findspark.init()


import pyspark
import pyarrow
from pyspark.sql import SQLContext
    
from pyspark.ml import Pipeline
from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import when, lit
from distutils.version import LooseVersion
from importlib import reload
import pyspark.sql.functions as func
import pyspark.sql.types as typ

import dateutil
from dateutil.relativedelta import relativedelta
from dateutil.parser import parse

from imblearn.over_sampling import SMOTE

import pyspark.sql.functions as F
from pyspark.sql.functions import col, countDistinct, when, row_number
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql import SparkSession,SQLContext

import geohash as gh
from collections import Counter

from pyspark.sql import functions as F
from pyspark.sql.functions import collect_list,struct,col,substring,lit,udf


pd.options.display.max_columns=None
pd.options.display.max_rows=None

def initialize_spark(app_name='location'):
    
    
    #broadcastTimeout is purposedly set to be large due to development on single machine
    conf = pyspark.SparkConf()\
        .setAppName(app_name)\
        .setMaster('local')\
        .set('spark.driver.memory', '8g')\
        .set('spark.executor.memory', '8g')\
        .set('spark.executor.instances', 4)\
        .set('spark.executor.cores', 4)\
        .set('spark.driver.maxResultSize', '8g')\
        .set('spark.sql.shuffle.partitions', 100)\
        .set('spark.default.parallelism', 200)\
        .set('spark.sql.broadcastTimeout', 36000)\
        .set('spark.kryoserializer.buffer.max', '1024m')\
        .set('spark.sql.execution.arrow.enabled', 'false')\
        .set('spark.dynamicAllocation.enabled', "False")\
        .set('spark.port.maxRetries',30) 

    sc = pyspark.SparkContext.getOrCreate(conf)
    spark = pyspark.sql.SparkSession(sc)
    sqlContext = SQLContext.getOrCreate(sc)    
    return sc,spark,sqlContext

In [3]:
sc,spark,sqlContext = initialize_spark()

In [6]:
def equivalent_type(f):
    '''
    add more spark sql types like bigint ...
    '''
    if f == 'datetime64[ns]': return DateType()
    elif f == 'int64': return LongType()
    elif f == 'int32': return IntegerType()
    elif f == 'float64': return FloatType()
    else: return StringType()

def define_structure(string, format_type):
    try: typo = equivalent_type(format_type)
    except: typo = StringType()
    return StructField(string, typo)

def pandas_to_spark(sqlcontext,pandas_df):
    columns = list(pandas_df.columns)
    types = list(pandas_df.dtypes)
    struct_list = []
    for column, typo in zip(columns, types): 
        struct_list.append(define_structure(column, typo))
    p_schema = StructType(struct_list)
    return sqlcontext.createDataFrame(pandas_df, p_schema)

In [11]:
location_pandas_df.head()

Unnamed: 0,id,latitude,longitude,track_id,time
0,1,-10.939341,-37.062742,1,2014-09-13 07:24:32
1,2,-10.939341,-37.062742,1,2014-09-13 07:24:37
2,3,-10.939324,-37.062765,1,2014-09-13 07:24:42
3,4,-10.939211,-37.062843,1,2014-09-13 07:24:47
4,5,-10.938939,-37.062879,1,2014-09-13 07:24:53


### Note: track_id is the person's id.

In [12]:
location_spark_df = pandas_to_spark(sqlContext,location_pandas_df)

In [16]:
location_spark_df.show()

+---+-----------+----------+--------+-------------------+
| id|   latitude| longitude|track_id|               time|
+---+-----------+----------+--------+-------------------+
|  1| -10.939342| -37.06274|       1|2014-09-13 07:24:32|
|  2| -10.939342| -37.06274|       1|2014-09-13 07:24:37|
|  3| -10.939324|-37.062763|       1|2014-09-13 07:24:42|
|  4| -10.939211|-37.062843|       1|2014-09-13 07:24:47|
|  5| -10.938939|-37.062878|       1|2014-09-13 07:24:53|
|  6| -10.938543| -37.06284|       1|2014-09-13 07:24:59|
|  7| -10.938346|-37.062588|       1|2014-09-13 07:25:04|
|  8| -10.938449|   -37.062|       1|2014-09-13 07:25:10|
|  9| -10.938666|  -37.0615|       1|2014-09-13 07:25:15|
| 10| -10.938986|-37.060818|       1|2014-09-13 07:25:21|
| 11|-10.9393425|-37.060085|       1|2014-09-13 07:25:27|
| 12| -10.939641|  -37.0595|       1|2014-09-13 07:25:32|
| 13|-10.9398575| -37.05912|       1|2014-09-13 07:25:38|
| 14| -10.940077|-37.058727|       1|2014-09-13 07:25:43|
| 15| -10.9403

In [18]:
path = 'datasets/go_track_trackspoints.csv'

In [49]:
from pyspark.sql.functions import unix_timestamp, from_unixtime, to_timestamp, to_date
from pyspark.sql.functions import udf

def load_basic_location_data(path,sqlContext):
    '''
    get location raw data given a period
    inputs:
    * path: path to csv location data
    * sqlContext
    output:
    * location_df: spark df, containing location info
    '''
    location_pandas_df = pd.read_csv(path)
    
    location_spark_df = pandas_to_spark(sqlContext,location_pandas_df)
    
    location_spark_df = location_spark_df.withColumn("points",struct(col("latitude").cast("Float"),col("longitude").cast("Float")))
    # convert into spark df's time dtype
    location_spark_df = location_spark_df.withColumn("datetime", to_timestamp(location_spark_df.time, 'yyyy-MM-dd HH:mm:ss'))
    return location_spark_df.select(['track_id','points','datetime'])

@udf("string")
def geohash_w_time(struct_input):
    # using geohash to put points to bins
    encoded = gh.encode(struct_input[0],struct_input[1],precision=6)
    return encoded



In [50]:
A=load_basic_location_data(path,sqlContext)

In [51]:
A.show()

+--------+--------------------+-------------------+
|track_id|              points|           datetime|
+--------+--------------------+-------------------+
|       1|[-10.939342, -37....|2014-09-13 07:24:32|
|       1|[-10.939342, -37....|2014-09-13 07:24:37|
|       1|[-10.939324, -37....|2014-09-13 07:24:42|
|       1|[-10.939211, -37....|2014-09-13 07:24:47|
|       1|[-10.938939, -37....|2014-09-13 07:24:53|
|       1|[-10.938543, -37....|2014-09-13 07:24:59|
|       1|[-10.938346, -37....|2014-09-13 07:25:04|
|       1|[-10.938449, -37....|2014-09-13 07:25:10|
|       1|[-10.938666, -37....|2014-09-13 07:25:15|
|       1|[-10.938986, -37....|2014-09-13 07:25:21|
|       1|[-10.9393425, -37...|2014-09-13 07:25:27|
|       1|[-10.939641, -37....|2014-09-13 07:25:32|
|       1|[-10.9398575, -37...|2014-09-13 07:25:38|
|       1|[-10.940077, -37....|2014-09-13 07:25:43|
|       1|[-10.940389, -37....|2014-09-13 07:25:49|
|       1|[-10.940746, -37....|2014-09-13 07:25:54|
|       1|[-

In [36]:
A.select('time',squared_udf('points')).show()

+-------------------+-------------------+
|               time|squared_udf(points)|
+-------------------+-------------------+
|2014-09-13 07:24:32|             7nj9u8|
|2014-09-13 07:24:37|             7nj9u8|
|2014-09-13 07:24:42|             7nj9u8|
|2014-09-13 07:24:47|             7nj9u8|
|2014-09-13 07:24:53|             7nj9u8|
|2014-09-13 07:24:59|             7nj9u8|
|2014-09-13 07:25:04|             7nj9u8|
|2014-09-13 07:25:10|             7nj9u8|
|2014-09-13 07:25:15|             7nj9u8|
|2014-09-13 07:25:21|             7nj9u8|
|2014-09-13 07:25:27|             7nj9u8|
|2014-09-13 07:25:32|             7nj9u8|
|2014-09-13 07:25:38|             7nj9u8|
|2014-09-13 07:25:43|             7nj9u8|
|2014-09-13 07:25:49|             7nj9u8|
|2014-09-13 07:25:54|             7nj9u8|
|2014-09-13 07:26:00|             7nj9u8|
|2014-09-13 07:26:06|             7nj9u8|
|2014-09-13 07:26:12|             7nj9u8|
|2014-09-13 07:26:18|             7nj9u8|
+-------------------+-------------

In [None]:
display(df.select("id", squared_udf("id").alias("id_squared")))

In [6]:
def row_processor_geohash_w_time(row,day_hours_list,night_hours_list):
    '''
    geohashing function to encode location points into
    hashed str values, geohash precision set to 6
    input: 
    * row: list of tuples
    output:
    * encoded_points: set of str
    '''
    #using set, to store unique points
    daytime_locs = []
    nighttime_locs = []
    
    for tup in row:
        time = tup[0]
        encoded = gh.encode(tup[1][0],tup[1][1],precision=6)
        
        if time[-2:] in day_hours_list:
            daytime_locs.append(encoded)
            
        elif time[-2:] in night_hours_list:
            nighttime_locs.append(encoded)
        
        else:
            #not in special hours, ignore
            continue
        
    daytime_dict = Counter(daytime_locs)
    nighttime_dcit = Counter(nighttime_locs)
    
    daytime_points = list(map(list, daytime_dict.items()))
    nighttime_points = list(map(list, nighttime_dcit.items()))

    return daytime_points,nighttime_points

def retrieve_major_stationary_points(row):
    '''
    row: tuple of lists
    '''
    daytime_stationary = row[0]
    nighttime_stationary = row[1]
    
    #only select the most voted point
    major_daytime_pt, major_nighttime_pt = 'none','none'
    
    if len(daytime_stationary) > 0:
        #str values
        major_daytime_pt = sorted(daytime_stationary, key=lambda x: x[1], reverse = True)[0][0]
    if len(nighttime_stationary) > 0:
        major_nighttime_pt = sorted(nighttime_stationary, key=lambda x: x[1], reverse = True)[0][0]
    
    return major_daytime_pt,major_nighttime_pt

def generate_stationary_points(period,spark,day_hours_list,night_hours_list):
    '''
    based on the special hours defined
    find daytime nighttime stay points
    '''
    period_agg_special_hours = agg_geohash_w_time(period,spark)
    
    period_agg_special_hours_pdf = period_agg_special_hours.toPandas()
    
    period_agg_special_hours_pdf['day_night_locations'] = period_agg_special_hours_pdf['mobility_gene_points_w_time'].apply(lambda x: row_processor_geohash_w_time(x,day_hours_list,night_hours_list))
    
    period_agg_special_hours_pdf['major_day_night_location'] = period_agg_special_hours_pdf['day_night_locations'].apply(lambda x:retrieve_major_stationary_points(x))
    
    return period_agg_special_hours_pdf

ModuleNotFoundError: No module named 'pyspark'