## Data cleansing

Prepare the data

In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
# Build the local Spark session, or attach to the one that already exists.
# `getOrCreate()` is idempotent, so re-running this cell no longer needs a
# swallowed ValueError, and `spark` / `sc` are always bound afterwards.
spark = SparkSession.builder.master('local[*]').getOrCreate()
sc = spark.sparkContext

# Silence Spark's log output in the notebook.
sc.setLogLevel("OFF")

In [2]:
import re
import glob
import functools
from datetime import datetime
from pyspark.sql.functions import *

# Names given to the tab-separated fields of a Wi-Fi log line, in field order.
cols = [
    'timestamp',
    'level',
    'mac',
    'ssid',
    'rssi',
]

# Six two-digit hex groups joined by ':' or '-', i.e. text shaped like a MAC address.
mac_address = re.compile(
    r"([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})"
)

def loadWifiLog(path):
    """Load one tab-separated Wi-Fi log file into a DataFrame of string columns.

    A line is kept only if it contains text shaped like a MAC address and
    splits into exactly ``len(cols)`` tab-separated fields.

    Args:
        path: Location of the log file, passed straight to ``sc.textFile``.

    Returns:
        A DataFrame whose columns are named after ``cols``; every column is a
        nullable string. The DataFrame is empty (not an error) when no line
        of the file passes the filters.
    """
    # Imported locally so this definition does not depend on another cell's imports.
    from pyspark.sql.types import StringType, StructField, StructType

    # An explicit schema skips schema inference, which raises "RDD is empty"
    # when a file contains no usable line.
    schema = StructType([StructField(name, StringType(), True) for name in cols])

    rows = (
        sc
        .textFile(path)
        .filter(lambda line: mac_address.search(line) is not None)
        .map(lambda line: line.split('\t'))
        # Drop malformed lines whose field count does not match the schema.
        .filter(lambda fields: len(fields) == len(cols))
    )
    return rows.toDF(schema)

# .toDF('Timestamp','Level','Lat','Long','Altitude','Speed','Satellites','Mode','Time')

# glob returns matches in arbitrary order; sorting keeps the union order
# deterministic from one run to the next.
wifi_log_files = sorted(glob.glob("./data/wifi-apc.*.log"))

# Fail with a readable message instead of reduce()'s
# "empty sequence with no initial value" TypeError when nothing matches.
if not wifi_log_files:
    raise FileNotFoundError("No Wi-Fi log files match ./data/wifi-apc.*.log")

# One DataFrame per log file. A list (rather than a map iterator) stays
# usable after the reduce below has walked through it.
dfs = [loadWifiLog(path) for path in wifi_log_files]

# Stack the per-file DataFrames into a single one.
df = functools.reduce(lambda x, y: x.union(y), dfs)

# Show the distinct calendar days covered by the logs.
dates = df.withColumn("date", to_date("timestamp")).select('date')
dates.distinct().orderBy("date").show()

+----------+
|      date|
+----------+
|2017-11-29|
|2017-11-30|
|2017-12-06|
|2017-12-07|
|2017-12-10|
|2017-12-11|
|2017-12-12|
+----------+

