# Prepare data for analysis

Data cleansing, conversions and aggregations preparing the datasets for further analysis.

In [17]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window

from datetime import datetime

# Obtain the local Spark session in an idempotent way.
#
# Constructing SparkContext('local[*]') directly raises ValueError when a
# context is already running (e.g. when this cell is executed more than once).
# Swallowing that error hides unrelated failures and can leave `sc`/`spark`
# undefined; getOrCreate() simply hands back the running session instead.
spark = SparkSession.builder.master("local[*]").getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("OFF")

import re
import glob
import functools
import shutil

In [26]:
def save_dataset(df, name):
    """Persist ``df`` as a Parquet dataset under ``./data/ds/``.

    Any dataset previously written under the same name is replaced.

    Args:
        df: Spark DataFrame to write.
        name: Directory name of the dataset below ``./data/ds/``
            (for example ``"oui.parquet"``).
    """
    # Let Spark replace an existing dataset itself instead of removing the
    # directory by hand: shutil.rmtree only works when the target lives on
    # the driver's local filesystem, whereas the writer's overwrite mode
    # works for whatever filesystem the path resolves to.
    df.write.mode("overwrite").parquet("./data/ds/%s" % name)

## OUI file

Parse the IEEE Organizationally Unique Identifier (OUI) file, which uniquely identifies vendors based on the MAC address prefix.

Output dataset at: `data/ds/oui.parquet`

In [3]:
# Source of makers.csv: the `.main-makers` table at
# https://www.gsmarena.com/makers.php3
# The two patterns below are kept as a record; presumably they are the
# find/replace expressions used to strip the HTML markup when the CSV was
# produced (TODO confirm):
#   ((<tr>)?<td><a (.+?)>|</a></td>(</tr>)?)
#   (<br><span>|</span>)
makers = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("./data/makers.csv")
)

# Preview: the ten makers with the most devices.
makers.sort(col("devices").desc()).toPandas().head(10)

Unnamed: 0,maker,devices
0,Samsung,1133
1,LG,602
2,Nokia,459
3,Motorola,454
4,alcatel,376
5,Micromax,276
6,Huawei,269
7,BLU,262
8,HTC,255
9,ZTE,240


In [4]:
# Matches the "(hex)" lines of the OUI file: a three-byte prefix written as
# dash-separated hex pairs, whitespace, the literal "(hex)", two tabs and
# then the organization name up to the end of the line.
oui_parser = re.compile(r"(?P<prefix>([0-9A-F]{2}-?){3})\s{1,}\(hex\)\t{2}(?P<organization>.*)")

def tupled(txt):
    """Turn one matching OUI line into a ``(prefix, organization)`` tuple.

    The prefix is returned with ``:`` in place of ``-`` as the separator.
    ``txt`` must match ``oui_parser``; callers filter the input beforehand.
    """
    match = oui_parser.search(txt)
    raw_prefix, organization = match.group("prefix", "organization")
    return (raw_prefix.replace("-", ":"), organization)
    
# Keep only the lines the OUI pattern recognises and convert each of them
# into a (prefix, organization) row.
oui = (
    sc.textFile("./data/oui.txt")
    .filter(lambda line: oui_parser.search(line))
    .map(tupled)
    .toDF(["prefix", "organization"])
)

# Preview: number of prefixes per organization, largest first.
oui.groupBy("organization").count().sort(col("count").desc()).toPandas().head(20)

Unnamed: 0,organization,count
0,"Cisco Systems, Inc",802
1,"Apple, Inc.",631
2,"Samsung Electronics Co.,Ltd",474
3,"HUAWEI TECHNOLOGIES CO.,LTD",426
4,"ARRIS Group, Inc.",276
5,Intel Corporate,226
6,Texas Instruments,173
7,zte corporation,147
8,Hewlett Packard,140
9,"Hon Hai Precision Ind. Co.,Ltd.",129


In [28]:
import sys

# Tokens that are ignored when matching organization names against makers.
# The empty string removes tokens that are empty once every
# non-alphanumeric character has been stripped.
stop_words = ["", "inc", "co", "ltd", "coltd", "llc", "oo", "oy", "cisco", "shenzhen", "electromechanics"]

# Makers with their soundex code, used as the join key below.
msx = makers.withColumn("soundex", soundex("maker")).drop("devices")

# One row per (prefix, organization, token): every organization name is
# split on spaces, each token is normalised to lower-case alphanumerics,
# stop words are discarded and the soundex code of the token is attached.
osx = (
    oui
    .withColumn("bits", split("organization", " "))
    .select(col("prefix"), col("organization"), explode("bits").alias("bit"))
    .withColumn("safeBit", lower(regexp_replace("bit", "[^A-Za-z0-9]", "")))
    .filter(~col("safeBit").isin(*stop_words))
    .withColumn("soundex", soundex("safeBit"))
    .drop("bit")
)

# Pair tokens and makers that share a soundex code, then keep only the pairs
# whose edit distance is 0, i.e. the normalised token equals the lower-cased
# maker name. Only the distinct (prefix, organization) pairs are retained.
makers_oui = (
    osx
    .join(msx, msx.soundex == osx.soundex)
    .withColumn("distance", levenshtein("safeBit", lower(col("maker"))))
    .filter(col("distance") == 0)
    .drop("soundex", "safeBit", "distance", "maker")
    .distinct()
    .orderBy("organization")
)

save_dataset(makers_oui, "oui.parquet")

# Spot-check a random subset of the matched organizations. The seed is fixed
# so that re-running the notebook displays the same sample.
(
    makers_oui
    .groupBy("organization")
    .count()
    .orderBy(desc("count"))
    .sample(True, 0.123, seed=42)
    .toPandas()
    .head(50)
)

Unnamed: 0,organization,count
0,Microsoft Corporation,37
1,"GIGA-BYTE TECHNOLOGY CO.,LTD.",15
2,SHARP Corporation,11
3,Microsoft,9
4,LG Electronics,7
5,Ericsson,6
6,LG ELECTRONICS INC,6
7,NEC Corporation,6
8,Mitsubishi Electric Corporation,4
9,KYOCERA Display Corporation,3


## WiFi logs

Filter and concatenate all data from WiFi logs

In [27]:
# A MAC address: six hex pairs separated by ":" or "-".
mac_address = re.compile(r"([0-9A-Fa-f]{2}[:-]){5}([0-9A-Fa-f]{2})")

# Column names of one tab-separated WiFi log record.
cols = ['timestamp', 'level', 'mac', 'ssid', 'rssi']

def loadWifiLog(path, columns=tuple(cols)):
    """Load one WiFi log file into a DataFrame of string columns.

    Lines that contain no MAC address, or that do not split into exactly
    ``len(columns)`` tab-separated fields, are dropped.

    Args:
        path: Path of the log file to read.
        columns: Column names of the result. The default is bound when the
            function is defined, so rebinding the module-level ``cols``
            later in the notebook does not change what this loader expects.

    Returns:
        A DataFrame with one column per entry of ``columns``.
    """
    return (
        sc
        .textFile(path)
        .filter(lambda line: mac_address.search(line))
        .map(lambda line: line.split('\t'))
        .filter(lambda fields: len(fields) == len(columns))
        .toDF().toDF(*columns)
    )

# Sorted so that the order in which the files are unioned does not depend on
# the order the filesystem happens to list them in.
wifi_log_files = sorted(glob.glob("./data/wifi-apc.*.log"))

# Fail with a clear message instead of the opaque TypeError that reduce()
# raises for an empty sequence.
if not wifi_log_files:
    raise FileNotFoundError("No WiFi log files match ./data/wifi-apc.*.log")

dfs = [loadWifiLog(path) for path in wifi_log_files]

# Concatenate the per-file DataFrames into a single one.
df = functools.reduce(lambda merged, part: merged.union(part), dfs)

save_dataset(df, "wifi-log.parquet")

# Number of records per calendar day; groupBy already yields one row per
# date, so no additional de-duplication is needed.
dates = df.withColumn("date", to_date("timestamp")).groupBy("date").count()
dates.orderBy("date").toPandas().head()

Unnamed: 0,date,count
0,2017-11-29,147397
1,2017-11-30,8008
2,2017-12-06,183987
3,2017-12-07,255035
4,2017-12-10,95707


## GPS logs

Filter and concatenate data from GPS logs.

In [50]:
# Column names of one tab-separated GPS log record.
cols = ['timestamp', 'level', 'lat', 'long', 'altitude', 'speed', 'satellites', 'mode', 'datetime']

def loadGpsLog(path, columns=tuple(cols)):
    """Load one GPS log file into a DataFrame of string columns.

    Lines that do not split into exactly ``len(columns)`` tab-separated
    fields are dropped.

    Args:
        path: Path of the log file to read.
        columns: Column names of the result. The default is bound when the
            function is defined, so rebinding the module-level ``cols``
            elsewhere in the notebook does not change what this loader
            expects.

    Returns:
        A DataFrame with one column per entry of ``columns``.
    """
    return (
        sc
        .textFile(path)
        .map(lambda line: line.split('\t'))
        .filter(lambda fields: len(fields) == len(columns))
        .toDF().toDF(*columns)
    )

# Sorted so that the order in which the files are unioned does not depend on
# the order the filesystem happens to list them in.
gps_log_files = sorted(glob.glob("./data/gps.*.log"))

# Fail with a clear message instead of the opaque TypeError that reduce()
# raises for an empty sequence.
if not gps_log_files:
    raise FileNotFoundError("No GPS log files match ./data/gps.*.log")

dfs = [loadGpsLog(path) for path in gps_log_files]

# Concatenate the per-file DataFrames and drop the records whose latitude
# is the literal "n/a".
df = functools.reduce(lambda merged, part: merged.union(part), dfs).filter(col("lat") != "n/a")

save_dataset(df, "gps-log.parquet")

# Number of records per calendar day; groupBy already yields one row per
# date, so no additional de-duplication is needed.
dates = df.withColumn("date", to_date("timestamp")).groupBy("date").count()
dates.orderBy("date").toPandas().head()

Unnamed: 0,date,count
0,2017-11-29,2067
1,2017-11-30,15742
2,2017-12-06,11129
3,2017-12-07,17779
4,2017-12-10,7567
