In [None]:
import numpy as np
import pandas as pd
import pyspark
import re
import matplotlib.pyplot as plt

from pyspark import SparkContext
from pyspark.sql import SparkSession, Row, DataFrame
from pyspark.sql import functions as F
from pyspark.sql.types import StructField, StructType, BooleanType, StringType, DoubleType, IntegerType, ArrayType

from stpm.visualization import plot_pattern
from stpm.statistics import sum_delta_distrib, compute_mean_delta, count_delta_diff_from_zero, count_n_elements
from collections import defaultdict

from typing import List, Union, Dict, Optional, Tuple

In [None]:
# Zip the local `stpm` directory into stpm.zip; a later cell hands that archive to Spark via sc.addPyFile.
!zip -r stpm.zip stpm

In [3]:
# Register stpm.zip with the SparkContext; presumably this is what lets the `stpm` helpers used
# inside RDD lambdas be imported on the executors — TODO confirm against the cluster setup.
# NOTE(review): `sc` is not created anywhere in the visible notebook — presumably injected by the kernel; confirm.
sc.addPyFile('stpm.zip')

In [4]:
def split(s: Optional[str]) -> List[List[str]]:
    """Parse a bracketed sequence string into a list of token groups.

    The input is expected to look like ``'[[a, b], [c]]'``: an outer pair of
    brackets wrapping comma-separated inner groups. Spaces are dropped, commas
    separate tokens, and a group is emitted every time a closing bracket
    leaves at least one bracket still open.

    Args:
        s: Raw sequence string. ``None`` (what a Python UDF receives for a
            null column value) is treated like an empty string instead of
            raising ``TypeError``.

    Returns:
        One list of tokens per inner group, in order of appearance; ``[]`` for
        ``None``, an empty string, or a string without any inner group.

    Raises:
        IndexError: if a ``]`` appears while no ``[`` is open.
    """
    if s is None:
        return []

    res = []
    stack = []        # one entry per currently open '['; only its depth matters
    accumulator = []  # tokens of the group being read
    str_acc = ''      # characters of the token being read
    for c in s:
        if c == '[':
            stack.append(c)
        elif c == ']':
            stack.pop()
            # A bracket is still open, so an inner group just ended: flush the
            # pending token and emit the group.
            if len(stack) > 0:
                if len(str_acc) > 0:
                    accumulator.append(str_acc)
                    str_acc = ''
                res.append(accumulator)
                accumulator = []
        elif c == ',':
            # A comma ends the current token (if any).
            if len(str_acc) > 0:
                accumulator.append(str_acc)
                str_acc = ''
        elif c == ' ':
            continue
        else:
            str_acc += c

    return res

In [5]:
# Spark UDF wrapping `split`: converts the raw sequence string into array<array<string>>.
# Registered under the SQL name 'seqparser'.
splitter = spark.udf.register('seqparser', split, ArrayType(ArrayType(StringType())))

# Spark UDF (SQL name 'filter_center'): True when any token of the FIRST group contains 'S0_T0'.
# A null or empty parsed sequence yields False instead of raising on `it[0]` inside the executor.
filter_center = spark.udf.register(
    'filter_center',
    lambda it: bool(it) and any('S0_T0' in x for x in it[0]),
    'boolean',
)

def load_files(paths: List[str], filter_data: bool=True) -> List[DataFrame]:
    """Read each pattern CSV into a Spark DataFrame with a parsed ``sequence`` column.

    Every file is read without a header row using a fixed four-column schema
    (``freq``, ``relFreq``, ``confidence``, ``sequence``); the ``sequence``
    string is then replaced by the output of the ``splitter`` UDF.

    Args:
        paths: CSV locations to read, one DataFrame is produced per entry.
        filter_data: when true, keep only the rows for which the SQL UDF
            ``filter_center(sequence)`` evaluates to true.

    Returns:
        The DataFrames, in the same order as ``paths``.
    """
    column_spec = (
        ('freq', IntegerType()),
        ('relFreq', DoubleType()),
        ('confidence', DoubleType()),
        ('sequence', StringType()),
    )
    pattern_schema = StructType([StructField(name, dtype) for name, dtype in column_spec])

    def _load_one(path):
        # Read with the explicit schema, then parse the raw sequence string.
        raw = spark.read.csv(path, header=None, inferSchema=False, schema=pattern_schema)
        parsed = raw.withColumn('sequence', splitter(F.col('sequence')))
        if not filter_data:
            return parsed
        return parsed.filter('filter_center(sequence)')

    return [_load_one(path) for path in paths]

In [6]:
# Base directory and file names of the pattern CSVs to analyse.
# Entries left commented out are skipped; uncomment them to include those files.
basepath = 'datasets/ShortLongTermTrafficIncidents/patterns_REVISED/'
files = [
    # 'patterns_global.csv',
    # 'patterns_nyc.csv',
    'patterns_boston.csv',
    # 'patterns_la.csv',
]

# Full path of every selected file, in the same order as `files`.
fullpaths = [f'{basepath}{name}' for name in files]

In [7]:
# Load every selected file as a Spark DataFrame, keeping only the rows accepted by the filter_center UDF.
dfs = load_files(fullpaths, filter_data=True)

In [8]:
# Export the patterns of the first loaded dataset to a CSV in the working directory,
# sorted by descending relFreq. toPandas() collects every row onto the driver.
# The output name follows files[0] (the file behind dfs[0]) rather than a hard-coded
# dataset name, so the export is never mislabelled when the selection in `files` changes.
local_df = dfs[0].toPandas()
local_df.sort_values('relFreq', ascending=False).to_csv(files[0], index=False)

In [8]:
def generate_nrules_table(file_list: List[str], df_list: List[DataFrame]) -> pd.DataFrame:
    """Count the rows of each DataFrame and tabulate the counts per file.

    The two lists are walked in parallel; for each pair the row count is
    printed and recorded.

    Args:
        file_list: labels (file names) for the DataFrames.
        df_list: DataFrames whose ``count()`` is taken, aligned with ``file_list``.

    Returns:
        A pandas DataFrame with the columns ``filename`` and ``count``,
        one row per (file, DataFrame) pair.
    """
    names = []
    counts = []
    for name, frame in zip(file_list, df_list):
        n_rules = frame.count()
        print(f'{name} - Number of rules: {n_rules}')
        names.append(name)
        counts.append(n_rules)

    return pd.DataFrame({'filename': names, 'count': counts})

def compute_statistics(file_list: List[str], df_list: List[DataFrame]) -> pd.DataFrame:
    """Compute and print summary statistics for each pattern DataFrame.

    For every (file, DataFrame) pair the DataFrame is cached, the statistics
    are computed, and the cache is released again. The total row count is
    taken once, while the data is cached, and reused for the table and for
    the printed ratios, so no extra Spark job runs on the uncached data.

    Args:
        file_list: labels (file names) for the DataFrames.
        df_list: Spark DataFrames with a ``sequence`` column, aligned with
            ``file_list``.

    Returns:
        A pandas DataFrame with one row per file and the columns ``files``,
        ``tot_patterns``, ``mean_num_el``, ``mean_spat_delta``,
        ``mean_temp_delta``, ``patterns_spat_delta_gt0``,
        ``patterns_temp_delta_gt0`` and ``patterns_both_delta_gt0``.
    """
    def _ratio(part, total):
        # Share of `part` in `total`; NaN instead of ZeroDivisionError for an empty dataset.
        return part / total if total else float('nan')

    res_dict = defaultdict(list)
    for f, df in zip(file_list, df_list):
        df = df.cache()
        try:
            # Counting first materializes the cache and gives the total exactly once.
            tot_patterns = df.count()
            mean_n_elements = df.rdd.map(lambda it: count_n_elements(it['sequence'])).mean()
            mean_spatial_delta = compute_mean_delta(df, delta_type='spatial')
            mean_temporal_delta = compute_mean_delta(df, delta_type='temporal')
            npatterns_spat_delta_gt0 = df.rdd.filter(lambda it: count_delta_diff_from_zero(it['sequence'], delta_type='spatial') > 0).count()
            npatterns_temp_delta_gt0 = df.rdd.filter(lambda it: count_delta_diff_from_zero(it['sequence'], delta_type='temporal') > 0).count()
            npatterns_both_delta_gt0 = df.rdd.filter(lambda it: count_delta_diff_from_zero(it['sequence'], delta_type='both') > 0).count()
        finally:
            # Release the cache even when one of the statistics fails.
            df.unpersist()

        res_dict['files'].append(f)
        res_dict['tot_patterns'].append(tot_patterns)
        res_dict['mean_num_el'].append(mean_n_elements)
        res_dict['mean_spat_delta'].append(mean_spatial_delta)
        res_dict['mean_temp_delta'].append(mean_temporal_delta)
        res_dict['patterns_spat_delta_gt0'].append(npatterns_spat_delta_gt0)
        res_dict['patterns_temp_delta_gt0'].append(npatterns_temp_delta_gt0)
        res_dict['patterns_both_delta_gt0'].append(npatterns_both_delta_gt0)

        print('#' * 50)
        print(f)
        print(f'total patterns: {tot_patterns}')
        print(f'mean number of elements: {mean_n_elements}')
        print(f'mean spatial delta: {mean_spatial_delta}')
        print(f'mean temporal delta: {mean_temporal_delta}')
        print(f'number of patterns with at least one element with spatial delta > 0: {npatterns_spat_delta_gt0} ({_ratio(npatterns_spat_delta_gt0, tot_patterns)})')
        print(f'number of patterns with at least one element with temporal delta > 0: {npatterns_temp_delta_gt0} ({_ratio(npatterns_temp_delta_gt0, tot_patterns)})')
        print(f'number of patterns with at least one element with spatial AND temporal delta > 0: {npatterns_both_delta_gt0} ({_ratio(npatterns_both_delta_gt0, tot_patterns)})')

    return pd.DataFrame(dict(res_dict))

In [None]:
# Row count per selected file, as a pandas table (the counts are also printed by the function).
nrules_table = generate_nrules_table(files, dfs)

In [None]:
# Print the summary statistics per selected file; the returned pandas DataFrame is the cell's output.
compute_statistics(files, dfs)