In [None]:
import numpy as np
import pandas as pd


In [None]:
def flag_outliers(series, iqr_multiplier=1.5):
    """Select the outliers of a series using Tukey's boxplot criterion.

    A value is an outlier when it lies more than ``iqr_multiplier`` times
    the interquartile range below the 25th percentile or above the 75th
    percentile.

    :param series: pandas Series of numeric values.
    :param iqr_multiplier: width of the fences in units of the
        interquartile range (1.5 is Tukey's conventional choice).
    :returns: the subset of ``series`` (original index labels preserved)
        falling strictly outside the fences; empty if there are none.
    """
    # Quartiles are undefined for an empty series, so nothing can be flagged.
    if len(series) == 0:
        return series.iloc[:0]
    # ``Series.get_values()`` was removed in pandas 1.0; ``.values`` is
    # available in both old and current releases.
    bottom_quartile_cutoff, top_quartile_cutoff = np.percentile(
        series.values, [25, 75])
    # Compute interquartile range
    iqr = top_quartile_cutoff - bottom_quartile_cutoff
    top_outlier_cutoff = top_quartile_cutoff + iqr * iqr_multiplier
    bottom_outlier_cutoff = bottom_quartile_cutoff - iqr * iqr_multiplier
    return series[
        (series < bottom_outlier_cutoff) | (series > top_outlier_cutoff)]

def to_keyed_ts(tuple_rdd):
    """Build an rdd of per-key sales time series from raw records.

    Every element of ``tuple_rdd`` is split on tab characters. Records
    rejected by ``enough_records`` are dropped; from the rest, fields 0 and
    1 form the key (described upstream as (store_nbr, item_nbr)) and fields
    4 and 8 form the observation (described upstream as
    (day_dt, pos_sales_qty)). Observations are grouped by key and turned
    into a pandas Series via ``to_series``.

    :returns: an rdd of (key, Series) pairs, excluding keys for which
        ``to_series`` produced None and keys rejected by
        ``filter_on_min_events`` with a threshold of 30.
    """
    fields_rdd = tuple_rdd.map(lambda raw: raw.split('\t'))
    usable_rdd = fields_rdd.filter(enough_records)
    keyed_rdd = usable_rdd.map(
        lambda fields: ((fields[0], fields[1]), (fields[4], fields[8])))
    # groupByKey yields an iterable wrapper per key; its ``data`` attribute
    # is what gets handed to to_series.
    series_rdd = keyed_rdd.groupByKey().mapValues(
        lambda grouped: to_series(grouped.data))
    non_empty_rdd = series_rdd.filter(lambda pair: pair[1] is not None)
    return non_empty_rdd.filter(lambda pair: filter_on_min_events(pair, 30))

def to_series(list_of_tuples):
    """Transform (date, count) pairs into a pandas Series indexed by date.

    :param list_of_tuples: iterable of ``(dt, val)`` pairs, where ``dt`` is
        a date string in ``%Y/%m/%d`` format and ``val`` is passed through
        ``convert_numeric``.
    :returns: a Series of the converted values indexed by timestamp and
        sorted by index, or None when no pair survives cleaning.

    Pairs whose date fails to parse, or whose converted value is None
    (presumably how ``convert_numeric`` signals a bad value -- confirm
    against its definition), are discarded.
    """
    timestamps = []
    values = []
    for dt, val in list_of_tuples:
        # ``errors='coerce'`` replaces the removed ``coerce=True`` keyword:
        # unparseable dates become NaT instead of raising.
        timestamp = pd.to_datetime(dt, format='%Y/%m/%d', errors='coerce')
        value = convert_numeric(val)
        if pd.isnull(timestamp) or value is None:
            continue
        timestamps.append(timestamp)
        values.append(value)
    if not timestamps:
        return None
    # Collecting into two lists avoids subscripting ``zip(...)``, which is
    # an iterator (not a list) on Python 3.
    return pd.Series(values, index=timestamps).sort_index()