In [None]:
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from eventdetector.util import get_top_k

In [None]:
# Locations of the JSON statistics files, keyed by the short names used when
# loading them further down in the notebook.
paths = dict(
    # Merge statistics, one file per city.
    london="../data/output/mergestatslondon",
    ny="../data/output/mergestatsny",
    # Event statistics from the "base" runs.
    baselondon="../data/output/eventstatsbaselondon",
    baseny="../data/output/eventstatsbaseny",
    # Event statistics from the "embedding" runs.
    embeddinglondon="../data/output/eventstatsembeddinglondon",
    embeddingny="../data/output/eventstatsembeddingny",
)

In [None]:
def print_tops(dictionary, k, normalize=True):
    """Print the top-``k`` entries of ``dictionary`` as a list of ``(value, key)`` pairs.

    Which entries count as "top", and their order, is decided entirely by
    ``get_top_k``; this function only prepares its inputs and prints the result.

    Args:
        dictionary: Mapping from a key to a numeric count.
        k: Number of entries requested from ``get_top_k``.
        normalize: When true, every count is divided by the sum of all counts
            and rounded to 4 decimals before ranking; otherwise the raw
            counts are used unchanged.
    """
    keys = list(dictionary)

    if normalize:
        total = sum(dictionary.values())
        # A non-empty dictionary whose counts sum to zero used to raise
        # ZeroDivisionError; report every share as 0.0 instead.
        vals = [
            round(dictionary[key] / total, 4) if total else 0.0
            for key in keys
        ]
    else:
        # Raw mode never needs the total, so it is not computed here.
        vals = [dictionary[key] for key in keys]

    top_vals, top_keys = get_top_k(vals, keys, k)
    print(list(zip(top_vals, top_keys)))


def print_data_stats(stats, kind):
    """Print the per-category counters stored under ``stats[kind]``.

    Args:
        stats: Statistics dictionary; ``stats[kind]`` must provide the keys
            ``rt``, ``spam``, ``noentity``, ``notoken``, ``n`` and ``kind``.
        kind: Name of the category to report (used both as the lookup key
            and as a suffix in every printed label).
    """
    section = stats[kind]

    # (label prefix, counter key) pairs, in the order they are printed.
    labelled_counters = (
        ("Retweet ", "rt"),
        ("Spam ", "spam"),
        ("Not enough entity ", "noentity"),
        ("Not enough token ", "notoken"),
        ("Valid ", "n"),
    )
    for label, counter in labelled_counters:
        print(label + kind, section[counter])

    # Normalized top-20 breakdown of the entity kinds.
    print("Top entity kinds " + kind + ":")
    print_tops(section["kind"], 20)
    
    
def print_stats(stats):
    """Print data, clustering and timing statistics of one run.

    Args:
        stats: Statistics dictionary. Reads ``geo`` / ``nongeo`` (through
            ``print_data_stats``), ``n_clusters``, ``n_bursting``,
            ``spam_time`` / ``spam_proc``, ``ent_time`` / ``ent_proc`` and the
            per-call timing sequences ``ed_bust_time`` / ``ed_norm_time``.
            The two timing sequences are joined with ``+``, so they are
            assumed to be plain lists (as produced by ``json.load``).

    Means whose denominator is zero (nothing processed, or no timing samples)
    are printed as ``nan`` instead of raising ``ZeroDivisionError`` or
    triggering a NumPy empty-mean warning.
    """
    def banner(title):
        # Section header, e.g. "---------- Geo ----------".
        return "-" * 10 + " " + title + " " + "-" * 10

    def mean_or_nan(total, count):
        # Guard against a zero count so an empty run still prints a report.
        return total / count if count else float("nan")

    print(banner("Data stats"))
    print(banner("Geo"))
    print_data_stats(stats, "geo")
    print(banner("Non-geo"))
    print_data_stats(stats, "nongeo")

    print(banner("Clustering stats"))
    print("Total clusters:", stats["n_clusters"])
    print("Bursting clusters:", stats["n_bursting"])

    print(banner("Performance stats"))
    mean_spam_time = mean_or_nan(stats["spam_time"], stats["spam_proc"])
    print("Mean spam time:", mean_spam_time)
    print("Total spam time:", stats["spam_time"])

    mean_ner_time = mean_or_nan(stats["ent_time"], stats["ent_proc"])
    print("Mean NER time:", mean_ner_time)
    print("Total NER time:", stats["ent_time"])

    # Burst and normal detection timings are pooled into one sample list,
    # built once and reused for both the mean and the total.
    ed_times = stats["ed_bust_time"] + stats["ed_norm_time"]
    mean_ed_time = np.mean(ed_times) if len(ed_times) else float("nan")
    print("Mean ed time:", mean_ed_time)
    total_ed_time = np.sum(ed_times)
    print("Total ed time", total_ed_time)

In [None]:
def print_raw_stats(stats):
    """Print the geo / non-geo counts and the top sources of each group.

    Args:
        stats: Statistics dictionary providing the values ``geo`` and
            ``nongeo`` plus the mappings ``geosource`` and ``nongeosource``
            whose top 40 entries are printed with their raw
            (non-normalized) values.
    """
    for label, count_key in (("Geo count:", "geo"), ("Nongeo count:", "nongeo")):
        print(label, stats[count_key])

    source_sections = (
        ("Top sources geo:", "geosource"),
        ("Top sources nongeo:", "nongeosource"),
    )
    for heading, source_key in source_sections:
        print(heading)
        print_tops(stats[source_key], 40, normalize=False)

In [None]:
def _load_stats(path):
    """Read one JSON statistics file and return the decoded object.

    The file is opened as UTF-8 explicitly: without an ``encoding`` argument
    ``open`` falls back to the platform's locale encoding, which can fail or
    mis-decode non-ASCII content.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


# Merge statistics per city.
stats_london = _load_stats(paths["london"])
stats_ny = _load_stats(paths["ny"])

# Event statistics of the "base" runs.
stats_baselondon = _load_stats(paths["baselondon"])
stats_baseny = _load_stats(paths["baseny"])

# Event statistics of the "embedding" runs.
stats_emblondon = _load_stats(paths["embeddinglondon"])
stats_embny = _load_stats(paths["embeddingny"])

In [None]:
# Geo / non-geo counts and top sources from the London merge stats.
print_raw_stats(stats_london)

In [None]:
# Geo / non-geo counts and top sources from the NY merge stats.
print_raw_stats(stats_ny)

In [None]:
# Data, clustering and performance report for the "base" London event stats.
print_stats(stats_baselondon)

In [None]:
# Data, clustering and performance report for the "base" NY event stats.
print_stats(stats_baseny)

In [None]:
# Data, clustering and performance report for the "embedding" London event stats.
print_stats(stats_emblondon)

In [None]:
# Data, clustering and performance report for the "embedding" NY event stats.
print_stats(stats_embny)

In [None]:
def plot_running_time(stats_london, stats_ny, burst_window=10, norm_window=100):
    """Plot rolling-median event-detection timings for London and New York.

    Draws a 2x2 grid of line plots: the top row uses ``stats_london``, the
    bottom row ``stats_ny``; the left column shows ``ed_bust_time`` and the
    right column ``ed_norm_time``. Each series is smoothed with a rolling
    median, so the first ``window - 1`` points of every panel are NaN and
    therefore not drawn.

    Args:
        stats_london: Statistics dictionary providing the timing sequences
            ``ed_bust_time`` and ``ed_norm_time`` for London.
        stats_ny: Same structure for New York.
        burst_window: Rolling-median window for the ``ed_bust_time`` panels.
        norm_window: Rolling-median window for the ``ed_norm_time`` panels.
    """
    # One entry per panel, in row-major order of the 2x2 grid.
    panels = (
        (stats_london["ed_bust_time"], burst_window, "Burst Detection (London)"),
        (stats_london["ed_norm_time"], norm_window, "Sliding Window (London)"),
        (stats_ny["ed_bust_time"], burst_window, "Burst Detection (New York)"),
        (stats_ny["ed_norm_time"], norm_window, "Sliding Window (New York)"),
    )

    fig, axes = plt.subplots(2, 2, figsize=(10, 6))
    for ax, (timings, window, title) in zip(axes.flat, panels):
        smoothed = pd.Series(timings).rolling(window).median()
        # x axis is the position of the call in the recorded sequence.
        ax.plot(range(len(smoothed)), smoothed)
        ax.set_title(title)
        ax.set_xlabel("# of Function Calls")
        ax.set_ylabel("Time (seconds)")

    fig.tight_layout()
    plt.show()

In [None]:
# Rolling-median timing plots for the "base" London and NY event stats.
plot_running_time(stats_baselondon, stats_baseny)

In [None]:
# Rolling-median timing plots for the "embedding" London and NY event stats.
plot_running_time(stats_emblondon, stats_embny)