In [None]:
import calendar
import os
from datetime import datetime as dt

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dateutil import tz

In [None]:
# Load the message log; later cells read its `id`, `user` and `author_id` columns.
messages = pd.read_csv("../logs/master.csv")
# Bare last expression so the notebook renders the frame as this cell's output.
messages

In [None]:
# Number of messages per (user, author_id) pair; the count ends up in the `id` column.
users = (
    messages
    .groupby(["user", "author_id"])["id"]
    .count()
    .reset_index()
)
# Show the ten pairs with the most messages.
users.nlargest(10, "id")

In [None]:
# Names of the five users with the most messages; the plotting cells below use this selection.
sample = users.nlargest(5, "id")["user"].tolist()
# Optional per-user time zone names; cells below hand these strings to dateutil's tz.gettz().
tz_str = {
    # "user":  "GMT-6",
}
sample

In [None]:
def snowflake_to_unix(snowflake):
    """Convert a snowflake id into a Unix timestamp in milliseconds.

    The low 22 bits of the id are dropped and the remaining high bits are
    added to a fixed epoch offset of 1420070400000 ms.
    NOTE(review): this matches the Discord snowflake layout - confirm that
    is where the ids come from.
    """
    epoch_offset_ms = 1420070400000
    elapsed_ms = snowflake >> 22
    return elapsed_ms + epoch_offset_ms

def snowflake_to_datetime(snowflake, s_tz = tz.gettz("GMT+0")):
    """Return the datetime encoded in a snowflake id, expressed in s_tz.

    s_tz is passed to datetime.fromtimestamp as its tz argument; the default
    is the "GMT+0" zone, looked up once when this function is defined.
    """
    unix_seconds = snowflake_to_unix(snowflake) / 1000
    return dt.fromtimestamp(unix_seconds, tz = s_tz)

def snowflake_to_hour(snowflake, s_tz):
    """Return the hour-of-day component of a snowflake id as seen in time zone s_tz."""
    moment = snowflake_to_datetime(snowflake, s_tz = s_tz)
    return moment.hour

def snowflake_to_hour_helper(row):
    """Return the hour of day for one message row, in its author's time zone.

    Intended for row-wise use (e.g. DataFrame.apply(..., axis = 1)); `row`
    must provide the "id" (snowflake) and "user" keys.

    The time zone name is looked up in tz_str. Users without an entry fall
    back to "GMT+0", matching snowflake_to_datetime's default; passing None
    to tz.gettz() would silently select the machine's local zone instead.
    """
    zone_name = tz_str.get(row["user"], "GMT+0")
    # Return the hour itself (not the full datetime), as the function name promises.
    return snowflake_to_hour(row["id"], tz.gettz(zone_name))

def snowflake_to_month(snowflake):
    """Return the calendar month number of a snowflake id (default time zone)."""
    moment = snowflake_to_datetime(snowflake)
    return moment.month

def snowflake_to_min(snowflake):
    """Return the minute-of-hour component of a snowflake id (default time zone)."""
    return snowflake_to_datetime(snowflake).minute

def snowflake_to_hour_min(snowflake):
    """Return the time of day of a snowflake id as fractional hours (hour + minute / 60)."""
    moment = snowflake_to_datetime(snowflake)
    whole_hours, minutes = moment.hour, moment.minute
    return whole_hours + minutes / 60

def snowflake_to_dayofyear(snowflake):
    """Return the day-of-year (tm_yday) of a snowflake id (default time zone)."""
    return snowflake_to_datetime(snowflake).timetuple().tm_yday

def snowflake_to_weekday_str(snowflake):
    """Return the weekday name of a snowflake id, formatted with strftime("%A")."""
    return snowflake_to_datetime(snowflake).strftime("%A")
    
def snowflake_to_weekday(snowflake):
    """Return datetime.weekday() for a snowflake id (Monday is 0, Sunday is 6)."""
    return snowflake_to_datetime(snowflake).weekday()

def snowflake_to_weekday_hour(snowflake):
    """Return the position within the week as weekday index plus hour / 24."""
    moment = snowflake_to_datetime(snowflake)
    day_index = moment.weekday()
    return day_index + moment.hour / 24

In [None]:
def display_hour_histograms_ontop(query, aggr, vals, figsize = (24, 5), multiple = "layer", per_hour = 1):
    """Plot one hour-of-day histogram with all selected users on the same axes.

    Args:
        query: DataFrame of messages; needs the `id` and `user` columns plus
            the column named by `aggr`.
        aggr: name of the column used to select rows.
        vals: values of `aggr` to keep.
        figsize: figure size passed to matplotlib.
        multiple: seaborn histplot `multiple` mode ("layer", "stack", "fill", ...).
        per_hour: number of histogram bins per hour (bins = int(24 * per_hour)).

    Returns:
        None. If no row matches the selection, a notice is printed and nothing
        is drawn, instead of attempting to plot an empty frame.
    """
    users_messages = query[query[aggr].isin(vals)].copy()
    if users_messages.empty:
        # Same policy as the by-month plot, which skips months without data.
        print("No messages match the selection; skipping hour histogram.")
        return
    users_messages["hour"] = users_messages["id"].apply(snowflake_to_hour_min)

    _, axs = plt.subplots(1, 1, figsize = figsize)

    # sns.set changes global plotting defaults; kept after subplots() as in the original.
    sns.set(rc = { "figure.figsize": figsize })
    plot = sns.histplot(data = users_messages, x = "hour", bins = int(24 * per_hour),
    hue = "user", ax = axs, stat = "percent", common_norm = False, multiple = multiple)
    plot.set(xticks = list(range(24)))
    plot.margins(x = 0)

    # Put the legend outside the axes so it does not cover the bars.
    sns.move_legend(axs, "upper left", bbox_to_anchor = (1, 1))

# display_hour_histograms_ontop("author_id", [  ], figsize = (6, 3))
# display_hour_histograms_ontop(aggr = "user", vals = users, figsize = (24, 6), multiple = "layer")

## NOTE (original author): this cell was reported not to run when the data
## contains no messages from before August.
# Snowflake id separating the two periods: ids below it count as "before August",
# ids at or above it as "after August".
AUGUST_CUTOFF_SNOWFLAKE = 1003527599613542400

before_august = messages[messages.id < AUGUST_CUTOFF_SNOWFLAKE]
display_hour_histograms_ontop(query = before_august, aggr = "user", vals = sample, figsize = (24, 5), multiple = "stack" , per_hour = 1)
# >= (rather than >) so a message whose id equals the cutoff is not dropped from both halves.
after_august = messages[messages.id >= AUGUST_CUTOFF_SNOWFLAKE]
display_hour_histograms_ontop(query = after_august, aggr = "user", vals = sample, figsize = (24, 5), multiple = "stack", per_hour = 1)
# display_hour_histograms_ontop(aggr = "user", vals = users, figsize = (24, 6), multiple = "fill")


In [None]:
def display_hour_histograms_byside(aggr, vals, figsize = (24, 5), stat = "percent"):
    """Plot one hour-of-day histogram per selected value, side by side.

    Reads the notebook-level `messages` frame and `tz_str` mapping.

    Args:
        aggr: name of the `messages` column used to select rows.
        vals: sequence of values of `aggr`; one subplot is drawn per value.
        figsize: size of the whole figure.
        stat: seaborn histplot statistic ("percent", "count", ...).

    Returns:
        None. With an empty `vals` a notice is printed and nothing is drawn
        (plt.subplots cannot create zero columns).
    """
    if len(vals) == 0:
        print("No values selected; skipping side-by-side hour histograms.")
        return

    # squeeze = False always yields a 2-D array of Axes, so one value needs no special case.
    fig, axs = plt.subplots(1, len(vals), figsize = figsize, squeeze = False)

    for axis, val in zip(axs[0], vals):
        # One zone name for both the conversion and the title. Without the
        # explicit default, tz.gettz(None) would use the machine's local zone
        # while the title claimed GMT+0.
        zone_name = tz_str.get(val, "GMT+0")

        user_messages = messages[messages[aggr] == val].copy()
        # Convert only this value's rows instead of every row in `messages`.
        user_messages["hour"] = user_messages["id"].apply(snowflake_to_hour, s_tz = tz.gettz(zone_name))

        # binrange pins the 24 bins to whole hours even if some hours have no messages.
        plot = sns.histplot(data = user_messages, x = "hour", bins = 24, binrange = (0, 24), ax = axis, stat = stat)
        plot.margins(x = 0)
        plot.set(title = f"{val} - {zone_name}")
        plot.set(xticks = list(range(24)))
    fig.suptitle(f"hour vs {stat}")

# Side-by-side hourly histograms for the sampled users: percentages first, then raw counts.
display_hour_histograms_byside(
    aggr = "user",
    vals = sample,
    figsize = (len(sample) * 12, 4),
    stat = "percent",
)
display_hour_histograms_byside(
    aggr = "user",
    vals = sample,
    figsize = (len(sample) * 12, 4),
    stat = "count",
)

In [None]:
def display_hour_cml_histograms_ontop(aggr, vals, figsize = (24, 5)):
    """Draw cumulative hour-of-day curves, one step line per user, on shared axes.

    Rows of the notebook-level `messages` frame whose `aggr` column is in
    `vals` are kept; each user's curve is normalised independently
    (common_norm = False) over 240 bins.
    """
    selected = messages.loc[messages[aggr].isin(vals)].copy()
    selected["hour"] = selected["id"].apply(snowflake_to_hour_min)

    _, ax = plt.subplots(1, 1, figsize = figsize)

    sns.set(rc = { "figure.figsize": figsize })
    histplot_options = dict(
        x = "hour", bins = 240, hue = "user",
        stat = "density", element = "step", fill = False, cumulative = True,
        common_norm = False,
    )
    plot = sns.histplot(data = selected, ax = ax, **histplot_options)
    plot.set(xticks = list(range(24)))

# Cumulative time-of-day curves for the sampled users.
display_hour_cml_histograms_ontop(aggr = "user", vals = sample, figsize = (12, 4))

def display_hour_cml_histograms_ontop2(aggr, vals, figsize = (24, 5)):
    """Draw cumulative message counts by day of year, one step line per user.

    Rows of the notebook-level `messages` frame whose `aggr` column is in
    `vals` are kept and binned into 365 day-of-year bins.
    """
    selected = messages.loc[messages[aggr].isin(vals)].copy()
    selected["day"] = selected["id"].apply(snowflake_to_dayofyear)

    _, ax = plt.subplots(1, 1, figsize = figsize)

    sns.set(rc = { "figure.figsize": figsize })
    histplot_options = dict(
        x = "day", bins = 365, hue = "user",
        stat = "count", element = "step", fill = False, cumulative = True,
        common_norm = False,
    )
    plot = sns.histplot(data = selected, ax = ax, **histplot_options)
    plot.margins(x = 0)
    # Vertical tick labels keep the dense day ticks (every second day) readable.
    plot.tick_params(axis = "x", rotation = -90)
    plot.set(xticks = list(range(0, 365, 2)))

# Cumulative per-day message counts for the sampled users (wide figure for the dense x axis).
display_hour_cml_histograms_ontop2(aggr = "user", vals = sample, figsize = (48, 10))

In [None]:
# NOTE: ANDREW YOU ARE PROBABLY LOOKING FOR THIS CODE

def display_hour_by_month_histograms_ontop(query, aggr, vals, figsize = (24, 5), multiple = "layer", per_hour = 1, stat = "percent", common_norm = False):
    """Plot and save one hour-of-day histogram per calendar month.

    For every month that has at least one matching message, a figure with all
    selected users on the same axes is drawn and written to
    "figures/hour_by_month-<stat>-<multiple>-<per_hour>-<month name>-<vals>.png"
    (relative to the working directory).

    Args:
        query: DataFrame of messages; needs the `id` and `user` columns plus
            the column named by `aggr`.
        aggr: name of the column used to select rows.
        vals: values of `aggr` to keep; also joined with "," into the file name.
        figsize: figure size passed to matplotlib.
        multiple: seaborn histplot `multiple` mode.
        per_hour: number of histogram bins per hour.
        stat: seaborn histplot statistic.
        common_norm: seaborn histplot `common_norm` flag.

    Returns:
        None. Nothing is drawn or saved when no row matches the selection.
    """
    users_messages = query[query[aggr].isin(vals)].copy()
    if users_messages.empty:
        return

    # Filtering and timestamp decoding do not depend on the month, so do them once
    # instead of on every loop iteration.
    users_messages["hour"] = users_messages["id"].apply(snowflake_to_hour_min)
    users_messages["month"] = users_messages["id"].apply(snowflake_to_month)

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok = True)
    # str() so non-string selections (e.g. numeric author ids) can be joined too.
    vals_label = ",".join(str(v) for v in vals)

    for m in range(1, 13): # calendar months 1..12
        month_messages = users_messages[users_messages.month == m]
        if month_messages.empty:
            continue

        month_name = calendar.month_name[m]
        fig, axs = plt.subplots(1, 1, figsize = figsize)
        fig.suptitle(f"Month: {month_name}")

        sns.set(rc = { "figure.figsize": figsize })
        plot = sns.histplot(data = month_messages, x = "hour", bins = int(24 * per_hour),
        hue = "user", ax = axs, stat = stat, common_norm = common_norm, multiple = multiple)
        plot.set(xticks = list(range(24)))
        plot.margins(x = 0)
        sns.move_legend(axs, "upper left", bbox_to_anchor = (1, 1))

        plt.savefig(f"figures/hour_by_month-{stat}-{multiple}-{per_hour}-{month_name}-{vals_label}.png")

# Per-month hourly histograms for the sampled users in three variants:
# stacked percentages, stacked counts, and a "fill" view with a common normalisation.
display_hour_by_month_histograms_ontop(
    query = messages, aggr = "user", vals = sample,
    figsize = (24, 5), multiple = "stack", per_hour = 1,
)
display_hour_by_month_histograms_ontop(
    query = messages, aggr = "user", vals = sample,
    figsize = (24, 5), multiple = "stack", per_hour = 1, stat = "count",
)
display_hour_by_month_histograms_ontop(
    query = messages, aggr = "user", vals = sample,
    figsize = (24, 5), multiple = "fill", per_hour = 1, common_norm = True, stat = "count",
)

In [None]:
def display_month_histograms_ontop(query, aggr, vals, figsize = (24, 5), multiple = "layer", per_hour = 1):
    """Plot message counts per calendar month with all selected users on the same axes.

    Args:
        query: DataFrame of messages; needs the `id` and `user` columns plus
            the column named by `aggr`.
        aggr: name of the column used to select rows.
        vals: values of `aggr` to keep.
        figsize: figure size passed to matplotlib.
        multiple: seaborn histplot `multiple` mode.
        per_hour: accepted for signature compatibility with the hourly plots; unused here.

    Returns:
        None. If no row matches the selection, a notice is printed and nothing is drawn.
    """
    users_messages = query[query[aggr].isin(vals)].copy()
    if users_messages.empty:
        print("No messages match the selection; skipping month histogram.")
        return
    users_messages["month"] = users_messages["id"].apply(snowflake_to_month)

    _, axs = plt.subplots(1, 1, figsize = figsize)

    sns.set(rc = { "figure.figsize": figsize })
    # discrete = True gives one unit-wide bar centred on each month number, even
    # when the data does not span all twelve months.
    plot = sns.histplot(data = users_messages, x = "month", discrete = True,
    hue = "user", ax = axs, stat = "count", common_norm = False, multiple = multiple)
    # Months run 1..12; the previous range(24) ticks were copied from the hourly plots.
    plot.set(xticks = list(range(1, 13)))
    plot.margins(x = 0)
    sns.move_legend(axs, "upper left", bbox_to_anchor = (1, 1))
# Stacked per-month message counts for the sampled users.
display_month_histograms_ontop(
    query = messages, aggr = "user", vals = sample,
    figsize = (24, 5), multiple = "stack", per_hour = 1,
)

def display_minute_histograms_ontop(query, aggr, vals, figsize = (24, 5), multiple = "layer", per_hour = 1):
    """Plot message counts by minute of the hour, in twelve 5-minute bins.

    Args:
        query: DataFrame of messages; needs the `id` and `user` columns plus
            the column named by `aggr`.
        aggr: name of the column used to select rows.
        vals: values of `aggr` to keep.
        figsize: figure size passed to matplotlib.
        multiple: seaborn histplot `multiple` mode.
        per_hour: accepted for signature compatibility with the hourly plots; unused here.

    Returns:
        None. If no row matches the selection, a notice is printed and nothing is drawn.
    """
    users_messages = query[query[aggr].isin(vals)].copy()
    if users_messages.empty:
        print("No messages match the selection; skipping minute histogram.")
        return
    users_messages["min"] = users_messages["id"].apply(snowflake_to_min)

    _, axs = plt.subplots(1, 1, figsize = figsize)

    sns.set(rc = { "figure.figsize": figsize })
    # binrange fixes the bin edges at 0, 5, ..., 60; without it the twelve bins
    # are spread over the observed min..max and drift off the 5-minute marks.
    plot = sns.histplot(data = users_messages, x = "min", bins = int(12), binrange = (0, 60),
    hue = "user", ax = axs, stat = "count", common_norm = False, multiple = multiple)
    plot.set(xticks = list(range(60)))
    plot.margins(x = 0)
    sns.move_legend(axs, "upper left", bbox_to_anchor = (1, 1))

# Stacked minute-of-hour message counts for the sampled users.
display_minute_histograms_ontop(
    query = messages, aggr = "user", vals = sample,
    figsize = (24, 5), multiple = "stack", per_hour = 1,
)

In [None]:
def display_hour_weekday_bivariate_histogram(query, aggr, vals, figsize = (24, 5)):
    """Plot an hour-by-weekday heatmap per selected value and save the figure.

    One panel is drawn per entry of `vals`; the whole figure is written to
    "figures/bivariate_hourweekay-<vals>.png" (relative to the working directory).

    Args:
        query: DataFrame of messages; needs the `id` and `user` columns plus
            the column named by `aggr`.
        aggr: name of the column used to select rows and to split the panels.
        vals: sequence of values of `aggr`, one panel each.
        figsize: size of the whole figure.

    Returns:
        None. With an empty `vals` a notice is printed and nothing is drawn
        (plt.subplots cannot create zero columns).
    """
    if len(vals) == 0:
        print("No values selected; skipping hour/weekday heatmap.")
        return

    users_messages = query[query[aggr].isin(vals)].copy()
    # discrete = True draws unit cells centred on the data values, so the hour
    # must be a whole number; fractional hours would shift the grid.
    users_messages["hour"] = users_messages["id"].apply(snowflake_to_hour_min).astype(int)
    users_messages["weekday"] = users_messages["id"].apply(snowflake_to_weekday)

    # Index matches datetime.weekday(): 0 is Monday, 6 is Sunday.
    weekday_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

    # squeeze = False always yields a 2-D array of Axes, so one value needs no special case.
    fig, axs = plt.subplots(1, len(vals), figsize = figsize, squeeze = False)
    for axis, val in zip(axs[0], vals):
        plot = sns.histplot(
            # Split panels on the same column that was used for the selection.
            data = users_messages[users_messages[aggr] == val], x = "hour", y = "weekday",
            hue = "user", ax = axis, discrete = True, stat = "count", cbar = True, palette = "vlag",
            legend = False)
        plot.set(xticks = list(range(24)))
        # Pin the tick positions before labelling them so each name sits on its own row.
        plot.set_yticks(list(range(7)))
        plot.set_yticklabels(weekday_names)
        plot.margins(x = 0)
        plot.margins(y = 0)
        plot.set(title = val)

    # savefig does not create missing directories.
    os.makedirs("figures", exist_ok = True)
    # str() so non-string selections (e.g. numeric author ids) can be joined too.
    plt.savefig(f"figures/bivariate_hourweekay-{','.join(str(v) for v in vals)}.png")
# Hour-by-weekday heatmaps for the sampled users, 20 inches of width per panel.
display_hour_weekday_bivariate_histogram(
    query = messages,
    aggr = "user",
    vals = sample,
    figsize = (len(sample) * 20, 5),
)