In [None]:
import os, re, warnings, joblib
from pathlib import Path
from math import log1p

from dotenv import load_dotenv
import polars as pl
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest

import numpy as np

import sqlalchemy as sa
import tldextract, rapidjson as rj
# Silence the specific tldextract DeprecationWarning about `registered_domain`.
# `message` is matched against the start of the warning text and `module`
# against the name of the module the warning is attributed to.
warnings.filterwarnings(
    "ignore",
    message="The 'registered_domain' property is deprecated",
    category=DeprecationWarning,
    module=tldextract.__name__,
)

# Load variables from a local .env file (if any) into the process environment,
# then build the SQLAlchemy engine from WAREHOUSE_COOLIFY_URL.
# os.environ[...] raises KeyError when the variable is missing, so a
# misconfigured environment fails here rather than at query time.
load_dotenv()
# pool_pre_ping=True asks SQLAlchemy to test pooled connections before use.
engine = sa.create_engine(os.environ["WAREHOUSE_COOLIFY_URL"], pool_pre_ping=True)

In [None]:
import time
import polars as pl

# Size of the look-back window, in days.
DAYS_BACK = 1
# Cut-off as whole seconds since the Unix epoch, computed in Python so the
# database does no date arithmetic. The query scales it by 1_000 / 1_000_000
# to compare against rows stored in milliseconds / microseconds.
cutoff = int(time.time() - DAYS_BACK * 86_400)

# 32503680000 is 3000-01-01T00:00:00Z in epoch seconds. The query uses it
# (and 1000x that value) as the boundary for deciding whether a raw `time`
# value is in seconds, milliseconds or microseconds. `ts_sec` is the value
# normalised to seconds. Only the integer `cutoff` is interpolated into the SQL.
query = f"""
SELECT
    /* convert only the rows that survive the WHERE filter */
    CASE
        WHEN time > 32503680000*1000 THEN time/1000000               -- μs → s
        WHEN time > 32503680000      THEN time/1000                  -- ms → s
        ELSE                           time                          -- s
    END::double precision                AS ts_sec,

    *
FROM hackatime.heartbeats
/* pre‑filter in native units so the index on `time` is usable */
WHERE (
        (time <= 32503680000                  AND time >= {cutoff})           -- seconds
     OR (time >  32503680000  AND time <= 32503680000*1000
                                         AND time >= {cutoff*1000})          -- milliseconds
     OR (time >  32503680000*1000          AND time >= {cutoff*1000000})      -- microseconds
)
AND category = 'coding'
AND entity NOT IN ('test.txt', 'welcome.txt')
ORDER BY heartbeats.user_id, heartbeats.time;
"""

# infer_schema_length=None makes polars scan every row when inferring dtypes.
with engine.begin() as conn:
    heartbeats = pl.read_database(query, connection=conn, infer_schema_length=None)

# The SQL ORDER BY sorts on the raw `time` column, whose unit can differ from
# row to row. Sorting on the normalised `ts_sec` is what puts each user's
# heartbeats in chronological order for the shift/diff logic downstream.
heartbeats = heartbeats.sort(["user_id", "ts_sec"])

heartbeats.head()

In [None]:
# Shared tldextract extractor, called by ua_dom() for every user-agent string.
# NOTE(review): cache_dir=False presumably disables tldextract's on-disk
# suffix-list cache — confirm against the installed tldextract version.
ext = tldextract.TLDExtract(cache_dir=False)

def dep_len(j: str) -> int:
    """Return the length of the JSON value encoded in ``j``.

    Best-effort helper: an empty/None input, or any failure while parsing
    the payload or taking its length, yields 0 instead of raising.

    Args:
        j: JSON text (may be empty or None).

    Returns:
        ``len()`` of the decoded JSON value, or 0 when it cannot be computed.
    """
    try:
        if not j:
            return 0
        decoded = rj.loads(j)
        return len(decoded)
    except Exception:
        # Deliberately broad: malformed payloads must not abort the column map.
        return 0

def ua_dom(ua: str) -> str:
    """Extract a domain-like identifier from a user-agent string.

    If the string contains an ``http(s)://`` URL, the URL's host part is
    used; otherwise the last whitespace-separated token is used. The
    candidate is passed through the shared ``ext`` extractor and its
    ``top_domain_under_public_suffix`` is returned, falling back to the raw
    candidate when that attribute is empty.

    Args:
        ua: Raw user-agent text (may be empty or None).

    Returns:
        The extracted domain or fallback token; ``""`` when ``ua`` is empty,
        None, or contains only whitespace.
    """
    if not ua:
        return ""
    m = re.search(r"https?://([^ /]+)", ua)
    if m:
        host = m.group(1)
    else:
        tokens = ua.split()
        if not tokens:
            # Whitespace-only input is truthy but has no tokens; indexing
            # [-1] here used to raise IndexError and abort the column map.
            return ""
        host = tokens[-1]
    td = ext(host)
    return td.top_domain_under_public_suffix or host

hb = (
    heartbeats
    # Step 1 — numeric safe-casts: null → 0, then Int32. This runs first
    # because the derived columns below read the cast values.
    .with_columns([
        pl.col(name).fill_null(0).cast(pl.Int32)
        for name in (
            "line_additions", "line_deletions", "lineno", "lines",
            "cursorpos", "project_root_count", "source_type",
        )
    ])
    # Step 2 — all derived columns in a single pass (keyword order = column order).
    .with_columns(
        # JSON / user-agent parsing via the Python helpers
        dep_count=pl.col("dependencies").map_elements(dep_len, return_dtype=pl.Int32),
        ua_domain=pl.col("user_agent").map_elements(ua_dom, return_dtype=pl.String),
        # log-scaled big counts
        log_lines=pl.col("lines").log1p().cast(pl.Float32),
        log_cursor=pl.col("cursorpos").log1p().cast(pl.Float32),
        # Seconds since the same user's previous heartbeat; a user's first
        # heartbeat has no predecessor and is filled with 0.
        delta_t=(pl.col("ts_sec") - pl.col("ts_sec").shift(1))
            .over("user_id")
            .fill_null(0),
        # 1 when the heartbeat reports neither added nor deleted lines
        hb_zero_change_flag=(
            (pl.col("line_additions") == 0) & (pl.col("line_deletions") == 0)
        ).cast(pl.Int8),
    )
)

hb.head()

In [None]:
import polars as pl

# ---------------------------------------------------------------------
#  Feature engineering
#  – session-level features  ➜  duration-weighted user-level features
# ---------------------------------------------------------------------
TIMEOUT_SECONDS = 120  # 2 minutes

# ── 1.  Identify the start of each session ───────────────────────────
# A heartbeat opens a new session when it is the user's first heartbeat or
# when more than TIMEOUT_SECONDS passed since that user's previous one.
#
# The first heartbeat is detected explicitly instead of via `delta_t == 0`:
# upstream fills the first row's null delta with 0, but two heartbeats with
# identical timestamps also have delta_t == 0, and treating those as session
# starts split real sessions into fragments (which the `hb_count >= 3`
# filter further down would then discard).
# `is_first_distinct` marks the first row seen for each user_id; `hb` is
# sorted by (user_id, ts_sec), so that is the user's earliest heartbeat.
hb_with_flag = hb.with_columns(
    pl.when(
        pl.col("user_id").is_first_distinct()
        | (pl.col("delta_t") > TIMEOUT_SECONDS)
    )
      .then(1).otherwise(0)
      .alias("is_session_start_flag")
)

# Running count of session starts per user = per-user session id (1-based).
hb_sessions = hb_with_flag.with_columns(
    pl.col("is_session_start_flag").cum_sum().over("user_id").alias("duration_id")
)

# ── 2.  File switches inside a session ───────────────────────────────
# 1 when the heartbeat's entity differs from the previous heartbeat of the
# same (user, session); a missing comparison (no predecessor / null) counts
# as "no switch".
_entity_changed = (
    (pl.col("entity") != pl.col("entity").shift(1))
    .over(["user_id", "duration_id"])
    .fill_null(False)
    .cast(pl.Int8)
)

# A heartbeat that opens a session is never counted as a switch.
hb_sessions = hb_sessions.with_columns(
    pl.when(pl.col("is_session_start_flag") == 1)
      .then(0)
      .otherwise(_entity_changed)
      .alias("is_file_switch")
)

# ── 3.  Helper columns that drive the heuristics ─────────────────────
hb_sessions = hb_sessions.with_columns(
    # 1 when the heartbeat carries a non-empty branch name, else 0
    has_branch=pl.col("branch").fill_null("").ne("").cast(pl.Int8),
    # Number of "/" separators in the entity path, plus one
    path_depth=pl.col("entity").fill_null("").str.count_matches("/") + 1,
    # Change-size columns with nulls treated as 0
    line_additions_filled=pl.col("line_additions").fill_null(0),
    line_deletions_filled=pl.col("line_deletions").fill_null(0),
)

# ── 4.  SESSION-LEVEL aggregation ────────────────────────────────────
_ts = pl.col("ts_sec")
_added = pl.col("line_additions_filled")
_deleted = pl.col("line_deletions_filled")

sessions = (
    hb_sessions
    .group_by(["user_id", "duration_id"])
    .agg(
        # Timing: seconds → microseconds → Datetime; the duration is padded
        # with one TIMEOUT_SECONDS so a session is never zero-length.
        start_time=(_ts.min() * 1_000_000).cast(pl.Datetime("us")),
        end_time=(_ts.max() * 1_000_000).cast(pl.Datetime("us")),
        duration_seconds=_ts.max() - _ts.min() + TIMEOUT_SECONDS,

        # Counters
        hb_count=pl.len(),
        write_count=pl.col("is_write").filter(pl.col("is_write") == True).count(),
        file_switches=pl.col("is_file_switch").sum(),

        # Diversity: distinct values seen within the session
        language_diversity=pl.col("language").n_unique(),
        editor_diversity=pl.col("editor").n_unique(),
        project_diversity=pl.col("project").n_unique(),

        # Share of heartbeats that carry a branch name
        branch_presence_pct=pl.col("has_branch").mean(),

        # Path depth & change-size statistics
        avg_path_depth=pl.col("path_depth").mean(),
        avg_lines_added=_added.mean(),
        avg_lines_deleted=_deleted.mean(),
        var_lines_added=_added.var(),
        var_lines_deleted=_deleted.var(),
    )
    # Drop "micro-sessions" with fewer than three heartbeats
    .filter(pl.col("hb_count") >= 3)
)

# ── 5.  Derived per-session ratios ───────────────────────────────────
_duration_min = pl.col("duration_seconds") / 60
# Denominator guard kept from the original: a zero heartbeat count is
# swapped for a tiny value before dividing.
_hb_denominator = pl.col("hb_count").replace(0, 1e-9)

session_features = sessions.with_columns(
    # Session length divided by its heartbeat count
    avg_seconds_between_hb=(pl.col("duration_seconds") / _hb_denominator).fill_nan(0),
    # Percentage of heartbeats flagged as writes
    write_percentage=(pl.col("write_count") / _hb_denominator * 100).fill_nan(0),
    # File switches per minute of session time
    file_switches_per_minute=(pl.col("file_switches") / _duration_min).fill_nan(0),
    # Heartbeats per minute of session time (computed from hb_count)
    saves_per_minute=(pl.col("hb_count") / _duration_min).fill_nan(0),
).sort(["user_id", "duration_id"])

# ── 6.  USER-LEVEL, duration-weighted aggregation ────────────────────
# (session metric, intermediate weighted-sum column, final weighted-average column)
_WEIGHTED_METRICS = [
    ("write_percentage",         "write_pct_wt_sum",        "write_percentage_wt"),
    ("saves_per_minute",         "saves_per_min_wt_sum",    "saves_per_minute_wt"),
    ("file_switches_per_minute", "switches_per_min_wt_sum", "file_switches_per_minute_wt"),
    ("branch_presence_pct",      "branch_pct_wt_sum",       "branch_presence_pct_wt"),
    ("language_diversity",       "lang_div_wt_sum",         "language_diversity_wt"),
    ("avg_path_depth",           "path_depth_wt_sum",       "avg_path_depth_wt"),
]
_session_len = pl.col("duration_seconds")

user_agg = session_features.group_by("user_id").agg(
    # Total time spent coding
    [_session_len.sum().alias("total_duration_seconds")]
    # Σ(metric × session duration); divided by the total duration below
    + [
        (pl.col(metric) * _session_len).sum().alias(sum_name)
        for metric, sum_name, _out_name in _WEIGHTED_METRICS
    ]
    # Per-user extremes across sessions (not weighted by session length)
    + [
        pl.col("write_percentage").max().alias("max_write_percentage"),
        pl.col("saves_per_minute").max().alias("max_saves_per_minute"),
        pl.col("avg_seconds_between_hb").min().alias("min_avg_seconds_between_hb"),
    ]
)

user_features = (
    user_agg
    # weighted average = weighted sum / total duration
    .with_columns([
        (pl.col(sum_name) / pl.col("total_duration_seconds")).alias(out_name)
        for _metric, sum_name, out_name in _WEIGHTED_METRICS
    ])
    # the intermediate sums are no longer needed
    .drop([sum_name for _metric, sum_name, _out_name in _WEIGHTED_METRICS])
    .sort("user_id")
)

# ── 7.  Quick sanity-check ───────────────────────────────────────────
# Spot-check two hard-coded user ids at both aggregation levels.
_is_spot_check_user = pl.col("user_id").is_in([2, 1613])

print("\nSession-level features (sample):")
print(session_features.filter(_is_spot_check_user).head())

print("\nDuration-weighted USER-level features:")
user_features.filter(_is_spot_check_user)

In [None]:
# Hand-picked subset of user_features columns fed to the model.
# Every name must exist in user_features; select() raises otherwise.
NUMERIC_COLS = [
    "max_saves_per_minute",
    "write_percentage_wt",
    "saves_per_minute_wt",
    "file_switches_per_minute_wt",
    "language_diversity_wt",
]
print(f"Using {len(NUMERIC_COLS)} numeric features:")
print(NUMERIC_COLS)

# Show the feature row of one known user as a quick eyeball check
_user_2_features = user_features.filter(pl.col("user_id") == 2).select(NUMERIC_COLS)
print(_user_2_features)

# Feature matrix for all users, one row per user in user_features order
_feature_frame = user_features.select(NUMERIC_COLS)
X = _feature_frame.to_numpy()

# Verify shape
print(f"Feature matrix X shape: {X.shape}")

In [None]:
# Hand-labelled user ids. In this notebook GOOD_USERS is only used to
# display/colour results; BAD_USERS is additionally excluded from the
# training set.
GOOD_USERS = [
    1, # max
    2, # zrl
    104, # acon
    69, # malted
    864, # thomas
    664, # lux
    10, # annabel

    # hack clubbers that look ok
    1256, 
    1309,
    1460,
    1561,
    40,
    48,
    1729,
    1591
]

# User ids labelled as bad; filtered out of the training data below.
BAD_USERS = [
    1613,
    1728,
    18,
    1688
]

# Training matrix: every user that is NOT in BAD_USERS (not only GOOD_USERS).
_not_flagged_bad = ~pl.col("user_id").is_in(BAD_USERS)
TRUSTED = user_features.filter(_not_flagged_bad).select(NUMERIC_COLS).to_numpy()

print(f"Training on {TRUSTED.shape[0]} entries")

# Default StandardScaler, then an IsolationForest with a fixed seed so
# repeated runs on the same data give the same model.
_scaler = StandardScaler()
_forest = IsolationForest(
    n_estimators=800,
    contamination=0.05,  # contamination assumed within the training set
    bootstrap=True,
    random_state=42,
)
model = Pipeline(steps=[("scale", _scaler), ("iso", _forest)])
model.fit(TRUSTED)

print("Model training complete.")

In [None]:
import polars as pl
import numpy as np

# Inputs from earlier cells: `model` (fitted pipeline), `TRUSTED` (training
# matrix), `X` (feature matrix for all users) and `user_features`.

# The bottom PCTL percent of training-set scores define "too weird".
PCTL = 5
# Threshold = PCTL-th percentile of the decision scores of the TRUSTED data
scores_trusted = model.decision_function(TRUSTED)
threshold = np.percentile(scores_trusted, PCTL)
print(f"cut‑off at {PCTL}th percentile of trusted scores → {threshold:.4f}")

# Score all users. X was built from user_features in the same row order,
# so the scores line up with user_features row by row.
scores = model.decision_function(X)

features = user_features.with_columns(
    pl.Series("anomaly_score", scores)
)

print(features.head())

# Flag users scoring below the threshold. This must build on `features`:
# `user_features` has no `anomaly_score` column, so deriving the flag from
# it raised ColumnNotFoundError.
features = features.with_columns(
    (pl.col("anomaly_score") < threshold).alias("is_anomaly")
)

# Per-user summary. `features` has one row per user, so `windows` is 1 and
# `anomaly_rate` is either 0.0 or 1.0 for every user.
user_stats = (
    features.group_by("user_id")
            .agg([
                pl.len().alias("windows"),
                pl.col("is_anomaly").mean().alias("anomaly_rate"),
                pl.col("anomaly_score").mean().alias("avg_score"),
                pl.col("anomaly_score").min().alias("min_score"),
                pl.col("anomaly_score").max().alias("max_score"),
            ])
    .sort("user_id") # Sort for consistency
)
print(user_stats)

In [None]:
# Scores for the hand-labelled good users.
user_stats.filter(pl.col("user_id").is_in(GOOD_USERS))

In [None]:
# Scores for the hand-labelled bad users (these were excluded from training).
user_stats.filter(pl.col("user_id").is_in(BAD_USERS))

In [None]:
# Unlabelled users with a high anomaly rate.
# NOTE(review): user_stats is built from one row per user, so `windows` is
# always 1 and the `windows > 3` condition can never match — this cell
# returns an empty frame as written. Confirm the intended threshold.
user_stats.filter(pl.col("anomaly_rate") > 0.4, pl.col("windows") > 3, ~pl.col('user_id').is_in(BAD_USERS + GOOD_USERS))

In [None]:
import matplotlib.pyplot as plt
import polars as pl
import pandas as pd # Ensure pandas is imported

# Inputs from earlier cells: `user_stats`, `features`, GOOD_USERS, BAD_USERS.

# Total coding time per user. `features` is the user-level frame, whose
# duration column is `total_duration_seconds`; the session-level name
# `duration_seconds` does not exist there and raised ColumnNotFoundError.
user_durations = features.group_by("user_id").agg(
    pl.sum("total_duration_seconds").alias("total_duration")
)

# Join the durations with user_stats
user_stats_with_duration = user_stats.join(user_durations, on="user_id")

# Convert to pandas DataFrame for easier plotting with annotations
user_stats_pd = user_stats_with_duration.to_pandas()

# Map each user to a plot colour based on the hand-labelled lists
def get_color(user_id):
    """Return the scatter colour for ``user_id``.

    'green' for ids in GOOD_USERS, 'red' for ids in BAD_USERS, 'blue' for
    everyone else. GOOD_USERS is checked first.
    """
    if user_id in GOOD_USERS:
        return 'green'
    if user_id in BAD_USERS:
        return 'red'
    return 'blue'  # unlabelled users

import matplotlib.lines as mlines

user_stats_pd['color'] = user_stats_pd['user_id'].apply(get_color)

fig, ax = plt.subplots(figsize=(14, 10))

# One point per user: total duration on x, anomaly rate on y, label colour
ax.scatter(
    user_stats_pd['total_duration'],
    user_stats_pd['anomaly_rate'],
    c=user_stats_pd['color'],
)

ax.set_xlabel("Total Duration (seconds)")
ax.set_ylabel("Anomaly Rate")
ax.set_title("Anomaly Rate vs. Total Duration by User (Good=Green, Bad=Red)")
ax.grid(True)

# A scatter coloured from a column yields no legend entries on its own,
# so build proxy handles, one per colour class.
legend_handles = [
    mlines.Line2D([], [], color=colour, marker='o', linestyle='None',
                  markersize=10, label=label)
    for colour, label in (
        ('green', 'Good Users'),
        ('red', 'Bad Users'),
        ('blue', 'Other Users'),
    )
]
ax.legend(handles=legend_handles)

plt.show()

user_stats_pd