In [None]:
from dataclasses import dataclass
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dateutil import parser as date_parser
from fuzzywuzzy import fuzz

%matplotlib inline

In [None]:
# The CSV exports live in a "data" folder next to the directory this notebook
# is run from (i.e. one level above the working directory).
data_dir = Path(".").resolve().parent.joinpath("data")
assert data_dir.is_dir()

In [None]:
# Collect every Zoom participant export for this meeting ID. Sorting makes the
# concatenation order, and therefore the row order of ``df``, reproducible.
fnames = sorted(data_dir.glob("participants_66360376183*.csv"))
print(fnames)
assert fnames, f"No Zoom participant exports found in {data_dir}"

# ``ignore_index=True`` discards the per-file row numbers. If they were kept
# as a column (plain ``reset_index()``), two exports containing the same
# connection at different row positions would not count as duplicates.
df = (
    pd.concat([pd.read_csv(fn) for fn in fnames], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)
print(f"Zoom connection rows: {len(df)}")

indico_df = pd.read_csv(data_dir / "indico_registrations.csv")
print(f"Indico registrants: {len(indico_df)}")

In [None]:
# Give the Indico columns the short names used throughout the notebook.
indico_df = indico_df.rename(
    columns={
        "Name": "name",
        "Experiment or Affiliation": "experiment",
    }
)

In [None]:
# Shorten the Zoom column names, then parse the join/leave columns into
# datetimes so they can be compared against the session windows.
df = df.rename(
    columns={
        "Name (Original Name)": "name",
        "Join Time": "start",
        "Leave Time": "end",
    }
).assign(
    start=lambda frame: pd.to_datetime(frame["start"]),
    end=lambda frame: pd.to_datetime(frame["end"]),
)

In [None]:
# Histogram of the "start" (join time) column of the Zoom connection rows,
# drawn on a wide 16x5 figure.
df["start"].hist(figsize=(16, 5))

In [None]:
# Distinct display names that appear in the Zoom connection rows.
names = df["name"].unique()

In [None]:
def no_middle_names(name: str) -> str:
    """Reduce a full name to its first and last token.

    The name is split on arbitrary whitespace, so leading, trailing or
    repeated spaces never produce empty "first" or "last" parts. Names with
    fewer than three tokens are returned with whitespace normalised but are
    otherwise unchanged.

    args:
        name: Full name, e.g. ``"John Ronald Reuel Tolkien"``.

    returns:
        ``"<first> <last>"`` when the name has three or more tokens, otherwise
        the tokens joined by single spaces.
    """
    parts = name.split()
    if len(parts) < 3:
        return " ".join(parts)
    return f"{parts[0]} {parts[-1]}"


def match_names(
    index_names: np.ndarray, target_names: np.ndarray, no_middle=True
) -> tuple[list, list]:
    """Find the best fuzzy match in target_names for every name in index_names.

    Similarity is measured with ``fuzz.ratio``; the candidate with the highest
    score wins (ties resolve to the first candidate, as with ``np.argmax``).

    args:
        index_names: Names to match.
        target_names: Names to match to.
        no_middle: If true, both sides are passed through ``no_middle_names``
            before they are compared.

    returns:
        List with the best-matching entry of target_names for each index name,
        list with the corresponding ``fuzz.ratio`` similarity scores
        (higher means more similar)

    raises:
        ValueError: If there are names to match but target_names is empty.
    """

    def preproc(name: str) -> str:
        return no_middle_names(name) if no_middle else name

    # The targets do not change between iterations, so preprocess them once
    # rather than once per index name.
    prepared_targets = [preproc(target_name) for target_name in target_names]
    if len(prepared_targets) == 0 and len(index_names) > 0:
        raise ValueError("target_names must not be empty")

    matches = []
    scores = []
    for name in index_names:
        prepared_name = preproc(name)
        fuzzies = [fuzz.ratio(prepared_name, target) for target in prepared_targets]
        idx = np.argmax(fuzzies)
        # Report the original (unprocessed) target name.
        matches.append(target_names[idx])
        scores.append(fuzzies[idx])
    return matches, scores

In [None]:
# Attach to every Zoom row its best Indico match and the score obtained with
# middle names stripped ...
df["indico_name"], df["indico_name_fuzzy_match_score"] = match_names(
    df["name"].to_numpy(), indico_df["name"].to_numpy(), no_middle=True
)
# ... and, for comparison, the score obtained when middle names are kept.
_, df["indico_name_fuzzy_match_score_with_middle"] = match_names(
    df["name"].to_numpy(), indico_df["name"].to_numpy(), no_middle=False
)

In [None]:
# One score per distinct Zoom name: the score depends only on the name, so the
# first row of each group is representative.
no_middle = df.groupby("name")["indico_name_fuzzy_match_score"].first()
with_middle = df.groupby("name")["indico_name_fuzzy_match_score_with_middle"].first()

fig, ax = plt.subplots()
h_kwargs = dict(
    histtype="step",
    lw=1.5,
)
# Reuse the bin edges of the first histogram so both curves are comparable.
bins = ax.hist(no_middle.to_numpy(), label="no middle", **h_kwargs)[1]
ax.hist(with_middle.to_numpy(), label="with middle", bins=bins, **h_kwargs)
# The plotted quantity is the fuzz.ratio similarity score (higher = better
# match), not an edit distance.
ax.set_xlabel("Fuzzy match score (fuzz.ratio, higher is better)")
ax.set_ylabel("Distinct Zoom names")
ax.legend(loc="upper left");

In [None]:
# Indico names that were matched by at least one Zoom row with a fuzzy score
# strictly above 80.
joined_names = df.loc[df["indico_name_fuzzy_match_score"] > 80, "indico_name"].unique()

In [None]:
# Registrants whose name was matched to a Zoom connection, and the share of
# each experiment/affiliation among them.
sel_df = indico_df.loc[indico_df["name"].isin(joined_names)]
sel_df["experiment"].value_counts().div(len(sel_df))

In [None]:
# Share of each experiment/affiliation among all Indico registrants.
indico_df["experiment"].value_counts().div(len(indico_df))

In [None]:
# df_merged = df.merge(indico_df, left_on="indico_name", right_index=True).drop(columns=["name_y"]).rename(columns={"name_x": "name"})

In [None]:
@dataclass
class Session:
    """A named session with a start and an end time.

    ``start`` and ``end`` may be passed as ``datetime`` objects or as strings;
    strings are parsed with ``dateutil``'s parser on construction.

    Declaring the attributes as dataclass fields makes the generated
    ``__eq__`` and ``__repr__`` use the name and times. Without declared
    fields every instance would compare equal and print as ``Session()``.
    """

    name: str
    start: datetime
    end: datetime

    def __post_init__(self):
        # Only strings need parsing; datetime inputs are stored as given.
        if isinstance(self.start, str):
            self.start = date_parser.parse(self.start)
        if isinstance(self.end, str):
            self.end = date_parser.parse(self.end)

    @property
    def duration(self) -> timedelta:
        """Length of the session (``end - start``)."""
        return self.end - self.start

In [None]:
def get_participants_of_session(
    df: pd.DataFrame, session: "Session", min_time_frac=0.3
) -> list[str]:
    """Return the names that were connected to ``session`` for long enough.

    Every connection row is clipped to the session window, the remaining
    minutes are summed per name, and a name is kept when its total is strictly
    greater than ``min_time_frac`` times the session duration.

    ``df`` is only read; no columns are added to it.

    args:
        df: Connection rows with the columns "name", "start" and "end"
            (the latter two as datetimes).
        session: Object providing ``start``, ``end`` and ``duration``.
        min_time_frac: Minimum fraction of the session duration a name must
            have been connected for.

    returns:
        List of qualifying names, in the (sorted) order produced by the
        groupby on "name".
    """
    session_start = pd.Timestamp(session.start)
    session_end = pd.Timestamp(session.end)

    # Restrict each connection interval to the session window.
    clipped_end = df["end"].clip(upper=session_end)
    clipped_start = df["start"].clip(lower=session_start)
    overlap_min = (clipped_end - clipped_start).dt.total_seconds() / 60

    # A connection entirely outside the window yields a negative overlap;
    # count it as zero so it cannot cancel out real attendance.
    minutes_per_name = overlap_min.clip(lower=0).groupby(df["name"]).sum()

    min_time_min = (session.duration * min_time_frac).total_seconds() / 60
    return list(minutes_per_name[minutes_per_name > min_time_min].index)

In [None]:
# Session timetable as (name, start, end) triples.
sessions = [
    Session(title, begin, finish)
    for title, begin, finish in (
        ("bash", "2022-09-28 10:00", "2022-09-28 12:30"),
        ("python 1", "2022-09-28 13:45", "2022-09-28 17:00"),
        ("git", "2022-09-29 10:00", "2022-09-29 13:00"),
        ("python 2", "2022-09-29 13:45", "2022-09-29 17:00"),
        ("ROOT", "2022-09-30 08:30", "2022-09-30 12:00"),
        ("Scikit-HEP", "2022-09-30 13:15", "2022-09-30 16:15"),
    )
]

In [None]:
# mask = (df.end < pd.to_datetime("2022-09-29 16:15")) & (pd.to_datetime("2022-09-30 13:15") < df.start)
# df["yes"] = mask
# df[["start", "end", "yes"]]

In [None]:
# Number of qualifying participants for each session.
for s in sessions:
    n_attendees = len(get_participants_of_session(df, s))
    print(f"{s.name}: {n_attendees}")

In [None]:
def get_overlap_matrix(df, sessions: list[Session]) -> np.ndarray:
    """Count the participants shared by every pair of sessions.

    args:
        df: Connection rows, passed through to ``get_participants_of_session``.
        sessions: Sessions to compare.

    returns:
        Square float array of shape ``(len(sessions), len(sessions))`` where
        entry ``[i, j]`` is the number of names attending both session ``i``
        and session ``j``; the diagonal holds each session's own count.
    """
    # Determine each session's participants once up front instead of again
    # for every cell of the matrix.
    attendees = [set(get_participants_of_session(df, s)) for s in sessions]

    n = len(sessions)
    overlap_matrix = np.zeros((n, n))
    for i, first in enumerate(attendees):
        for j, second in enumerate(attendees):
            overlap_matrix[i, j] = len(first & second)
    return overlap_matrix

In [None]:
# Pairwise participant overlap between all sessions.
get_overlap_matrix(df=df, sessions=sessions)