In [None]:
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dateutil import parser as date_parser
from fuzzywuzzy import fuzz

%matplotlib inline

In [None]:
# The input files live in a `data/` directory that is a sibling of the current
# working directory. (The parent of "." is "." itself, so only the `.parent`
# taken after `resolve()` actually moves up a level.)
data_dir = Path(".").resolve().parent / "data"
assert data_dir.is_dir(), f"data directory not found: {data_dir}"

In [None]:
# Load the two input tables from the data directory: the participants export
# and the Indico registration list.
df = pd.read_csv(data_dir / "participants_66360376183.csv")
indico_df = pd.read_csv(data_dir / "indico_registrations.csv")

In [None]:
# Shorten the registration column names to the ones used by the later cells.
# The renamed frame is reassigned rather than mutated with `inplace=True`.
indico_df = indico_df.rename(
    columns={"Name": "name", "Experiment or Affiliation": "experiment"},
)

In [None]:
# Shorten the export's column names and parse the join/leave times into
# datetimes in a single chained expression, without mutating the frame in place.
df = df.rename(
    columns={"Name (Original Name)": "name", "Join Time": "start", "Leave Time": "end"},
).assign(
    start=lambda frame: pd.to_datetime(frame["start"]),
    end=lambda frame: pd.to_datetime(frame["end"]),
)

In [None]:
# Distinct participant names present in the export.
names = df["name"].unique()

In [None]:
def match_names(
    index_names: np.ndarray, target_names: np.ndarray
) -> tuple[list, list]:
    """Find the most similar target name for every name in index_names.

    Similarity is ``fuzz.ratio``; for each name the target with the highest
    ratio wins (the first one on ties, as decided by ``np.argmax``).

    Parameters
    ----------
    index_names : np.ndarray
        Names to match. Repeated names are scored only once.
    target_names : np.ndarray
        Names to match to.

    Returns
    -------
    matches : list
        For every entry of index_names, the best-matching element of
        target_names.
    scores : list
        The ``fuzz.ratio`` score of each of those matches, in the same order.

    Raises
    ------
    ValueError
        If there is a name to match but target_names is empty.
    """
    # Cache of name -> (best target, score): the same name usually appears on
    # many rows, and scoring it against every target again is wasted work.
    best_by_name = {}
    matches = []
    scores = []
    for name in index_names:
        if name not in best_by_name:
            if len(target_names) == 0:
                raise ValueError("target_names is empty: nothing to match against")
            fuzzies = [fuzz.ratio(name, target_name) for target_name in target_names]
            idx = int(np.argmax(fuzzies))
            best_by_name[name] = (target_names[idx], fuzzies[idx])
        match, score = best_by_name[name]
        matches.append(match)
        scores.append(score)
    return matches, scores

In [None]:
# Attach the best-matching Indico name and its similarity score to every row.
best_matches, best_scores = match_names(
    df["name"].to_numpy(), indico_df["name"].to_numpy()
)
df["indico_name"] = best_matches
df["indico_name_fuzzy_match_score"] = best_scores

In [None]:
# Indico names whose fuzzy-match score is above 80.
is_confident_match = df["indico_name_fuzzy_match_score"] > 80
joined_names = df.loc[is_confident_match, "indico_name"].unique()

In [None]:
# Share of each experiment among the registrants that were matched to a
# participant (the denominator is the number of selected rows).
sel_df = indico_df.loc[indico_df["name"].isin(joined_names)]
sel_df["experiment"].value_counts().div(len(sel_df))

In [None]:
# Share of each experiment among all registrants.
indico_df["experiment"].value_counts().div(len(indico_df))

In [None]:
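# NOTE: disabled draft that would attach the registration columns to each participant row.
# It presumably needs `right_on="name"` instead of `right_index=True`, since `indico_df`
# is loaded without a name index — verify before re-enabling.
# df_merged = df.merge(indico_df, left_on="indico_name", right_index=True).drop(columns=["name_y"]).rename(columns={"name_x": "name"})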

In [None]:
@dataclass
class Session:
    """A named time slot.

    The attributes are declared as dataclass fields so that the generated
    ``__repr__`` and ``__eq__`` actually include them.

    Parameters
    ----------
    name : str
        Label of the session.
    start, end : str or datetime
        Start and end of the session. Strings are parsed with
        ``dateutil.parser.parse``; other values are stored unchanged.
    """

    name: str
    start: datetime
    end: datetime

    def __post_init__(self):
        # Sessions are usually written down as human-readable strings.
        if isinstance(self.start, str):
            self.start = date_parser.parse(self.start)
        if isinstance(self.end, str):
            self.end = date_parser.parse(self.end)

    @property
    def duration(self):
        """Length of the session, computed as ``end - start``."""
        return self.end - self.start

In [None]:
def get_participants_of_session(
    df: pd.DataFrame, session: "Session", min_time_frac=0.3
) -> np.ndarray:
    """Return the names of the participants that attended a session.

    Parameters
    ----------
    df : pd.DataFrame
        Attendance table with the columns ``name``, ``start`` and ``end``
        (``start``/``end`` as datetimes). It is not modified.
    session : Session
        Session to look at; its ``start``, ``end`` and ``duration`` are used.
    min_time_frac : float, optional
        Fraction of the session duration that a single row must overlap with
        the session to count as attendance.

    Returns
    -------
    np.ndarray
        Unique names having at least one row whose overlap with the session is
        longer than ``min_time_frac`` times the session duration.

    Notes
    -----
    The threshold is applied to each row on its own; the overlaps of several
    rows of the same name are not added up.
    """
    # Overlap of [start, end] with the session window: the earlier of the two
    # ends minus the later of the two starts (negative when they are disjoint).
    overlap_end = df["end"].clip(upper=pd.Timestamp(session.end))
    overlap_start = df["start"].clip(lower=pd.Timestamp(session.start))
    overlap_min = (overlap_end - overlap_start).dt.total_seconds() / 60
    min_time_min = (session.duration * min_time_frac).total_seconds() / 60
    # Select through a boolean mask so no helper column is written to `df`.
    return df.loc[overlap_min > min_time_min, "name"].unique()

In [None]:
# Sessions to analyse. "git" and "python 2" start on 2022-09-29, so they must
# also end on that day: an end before the start gives a negative duration and
# therefore a negative attendance threshold.
sessions = [
    Session("bash", "2022-09-28 10:00", "2022-09-28 12:30"),
    Session("python 1", "2022-09-28 13:45", "2022-09-28 17:00"),
    Session("git", "2022-09-29 10:00", "2022-09-29 13:00"),
    Session("python 2", "2022-09-29 13:45", "2022-09-29 17:00"),
]

In [None]:
# Number of participants counted for each session.
for session in sessions:
    n_participants = len(get_participants_of_session(df, session))
    print(f"{session.name}: {n_participants}")

In [None]:
def get_overlap_matrix(df, sessions: "list[Session]") -> np.ndarray:
    """Count the participants shared by every pair of sessions.

    Parameters
    ----------
    df : pd.DataFrame
        Attendance table, passed through to ``get_participants_of_session``.
    sessions : list[Session]
        Sessions to compare.

    Returns
    -------
    np.ndarray
        Float array of shape ``(len(sessions), len(sessions))``. Entry
        ``[i, j]`` is the number of names returned for both session ``i`` and
        session ``j``.
    """
    # The participants depend on the session only, so look them up once per
    # session instead of twice for every pair.
    participants = [get_participants_of_session(df, session) for session in sessions]
    n = len(sessions)
    overlap_matrix = np.zeros((n, n))
    for i, participants_i in enumerate(participants):
        for j, participants_j in enumerate(participants):
            overlap_matrix[i, j] = len(np.intersect1d(participants_i, participants_j))
    return overlap_matrix

In [None]:
# Display the matrix of shared-participant counts for all pairs of sessions.
get_overlap_matrix(df, sessions)