In [None]:
!pip install imbalanced-learn seaborn

## Dataset Formation

In [5]:
import pandas as pd
import ast
import numpy as np
from glob import glob

In [6]:
import numpy as np
import pandas as pd

def compute_features(group):
    features = {}
    # Number of events
    
    # Extract move events
    move_events = group[group["event_type"] == "move"]
    features["num_moves"] = len(move_events)
    # Extract click events
    click_events = group[group["event_type"] == "click"]
    features["num_clicks"] = len(click_events)
    # Extract scroll events
    scroll_events = group[group["event_type"] == "scroll"]
    features["num_scrolls"] = len(scroll_events)

    # Click positions
    if len(click_events) >= 1:
        features["click_x_mean"] = click_events["x"].mean()
        features["click_y_mean"] = click_events["y"].mean()
    else:
        features["click_x_mean"] = 0
        features["click_y_mean"] = 0

    # Compute movement distance and related features
    if len(move_events) >= 2:
        move_events = move_events.sort_values("time")
        x = move_events["x"].values
        y = move_events["y"].values
        t = move_events["time"].values
        dx = np.diff(x)
        dy = np.diff(y)
        dt = np.diff(t)

        # Handle zeros in dt to prevent division by zero
        valid = dt != 0
        dx = dx[valid]
        dy = dy[valid]
        dt = dt[valid]
        distances = np.sqrt(dx**2 + dy**2)
        total_distance = np.sum(distances)
        features["movement_distance"] = total_distance

        if len(distances) > 0:
            # Velocity calculations
            velocities = distances / dt
            features["velocity_mean"] = np.mean(velocities)
            features["velocity_max"] = np.max(velocities)
            features["velocity_min"] = np.min(velocities)
            features["velocity_sd"] = np.std(velocities)
            features["velocity_x_mean"] = np.mean(dx / dt)
            features["velocity_y_mean"] = np.mean(dy / dt)
        else:
            # No valid velocities
            features.update({
                "velocity_mean": 0,
                "velocity_max": 0,
                "velocity_min": 0,
                "velocity_sd": 0,
                "velocity_x_mean": 0,
                "velocity_y_mean": 0,
            })
            velocities = np.array([])

        # Acceleration calculations
        if len(velocities) >= 2:
            dv = np.diff(velocities)
            dt_acc = dt[1:]  # Time intervals for acceleration
            valid_acc = dt_acc != 0
            dv = dv[valid_acc]
            dt_acc = dt_acc[valid_acc]
            if len(dv) > 0:
                dv_dt = dv / dt_acc
                features["acceleration_mean"] = np.mean(dv_dt)
                features["acceleration_max"] = np.max(dv_dt)
                features["acceleration_min"] = np.min(dv_dt)
                features["acceleration_sd"] = np.std(dv_dt)
            else:
                features.update({
                    "acceleration_mean": 0,
                    "acceleration_max": 0,
                    "acceleration_min": 0,
                    "acceleration_sd": 0,
                })
                dv_dt = np.array([])
        else:
            features.update({
                "acceleration_mean": 0,
                "acceleration_max": 0,
                "acceleration_min": 0,
                "acceleration_sd": 0,
            })
            dv_dt = np.array([])

        # Jerk calculations
        if len(dv_dt) >= 2:
            da = np.diff(dv_dt)
            dt_jerk = dt_acc[1:]
            valid_jerk = dt_jerk != 0
            da = da[valid_jerk]
            dt_jerk = dt_jerk[valid_jerk]
            if len(da) > 0:
                da_dt = da / dt_jerk
                features["jerk_mean"] = np.mean(da_dt)
                features["jerk_sd"] = np.std(da_dt)
            else:
                features.update({
                    "jerk_mean": 0,
                    "jerk_sd": 0,
                })
        else:
            features.update({
                "jerk_mean": 0,
                "jerk_sd": 0,
            })

        # Angular velocity calculations
        angles = np.arctan2(dy, dx)
        d_angle = np.diff(angles)
        d_angle = (d_angle + np.pi) % (2 * np.pi) - np.pi  # Normalize angles
        dt_ang = dt[1:]
        valid_ang = dt_ang != 0
        d_angle = d_angle[valid_ang]
        dt_ang = dt_ang[valid_ang]
        if len(d_angle) > 0:
            angular_velocity = d_angle / dt_ang
            features["angular_velocity_mean"] = np.mean(angular_velocity)
            features["angular_velocity_sd"] = np.std(angular_velocity)
        else:
            features.update({
                "angular_velocity_mean": 0,
                "angular_velocity_sd": 0,
            })
    else:
        features.update({
            "movement_distance": 0,
            "velocity_mean": 0,
            "velocity_max": 0,
            "velocity_min": 0,
            "velocity_sd": 0,
            "velocity_x_mean": 0,
            "velocity_y_mean": 0,
            "acceleration_mean": 0,
            "acceleration_max": 0,
            "acceleration_min": 0,
            "acceleration_sd": 0,
            "jerk_mean": 0,
            "jerk_sd": 0,
            "angular_velocity_mean": 0,
            "angular_velocity_sd": 0,
        })

    # Movement duration
    if len(move_events) >= 1:
        features["movement_duration"] = move_events["time"].max() - move_events["time"].min()
    else:
        features["movement_duration"] = 0

    # Pause time (idle cursor time)
    total_time = group["time"].max() - group["time"].min()
    features["total_time"] = total_time
    features["pause_time"] = total_time - features["movement_duration"]

    # Flips (directional changes)
    if len(move_events) >= 2 and len(dx) >= 2:
        features["flips_x"] = np.sum(np.diff(np.sign(dx)) != 0)
        features["flips_y"] = np.sum(np.diff(np.sign(dy)) != 0)
    else:
        features["flips_x"] = 0
        features["flips_y"] = 0

    # Number of pauses (idle periods)
    if len(move_events) >= 2 and len(dt) >= 1:
        idle_threshold = 0.2  # Define a threshold for idle time
        pauses = dt[dt > idle_threshold]
        features["pause_count"] = len(pauses)
    else:
        features["pause_count"] = 0

    # Hold time for clicks
    if len(click_events) >= 1:
        pressed_events = click_events[click_events["pressed"] == True]
        released_events = click_events[click_events["pressed"] == False]
        if len(pressed_events) == len(released_events):
            hold_times = released_events["time"].values - pressed_events["time"].values
            features["hold_time_mean"] = np.mean(hold_times) if len(hold_times) > 0 else 0
            features["hold_time_sd"] = np.std(hold_times) if len(hold_times) > 1 else 0
        else:
            features["hold_time_mean"] = 0
            features["hold_time_sd"] = 0
    else:
        features["hold_time_mean"] = 0
        features["hold_time_sd"] = 0

    return pd.Series(features)

In [None]:
files = glob('recordings/*.txt')

for file in files:
    events = []
    monitors = []
    with open(file, 'r') as f:
        monitors.append(ast.literal_eval(f.readline().strip()))
        for line in f:
            event = ast.literal_eval(line.strip())
            events.append(event)

    # Convert events to a list of dictionaries
    width, height = monitors[0][1], monitors[0][2]
    event_list = []
    for event in events:
        if event[0] == 'move':
            event_dict = {'event_type': 'move', 'time': event[1], 'x': event[2]/width, 'y': event[3]/height}
        elif event[0] == 'click':
            event_dict = {'event_type': 'click', 'time': event[1], 'x': event[2]/width, 'y': event[3]/height,
                        'button': event[4], 'pressed': event[5]}
        elif event[0] == 'scroll':
            event_dict = {'event_type': 'scroll', 'time': event[1], 'x': event[2]/width, 'y': event[3]/height,
                        'dx': event[4], 'dy': event[5]}
        else:
            continue
        event_list.append(event_dict)

    # Convert to DataFrame and sort by time
    df = pd.DataFrame(event_list)
    df = df.sort_values('time').reset_index(drop=True)

    # Step 2: Organize events into time windows
    start_time = df['time'].min()
    df['timestamp'] = ((df['time'] - start_time) // 0.1).astype(int)
    
    unique_timestamps = df['timestamp'].unique()
    blank_timestamps = [i for i in range(unique_timestamps.min(), unique_timestamps.max() + 1) if i not in unique_timestamps]
    
    # insert blank rows for missing timestamps
    new_rows = []
    for ts in blank_timestamps:
        new_rows.append({'timestamp': ts})
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
    df = df.sort_values('timestamp').reset_index(drop=True).fillna(0)
    

    # Step 3: Extract features for each time window
    # Apply the feature extraction function to each time window
    features_df = df.groupby('timestamp').apply(compute_features, include_groups=False).reset_index(drop=True)
    

    features_df.to_csv(file.replace('.txt', '_features.csv').replace("recordings", "train_data"), index=False)
    print(f"Features saved to {file.replace('.txt', '_features.csv')}")

Features saved to recordings/web_browsing_mouse_events_HJ_features.csv
Features saved to recordings/web_browsing_mouse_events_jungwoo_features.csv
Features saved to recordings/youtube_mouse_events_0_features.csv
Features saved to recordings/chess_mouse_events_HJ_features.csv
Features saved to recordings/web_browsing_mouse_events_0_features.csv
Features saved to recordings/finding_mines_mouse_events_jungwoo_features.csv
Features saved to recordings/youtube_mouse_events_jungwoo_features.csv
Features saved to recordings/reading_mouse_events_HJ_features.csv
Features saved to recordings/web_browsing_mouse_events_woohyun_features.csv
Features saved to recordings/chatting_mouse_events_HJ_features.csv
Features saved to recordings/reading_thesis_mouse_events_woohyun_features.csv
Features saved to recordings/youtube_mouse_events_HJ_features.csv
Features saved to recordings/finding_mines_mouse_events_HJ_features.csv
Features saved to recordings/ppt_mouse_events_HJ_features.csv
Features saved to r

## Trainer

In [1]:
classification_classes = {
  "web_browsing": 0, # web
  # "ppt": 1, # design tools,
  # "reading": 2, # reading papers
  "finding_mines": 3, # game
  "chess": 3, # game
  "youtube": 4, # youtube
  # "chatting": 5, # chatting
}

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from imblearn.over_sampling import RandomOverSampler

def process_class_data(file, window_size):
    df = pd.read_csv(file)
    df = df.ffill().bfill()
    df = df.dropna()
    # Extract windows
    windows = []
    num_rows = df.shape[0]
    window_interval = max(window_size // 2, 1)
    for i in range(0, num_rows - window_size + 1, window_interval):
        window = df.iloc[i:i+window_size]
        # Extract features
        features = window.values.flatten()
        windows.append(features)
    return windows

def prepare_data(window_size=50):
    # Process the data for each class
    X_train = []
    X_test = []
    y_train = []
    y_test = []
    for tag, label in classification_classes.items():
        files = glob(f'train_data/{tag}_*.csv')
        for file in files:
            # print(f'Processing {file} for class {tag}')
            windows = process_class_data(file, window_size)
            index = int(0.8 * len(windows))
            X_train.extend(windows[:index])
            X_test.extend(windows[index:])
            y_train.extend([label] * index)
            y_test.extend([label] * (len(windows) - index))

    X_train = np.array(X_train)
    X_test = np.array(X_test)
    y_train = np.array(y_train)
    y_test = np.array(y_test)

    X_train = np.nan_to_num(X_train, posinf=np.finfo(np.float32).max, neginf=np.finfo(np.float32).min)
    X_test = np.nan_to_num(X_test, posinf=np.finfo(np.float32).max, neginf=np.finfo(np.float32).min)

    # Normalize the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Print per-class counts
    # for label, count in zip(*np.unique(y_train, return_counts=True)):
    #     print(f'Class {label}: {count} samples')

    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)

    # print(f'X_train shape: {X_train.shape} -> {X_resampled.shape}')
    # print(f'X_test shape: {X_test.shape}')
    return X_resampled, X_test, y_resampled, y_test

In [3]:
X_resampled, X_test, y_resampled, y_test = prepare_data(1)
X_resampled.shape, X_test.shape, y_resampled.shape, y_test.shape

NameError: name 'glob' is not defined

In [52]:
from sklearn.metrics import confusion_matrix

def train(model_name, window_size):
    if model_name == 'boosting':
        model = GradientBoostingClassifier(random_state=42, verbose=1)
    elif model_name == 'random_forest':
        model = RandomForestClassifier(random_state=42)
    elif model_name == 'svm':
        model = SVC(random_state=42)
    elif model_name == 'logistic_regression':
        model = LogisticRegression(tol=1e-3)
    elif model_name == 'poly_regression':
        model = make_pipeline(PolynomialFeatures(degree=2), LogisticRegression())
    elif model_name == 'knn':
        model = KNeighborsClassifier()
    else:
        raise ValueError(f"Unknown MODEL: {model_name}")

    X_resampled, X_test, y_resampled, y_test = prepare_data(window_size)
    
    model.fit(X_resampled, y_resampled)

    # Predict on the test set and evaluate performance
    y_pred = model.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='weighted')
    accuracy = accuracy_score(y_test, y_pred)
    # return conf matrix
    cm = confusion_matrix(y_test, y_pred)
    
    return accuracy, f1, cm

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

category_names = []
for label in range(6):
  name = ""
  for tag, l in classification_classes.items():
    if l == label:
      name += tag + "/"
  category_names.append(name)

# Options: 'boosting', 'random_forest', 'svm', 'logistic_regression', 'poly_regression', 'knn'
for model_name in ['boosting', 'random_forest', 'logistic_regression', 'knn', 'poly_regression', 'svm']: 
  for window_size in [50, 100, 600]:
    acc, f1, cm = train(model_name, window_size)
    print(f"Model: {model_name}, Window Size: {window_size}, Accuracy: {acc}, F1: {f1}")
    # plot cm
    sns.heatmap(cm, annot=True, fmt='g', xticklabels=category_names, yticklabels=category_names)
    plt.show()

NameError: name 'train' is not defined