## Dataset Formation

In [19]:
import pandas as pd
import ast
import numpy as np
from glob import glob

In [27]:
def compute_features(group):
    features = {}
    # Time window identifier
    # features["timestamp"] = group.name
    
    # Number of events
    features["num_events"] = len(group)
    # Extract move events
    move_events = group[group["event_type"] == "move"]
    features["num_moves"] = len(move_events)
    # Extract click events
    click_events = group[group["event_type"] == "click"]
    features["num_clicks"] = len(click_events)
    # Extract scroll events
    scroll_events = group[group["event_type"] == "scroll"]
    features["num_scrolls"] = len(scroll_events)

    # Compute movement distance and related features
    if len(move_events) >= 2:
        move_events = move_events.sort_values("time")
        x = move_events["x"].values
        y = move_events["y"].values
        t = move_events["time"].values
        dx = np.diff(x)
        dy = np.diff(y)
        dt = np.diff(t)
        distances = np.sqrt(dx**2 + dy**2)
        total_distance = np.sum(distances)
        features["movement_distance"] = total_distance

        # Velocity calculations
        velocities = distances / dt
        features["velocity_mean"] = np.mean(velocities)
        features["velocity_max"] = np.max(velocities)
        features["velocity_min"] = np.min(velocities)
        features["velocity_sd"] = np.std(velocities)
        features["velocity_x_mean"] = np.mean(dx / dt)
        features["velocity_y_mean"] = np.mean(dy / dt)

        # Acceleration calculations
        if len(velocities) >= 2:
            dv = np.diff(velocities)
            dv_dt = dv / dt[1:]  # Adjust time differences
            features["acceleration_mean"] = np.mean(dv_dt)
            features["acceleration_max"] = np.max(dv_dt)
            features["acceleration_min"] = np.min(dv_dt)
            features["acceleration_sd"] = np.std(dv_dt)

            # Jerk calculations
            if len(dv_dt) >= 2:
                da = np.diff(dv_dt)
                da_dt = da / dt[2:]
                features["jerk_mean"] = np.mean(da_dt)
                features["jerk_sd"] = np.std(da_dt)
            else:
                features["jerk_mean"] = np.nan
                features["jerk_sd"] = np.nan
        else:
            features.update(
                {
                    "acceleration_mean": np.nan,
                    "acceleration_max": np.nan,
                    "acceleration_min": np.nan,
                    "acceleration_sd": np.nan,
                    "jerk_mean": np.nan,
                    "jerk_sd": np.nan,
                }
            )

        # Angular velocity calculations
        angles = np.arctan2(dy, dx)
        d_angle = np.diff(angles)
        d_angle = (d_angle + np.pi) % (2 * np.pi) - np.pi  # Normalize angles
        angular_velocity = d_angle / dt[1:]
        features["angular_velocity_mean"] = np.mean(angular_velocity)
        features["angular_velocity_sd"] = np.std(angular_velocity)
    else:
        features.update(
            {
                "movement_distance": 0,
                "velocity_mean": np.nan,
                "velocity_max": np.nan,
                "velocity_min": np.nan,
                "velocity_sd": np.nan,
                "velocity_x_mean": np.nan,
                "velocity_y_mean": np.nan,
                "acceleration_mean": np.nan,
                "acceleration_max": np.nan,
                "acceleration_min": np.nan,
                "acceleration_sd": np.nan,
                "jerk_mean": np.nan,
                "jerk_sd": np.nan,
                "angular_velocity_mean": np.nan,
                "angular_velocity_sd": np.nan,
            }
        )

    # Movement duration
    if len(move_events) >= 1:
        features["movement_duration"] = (
            move_events["time"].max() - move_events["time"].min()
        )
    else:
        features["movement_duration"] = 0

    # Pause time (idle cursor time)
    total_time = group["time"].max() - group["time"].min()
    features["total_time"] = total_time
    features["pause_time"] = total_time - features["movement_duration"]

    # Flips (directional changes)
    if len(move_events) >= 2 and len(dx) >= 2:
        features["flips_x"] = np.sum(np.diff(np.sign(dx)) != 0)
        features["flips_y"] = np.sum(np.diff(np.sign(dy)) != 0)
    else:
        features["flips_x"] = 0
        features["flips_y"] = 0

    # Number of pauses (idle periods)
    if len(move_events) >= 2:
        time_diffs = np.diff(move_events["time"].values)
        idle_threshold = 0.2  # Define a threshold for idle time
        pauses = time_diffs[time_diffs > idle_threshold]
        features["pause_count"] = len(pauses)
    else:
        features["pause_count"] = 0

    # Hold time for clicks
    if len(click_events) >= 1:
        pressed_events = click_events[click_events["pressed"] == True]
        released_events = click_events[click_events["pressed"] == False]
        if len(pressed_events) == len(released_events):
            hold_times = released_events["time"].values - pressed_events["time"].values
            features["hold_time_mean"] = np.mean(hold_times)
            features["hold_time_sd"] = np.std(hold_times)
        else:
            features["hold_time_mean"] = np.nan
            features["hold_time_sd"] = np.nan
    else:
        features["hold_time_mean"] = np.nan
        features["hold_time_sd"] = np.nan

    return pd.Series(features)

In [29]:
files = glob('recordings/*.txt')

for file in files:
    events = []
    monitors = []
    with open(file, 'r') as f:
        monitors.append(ast.literal_eval(f.readline().strip()))
        for line in f:
            event = ast.literal_eval(line.strip())
            events.append(event)

    # Convert events to a list of dictionaries
    width, height = monitors[0][1], monitors[0][2]
    event_list = []
    for event in events:
        if event[0] == 'move':
            event_dict = {'event_type': 'move', 'time': event[1], 'x': event[2]/width, 'y': event[3]/height}
        elif event[0] == 'click':
            event_dict = {'event_type': 'click', 'time': event[1], 'x': event[2]/width, 'y': event[3]/height,
                        'button': event[4], 'pressed': event[5]}
        elif event[0] == 'scroll':
            event_dict = {'event_type': 'scroll', 'time': event[1], 'x': event[2]/width, 'y': event[3]/height,
                        'dx': event[4], 'dy': event[5]}
        else:
            continue
        event_list.append(event_dict)

    # Convert to DataFrame and sort by time
    df = pd.DataFrame(event_list)
    df = df.sort_values('time').reset_index(drop=True)

    # Step 2: Organize events into time windows
    start_time = df['time'].min()
    df['timestamp'] = ((df['time'] - start_time) // 0.1).astype(int)

    # Step 3: Extract features for each time window
    # Apply the feature extraction function to each time window
    features_df = df.groupby('timestamp').apply(compute_features).reset_index(drop=True)

    features_df.to_csv(file.replace('.txt', '_features.csv'), index=False)
    print(f"Features saved to {file.replace('.txt', '_features.csv')}")

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  features_df = df.groupby('timestamp').apply(compute_features).reset_index(drop=True)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)


Features saved to recordings/youtube_mouse_events_0_features.csv
Features saved to recordings/web_browsing_mouse_events_0_features.csv


  features_df = df.groupby('timestamp').apply(compute_features).reset_index(drop=True)


## Trainer

In [36]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler


def process_class_data(file, label, K):
    df = pd.read_csv(file)
    df['label'] = label
    # Handle missing values
    df = df.fillna(method='ffill').fillna(method='bfill')
    df = df.dropna()
    # Extract windows
    windows = []
    num_rows = df.shape[0]
    for i in range(0, num_rows - K + 1):
        window = df.iloc[i:i+K]
        # Extract features
        features = window.values.flatten()
        windows.append(features)
    return windows

# Define the class files and window length
class_files = {
    0: '/Users/jangsus1/Homeworks/ML-Fall24-Team7/recordings/web_browsing_mouse_events_0_features.csv',
    1: '/Users/jangsus1/Homeworks/ML-Fall24-Team7/recordings/youtube_mouse_events_0_features.csv',
}
K = 10  # Window length

# Process the data for each class
X_train = []
X_test = []
y_train = []
y_test = []
for label, file in class_files.items():
    windows = process_class_data(file, label, K)
    index = int(0.8 * len(windows))
    X_train.extend(windows[:index])
    X_test.extend(windows[index:])
    y_train.extend([label] * index)
    y_test.extend([label] * (len(windows) - index))

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

print(f'X_train shape: {X_train.shape}')
print(f'X_test shape: {X_test.shape}')

# normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Train the Gradient Boosting Classifier
clf = GradientBoostingClassifier()
clf.fit(X_train, y_train)

# Predict on the test set and evaluate performance
y_pred = clf.predict(X_test)
f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)
print(f'F1 Score: {f1}')
print(f'Accuracy: {accuracy}')

  df = df.fillna(method='ffill').fillna(method='bfill')
  df = df.fillna(method='ffill').fillna(method='bfill')


(715, 280) (715,) (179, 280) (179,)
X_train shape: (715, 280)
X_test shape: (179, 280)
F1 Score: 1.0
Accuracy: 1.0
