In [1]:
import logging
import warnings

warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

# disable scikit-learn warnings
logger = logging.getLogger('sklearn')
logger.setLevel(logging.ERROR)

import requests
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

detail_url = 'http://minio-api.kubby.ninja/fix-dive-storage/projects/1234567890/playlists/low_details.json'

video_details = requests.get(detail_url).json()

eye_url = 'http://minio-api.kubby.ninja/fix-dive-storage/projects/1234567890/features/eye_tracking/eye_tracking.gzip.parquet'

df = pd.read_parquet(eye_url)

#columns = ['frame_index', 'progress', 'eye_x', 'eye_y']

# set column names
# df.columns = columns

df.sort_values(by=['progress'], inplace=True)

# if duration is 1 minute 10 seconds, get progress_secs column
df['progress_secs'] = df['progress'] * float(video_details['format']['duration'])


df['true_x'] = df['eye_x'] * video_details['streams'][0]['width']
df['true_y'] = df['eye_y'] * video_details['streams'][0]['height']

df

Unnamed: 0,progress,eye_x,eye_y,progress_secs,true_x,true_y
0,3.640777e-07,0.037497,0.918590,0.000025,71.994026,992.076714
1,5.840971e-04,0.049825,0.905528,0.039456,95.664704,977.970319
2,1.213573e-03,0.058277,0.905399,0.081977,111.891105,977.830419
3,1.798660e-03,0.067983,0.909321,0.121499,130.527208,982.066648
4,2.382437e-03,0.042592,0.941887,0.160934,81.776300,1017.238239
...,...,...,...,...,...,...
1627,9.975233e-01,0.533198,0.340407,67.382698,1023.739597,367.639689
1628,9.981646e-01,0.540227,0.360381,67.426020,1037.235194,389.211002
1629,9.987713e-01,0.547215,0.385601,67.467001,1050.652846,416.449541
1630,9.993781e-01,0.567614,0.384825,67.507990,1089.818181,415.610479


In [2]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
import seaborn as sns

def kmeans_inertia(scaled_data, k, alpha_k=0.02):
    inertia_o = np.square((scaled_data - scaled_data.mean(axis=0))).sum()
    kmeans = KMeans(n_clusters=k, random_state=0).fit(scaled_data)
    scaled_inertia = kmeans.inertia_ / inertia_o + alpha_k * k
    return scaled_inertia


def best_kmeans(scaled_data, k_range):
    ans = []
    for k in k_range:
        scaled_inertia = kmeans_inertia(scaled_data, k)
        ans.append((k, scaled_inertia))
    results = pd.DataFrame(ans, columns = ['k','Scaled Inertia']).set_index('k')
    best_k = results.idxmin()[0]
    return best_k, results

In [9]:
# group every 5 seconds
def cluster(subdf):
    scaler = StandardScaler()

    x = subdf[['true_x', 'true_y']].values
    #x = scaler.fit_transform(x)
    
    best_k, _ = best_kmeans(x, range(2, 10))
    model = KMeans(n_clusters=best_k, random_state=42)
    model.fit(x)
    
    # sns.scatterplot(
    #     x=x[:,0],
    #     y=x[:,1],
    #     hue=model.labels_,
    #     s=10,
    #     edgecolor='black',
    #     palette='rainbow',
    # )
    # sns.scatterplot(
    #     x=model.cluster_centers_[:,0],
    #     y=model.cluster_centers_[:,1],
    #     s=100,
    #     color='black',
    #     marker='x',
    #     edgecolor='black',
    # )
    # plt.show()

    return model.cluster_centers_

video_duration = float(video_details['format']['duration'])

groups = df\
    .groupby(pd.cut(df['progress_secs'], np.arange(0, video_duration, 3)))\
    .apply(cluster)\
    .reset_index()
    
groups

#for group in groups:
#    subdf = group[1]
    
    # sns.scatterplot(
    #     x=x[:,0],
    #     y=x[:,1],
    #     hue=model.labels_,
    #     s=10,
    #     edgecolor='black',
    #     palette='rainbow',
    # )
    # sns.scatterplot(
    #     x=model.cluster_centers_[:,0],
    #     y=model.cluster_centers_[:,1],
    #     s=100,
    #     color='black',
    #     marker='x',
    #     edgecolor='black',
    # )
        
    # plt.show()

Unnamed: 0,progress_secs,0
0,"(0.0, 3.0]","[[769.0625808120327, 473.44312124304656], [107..."
1,"(3.0, 6.0]","[[1256.0468474479387, 528.913453937847], [944...."
2,"(6.0, 9.0]","[[846.8916111142593, 504.1573101452816], [882...."
3,"(9.0, 12.0]","[[898.0892731685129, 665.89492956348], [802.28..."
4,"(12.0, 15.0]","[[850.2564608221638, 361.8593614731144], [906...."
5,"(15.0, 18.0]","[[921.1950375699279, 216.0116131488576], [1357..."
6,"(18.0, 21.0]","[[1145.2867646247994, 687.1491453831051], [875..."
7,"(21.0, 24.0]","[[1257.5405227786464, 861.0410283381934], [684..."
8,"(24.0, 27.0]","[[1251.8306091216753, 434.6496999986857], [945..."
9,"(27.0, 30.0]","[[1027.532614689692, 462.4347442754419], [1314..."


In [15]:
import s3fs

# rename columns
groups.columns = ['progress_spans', 'cluster_center']

# explode cluster_center column
gdf = groups.explode('cluster_center')

gdf['cluster_x'] = gdf['cluster_center'].apply(lambda x: x[0])
gdf['cluster_y'] = gdf['cluster_center'].apply(lambda x: x[1])
gdf['progress_start'] = gdf['progress_spans'].apply(lambda x: x.left)
gdf['progress_end'] = gdf['progress_spans'].apply(lambda x: x.right)
gdf.drop(columns=['cluster_center', 'progress_spans'], inplace=True)

gdf

Unnamed: 0,cluster_x,cluster_y,progress_start,progress_end
0,769.062581,473.443121,0.0,3.0
0,1070.917407,385.534587,0.0,3.0
0,132.946071,978.535767,0.0,3.0
0,1249.960953,478.164831,0.0,3.0
1,1256.046847,528.913454,3.0,6.0
...,...,...,...,...
21,801.465248,341.289662,63.0,66.0
21,137.645475,408.026506,63.0,66.0
21,461.357594,420.266261,63.0,66.0
21,850.862048,449.651615,63.0,66.0


In [None]:
s3_client = s3fs.S3FileSystem(
    anon=False,
    key='admin',
    secret='admin123',
    use_ssl=False,
    client_kwargs={
        'endpoint_url': 'http://172.23.0.100:30140',
    }
)

cluster_file = 'fix-dive-storage/projects/1234567890/features/eye_tracking/eye_tracking_clusters.parquet.gzip'

with s3_client.open(cluster_file, 'wb') as f:
    gdf.to_parquet(f, index=False, compression='gzip')

In [4]:
import cv2 

video_url = 'http://minio-api.kubby.ninja/fix-dive-storage/projects/1234567890/playlists/low.m3u8'

capture = cv2.VideoCapture(video_url)

group_index = 0

# play video at native speed
while capture.isOpened():
    success, frame = capture.read()

    if not success:
        break
        
    # get current progress
    progress = capture.get(cv2.CAP_PROP_POS_MSEC) / 1000
    
    if group_index+1 < len(groups) and progress >= groups.index[group_index+1].left:
        group_index += 1
        print(group_index, groups.index[group_index].left)
        
    # get current cluster
    group = groups.iloc[group_index]
    
    for point in group:
        x = point[0]
        y = point[1]
        # draw circle at point
        cv2.circle(frame, (int(x), int(y)), 10, (0, 0, 255), -1)
    
    # cv2.imshow('frame', frame)
    # if cv2.waitKey(1) & 0xFF == ord('q'):
    #     break

qt.qpa.plugin: Could not find the Qt platform plugin "wayland" in "/home/jack/.local/share/virtualenvs/datadev-5ox7fytP/lib/python3.10/site-packages/cv2/qt/plugins"


1 3.0
2 6.0
3 9.0
4 12.0
5 15.0
6 18.0
7 21.0
8 24.0
9 27.0
10 30.0
11 33.0
12 36.0
13 39.0
14 42.0
15 45.0
16 48.0
17 51.0
18 54.0
19 57.0
20 60.0
21 63.0


: 