In [1]:
!pip install lightning torchvision

You should consider upgrading via the '/Users/haily/.pyenv/versions/3.10.4/bin/python3.10 -m pip install --upgrade pip' command.[0m[33m
[0m

In [2]:
import torch
import torch.nn as nn
import lightning as L
import pandas as pd
import torchvision
import numpy as np
import torch.nn.functional as F  
from torch.autograd import Variable
import math
from functools import partial
import pathlib
from torch.utils.data import DataLoader, Dataset
from torchvision.io import read_video
import lightning as L
from lightning.pytorch.loggers import CSVLogger
import torchmetrics
from lightning.pytorch.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from torchvision.transforms import CenterCrop, v2
from datetime import datetime
import csv
import glob
import os

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
id2Label = ['[i]', 'BackSpace', ',', '[s]', '.', 
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 
            'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 
            'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 
            'y', 'z']
label2Id  = {label: i for i, label in enumerate(id2Label)}

NUM_WORKERS = 4
f_after = 2 # number of frames after
f_before = 2 # number of frames before
gap = 2 # gap between idle video segment and non-idle video segment
total_window = f_after + f_before + 1

# Resnet

In [4]:
#### RESNET 3D #### 
def conv3x3x3(in_planes, out_planes, stride=1):
    # 3x3x3 convolution with padding
    return nn.Conv3d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)


def downsample_basic_block(x, planes, stride):
    out = F.avg_pool3d(x, kernel_size=1, stride=stride)
    zero_pads = torch.Tensor(out.size(0), planes - out.size(1), out.size(2), out.size(3), out.size(4) ).zero_()
    
    if isinstance(out.data, torch.cuda.FloatStorage): zero_pads = zero_pads.cuda()
    out = Variable(torch.cat([out.data, zero_pads], dim=1))
    return out

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm3d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3x3(planes, planes)
        self.bn2 = nn.BatchNorm3d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm3d(planes)

        self.conv2 = nn.Conv3d(
            planes, planes, 
            kernel_size=3, stride=stride, padding=1, 
            bias=False)
        self.bn2 = nn.BatchNorm3d(planes)
        
        self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm3d(planes * 4)
        
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet(nn.Module):
    def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400):
        """
        block: basic block or bottle neck
        layers: define Resnet architecture 34, 101, 152 etc
        sample size: image size
        shortcut_type: 'A' or 'B'
        num_classes: ...
        """
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), padding=(3, 3, 3), bias=False)
        self.bn1 = nn.BatchNorm3d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type)
        self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2)
        
        
        last_duration = int(math.ceil(sample_duration / 16))
        last_size = int(math.ceil(sample_size / 32))
        self.avgpool = nn.AvgPool3d(
            (last_duration, last_size, last_size), stride=1)
        
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')
            elif isinstance(m, nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, shortcut_type, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            if shortcut_type == 'A':
                downsample = partial(
                    downsample_basic_block,
                    planes=planes * block.expansion,
                    stride=stride)
            else:
                downsample = nn.Sequential(
                    nn.Conv3d(self.inplanes,planes * block.expansion,kernel_size=1,stride=stride,bias=False), 
                    nn.BatchNorm3d(planes * block.expansion))

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)

        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

def resnet10(**kwargs): return ResNet(BasicBlock, [1, 1, 1, 1], **kwargs)
def resnet18(**kwargs): return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
def resnet34(**kwargs): return ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
def resnet50(**kwargs): return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
def resnet101(**kwargs): return ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
def resnet152(**kwargs): return ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
def resnet200(**kwargs): return ResNet(Bottleneck, [3, 24, 36, 3], **kwargs)

# Lightning dataset

In [14]:
class KeyDataset(torch.utils.data.Dataset):
    def __init__(self, video_name, labels_dir, videos_dir):
        segments = []
        # Infer idle frames.
        self.labels_dir = labels_dir
        self.videos_dir = videos_dir
        df = pd.read_csv(f'{self.labels_dir}/{video_name}.csv')
        for index, row in df.iterrows():
            key_frame = int(row['Frame'])  # Frame number where key was pressed
            key_value = row['Key']  # Key pressed
            if key_value not in id2Label:
                key_value = '[s]'
            
            is_idle_before = False
            if index == 0:
                pos_start = max(key_frame - f_before, 0)
                pos_end = key_frame + f_after
                neg_start = 0
                neg_end = pos_start - gap
                is_idle_before = True
            else:
                prev_key_frame = df.iloc[index - 1]['Frame']
                pos_start = max(key_frame - f_before, 0)
                pos_end = key_frame + f_after
                prev_pos_end = prev_key_frame + f_after
                if (pos_start - prev_pos_end) - 1 >= (f_after + f_before + 1 + gap * 2):
                    neg_start =  prev_pos_end + gap
                    neg_end = pos_start - gap
                    is_idle_before = True
            
            
            # Negative class video segments before
            if is_idle_before:
                j = neg_start
                while (j + total_window - 1) <= neg_end:
                    segments.append(([j, j + total_window - 1], "[i]"))
                    j += total_window
            
            # Current video with keystroke
            segments.append(([pos_start, pos_end], key_value))
        
        self.video_name = video_name
        self.segments = segments
    def __len__(self):
        return len(self.segments)

    def __getitem__(self, idx):
        (start, end), label = self.segments[idx]
        
        frames = []
        for i in range(start, end + 1):
            image = torchvision.io.read_image(f"{self.videos_dir}/{self.video_name}/frame_{i}.jpg")
            
            frames.append(image)
       
        return torch.stack(frames), label2Id[label]
    
    def get_class_counts(self):
        labels = [segment[1] for segment in self.segments]
        unique_elements, counts = np.unique(labels, return_counts=True)
        occurrences = dict(zip(unique_elements, counts))
        weights = np.zeros(len(id2Label))
        for label, count in occurrences.items():
            weights[label2Id[label]] = count
        return weights

class KeyDataModule(L.LightningDataModule):
    def __init__(self, batch_size, labels_dir, videos_dir, train_vids, val_vids, test_vids):
        super().__init__()
        self.batch_size = batch_size
        self.train_datasets = [KeyDataset(video_name, labels_dir, videos_dir) for video_name in train_vids]
        self.val_datasets = [KeyDataset(video_name, labels_dir, videos_dir) for video_name in val_vids]
        self.test_datasets = [KeyDataset(video_name, labels_dir, videos_dir) for video_name in test_vids]
        
        self.train_dataset = torch.utils.data.ConcatDataset(self.train_datasets)
        self.test_dataset = torch.utils.data.ConcatDataset(self.test_datasets)
        self.val_dataset = torch.utils.data.ConcatDataset(self.val_datasets)
        
        
        
        print(f"Train: {len(self.train_dataset)}; Val: {len(self.val_dataset)}; Test: {len(self.test_dataset)}")
        
        train_counts = np.array(
            [d.get_class_counts() for d in self.train_datasets]).sum(axis=0)
        print(f"Train counts: {train_counts}")
        train_total_samples = np.array([len(d) for d in self.train_datasets]).sum(axis=0)
        self.train_weights = train_counts / (train_total_samples * len(id2Label))
                                        
    def train_dataloader(self):
        return DataLoader(self.train_dataset, 
                          batch_size=self.batch_size, 
                          num_workers=NUM_WORKERS)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset, 
                          batch_size=self.batch_size, 
                          num_workers=NUM_WORKERS,
                          shuffle=False)
    
    def test_dataloader(self):
        return DataLoader(self.test_dataset, 
                          batch_size=self.batch_size, 
                          num_workers=NUM_WORKERS,
                          shuffle=False)

class KeyClf(L.LightningModule):
    def __init__(self, img_size, num_classes, learning_rate, weights):
        super().__init__()
        self.model = resnet152(sample_size=img_size, 
                               sample_duration=total_window,
                               shortcut_type='B', 
                               num_classes=num_classes)
        
        self.loss_fn = torch.nn.CrossEntropyLoss(torch.tensor(weights).float())
        self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
        self.lr = learning_rate
        self.transforms = v2.Compose([
            v2.CenterCrop(img_size),
            v2.ToDtype(torch.float32, scale=True),
        ])
        
        self.test_preds = []
        self.test_targets = []
        self.save_hyperparameters()


    def test_step(self, batch):
        videos, targets = batch
        videos = self.transforms(videos)
        videos = videos.permute(0, 2, 1, 3, 4)
        preds = self.model(videos)

        pred_ids = torch.argmax(self.model(videos), dim=1)
        pred_labels = [id2Label[_id] for _id in pred_ids]
        self.test_preds += pred_labels
        self.test_targets += [id2Label[_id] for _id in targets]
        
        loss = self.loss_fn(preds, targets.long())
        self.log_dict({'test_loss': loss, 'test_acc': self.accuracy(preds, targets)})
       
    
    def on_test_end(self):
        print(classification_report(self.test_targets, self.test_preds))
        
    def training_step(self, batch):
        videos, targets = batch
        videos = self.transforms(videos)
        videos = videos.permute(0, 2, 1, 3, 4)
        preds = self.model(videos)
        loss = self.loss_fn(preds, targets.long())
        self.log_dict({'train_loss': loss, 'train_acc': self.accuracy(preds, targets)})
        return loss

    def validation_step(self, batch):
        videos, targets = batch
        videos = self.transforms(videos)
        videos = videos.permute(0, 2, 1, 3, 4)
        preds = self.model(videos)
        loss = self.loss_fn(preds, targets.long())
        self.log_dict({'val_loss': loss, 'val_acc': self.accuracy(preds, targets)})
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters(), lr=self.lr)

## Train

In [None]:
# dm = KeyDataModule(batch_size=8, 
#                    labels_dir='/kaggle/input/keystroke/labels',
#                    videos_dir="/kaggle/input/keystroke/raw_frames_320",
#                    train_vids=[
#                        'video_1', 'video_2', 'video_3', 'video_4', 'video_5', 
#                         'video_6', 'video_7', 'video_8', 'video_9', 'video_10',
#                        'video_11', 'video_12', 'video_13', 'video_14', 'video_15', 
#                        'video_16', 'video_17', 'video_18', 'video_19',
#                        'video_21', 'video_22', 'video_23', 'video_24', 'video_25', 
#                        'video_26', 'video_27', 'video_28', 'video_29', 'video_30'], 
#                    val_vids=['video_31', 'video_32', 'video_33'],
#                    test_vids=['video_34','video_35', 'video_36'])
# model = KeyClf(img_size=320, num_classes=len(id2Label), learning_rate=0.001, weights=dm.train_weights)
# logger = CSVLogger("logs", name=f"resnet_152_320_8", flush_logs_every_n_steps=1)
# trainer = L.Trainer(
#     # deterministic=True,
#     devices=[0, 1],
#     accelerator="gpu",
#     fast_dev_run=False,
#     callbacks=[EarlyStopping(monitor="val_loss", patience=10)],
#     logger=logger
# )
# trainer.fit(model, dm, ckpt_path="/kaggle/input/keyclf/pytorch/v3/1/epoch20-step101850.ckpt")
# trainer.test(model, dm)

Train: 77589; Val: 7168; Test: 4377
Train counts: [29360.  4182.   532.  7038.   376.  2938.   590.  1222.  1179.  4348.
   682.   734.  1228.  2825.   407.   363.  1573.   850.  2352.  2539.
   917.   346.  2248.  1843.  2776.  1417.   405.   587.   497.   779.
   456.]


  m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')
INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
INFO: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
INFO: Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
INFO: ----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

INFO: Rest

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('val_acc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
INFO: Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
INFO: ----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

INFO: LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:215: Using `DistributedSampler` with the dataloaders. During `trainer.test()`, it is recommended to use `Trainer(devices=1, num_nodes=1)` to ensure each sample/batch gets evaluated exactly once. Otherwise, multi-device settings use `DistributedSampler` that replicates some samples to make sure all devices have same batch size in case of uneven inputs.


Testing: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('test_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('test_acc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ,       0.78      0.88      0.82         8
           .       0.00      0.00      0.00        21
   BackSpace       0.93      0.97      0.95       134
         [i]       0.75      0.93      0.83       567
         [s]       0.95      0.77      0.85       231
           a       0.86      0.79      0.82        98
           b       0.78      0.78      0.78         9
           c       0.92      0.82      0.87        40
           d       0.97      0.84      0.90        43
           e       0.86      0.90      0.88       163
           f       0.92      0.96      0.94        23
           g       0.85      0.55      0.67        20
           h       0.90      0.81      0.85        32
           i       0.86      0.98      0.92        89
           j       0.00      0.00      0.00         2
           k       1.00      0.38      0.56        13
           l       0.94      0.66      0.78        47
           m       0.75    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ,       0.86      0.86      0.86         7
           .       0.00      0.00      0.00        11
   BackSpace       0.94      0.97      0.96       147
         [i]       0.77      0.95      0.85       549
         [s]       0.97      0.78      0.87       246
           a       0.81      0.84      0.83       109
           b       0.70      0.64      0.67        11
           c       0.94      0.92      0.93        51
           d       0.97      0.75      0.85        40
           e       0.86      0.93      0.89       148
           f       0.88      0.78      0.82        18
           g       0.95      0.69      0.80        26
           h       0.89      0.82      0.85        38
           i       0.95      0.90      0.93       113
           j       0.00      0.00      0.00         2
           k       1.00      0.75      0.86         8
           l       1.00      0.55      0.71        42
           m       0.77    

[{'test_loss': 0.3296966552734375, 'test_acc': 0.8542713522911072}]

## Resume training state

In [None]:
# dm = KeyDataModule(batch_size=8, 
#                    labels_dir='/kaggle/input/keystroke/labels',
#                    videos_dir="/kaggle/input/keystroke/raw_frames_320",
#                    train_vids=[
#                        'video_1', 'video_2', 'video_3', 'video_4', 'video_5', 
#                         'video_6', 'video_7', 'video_8', 'video_9', 'video_10',
#                        'video_11', 'video_12', 'video_13', 'video_14', 'video_15', 
#                        'video_16', 'video_17', 'video_18', 'video_19',
#                        'video_21', 'video_22', 'video_23', 'video_24', 'video_25', 
#                        'video_26', 'video_27', 'video_28', 'video_29', 'video_30'], 
#                    val_vids=['video_31', 'video_32', 'video_33'], 
#                    test_vids=['video_34','video_35', 'video_36'])
# model = KeyClf.load_from_checkpoint("/kaggle/input/key-clf-smaller/pytorch/v1/1/smaller-epoch10-step53350.ckpt")
# trainer = L.Trainer(
#     # deterministic=True,
#     accelerator="tpu",
#     fast_dev_run=False,
#     max_time="00:11:00:00",
#     callbacks=[EarlyStopping(monitor="val_loss", patience=10)]
# )
# trainer.fit(model, dm)
# trainer.test(model, dm)

## Test

In [15]:
def levenshtein_distance(s1, s2):
    # Initialize a matrix with zeros
    dp = np.zeros((len(s1) + 1, len(s2) + 1), dtype=int)
    
    # Initialize first row and column
    for i in range(len(s1) + 1):
        dp[i, 0] = i
    for j in range(len(s2) + 1):
        dp[0, j] = j
    
    # Fill the matrix
    for i in range(1, len(s1) + 1):
        for j in range(1, len(s2) + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            dp[i, j] = min(dp[i - 1, j] + 1,      # deletion
                           dp[i, j - 1] + 1,      # insertion
                           dp[i - 1, j - 1] + cost)  # substitution
    
    # Return the edit distance
    return dp[len(s1), len(s2)]

def similarity_percentage(s1, s2):
    # Calculate the Levenshtein distance
    distance = levenshtein_distance(s1, s2)
    # Calculate the maximum possible distance
    max_distance = max(len(s1), len(s2))
    # Calculate similarity as a percentage
    similarity = 1 - distance / max_distance  
    # Convert to percentage
    similarity_percentage = similarity * 100
    return similarity_percentage

def process_prediction(preds, prob_min = 0.5):
    pred_text = []
    all_keys = preds.iloc[:,1].to_list()
    all_probs = preds.iloc[:, 2].to_list()
    i = 0
    while i < len(preds):
        # Get all the similar keys next to it
        curr_key = all_keys[i]
        # print(''.join(pred_text))
        # print('curr_key: ', curr_key)
        j = i + 1
        while j < len(preds):
            if all_keys[j] != curr_key: 
                j = j - 1
                break
            j += 1
        
        occurences = j - i + 1
        # print('occurences: ', occurences)
        
        # Idle keys => skip
        if curr_key == '[i]':
            i = j + 1
            continue

        # Backspace => count how many consecutive bs keys. For 4, remove one prev key 
        elif curr_key == 'BackSpace':
            deletions = occurences // 4
            
            for _ in range(deletions):
                if len(pred_text):
                    pred_text.pop()
            

        # Other character, less than 4 occurces append once. 
        # Else, for the every next 2 occurrence append once (because people press 2 same keys faster)
        else:
            if curr_key == '[s]': curr_key = " "
            elif curr_key == '[,]': curr_key = ','
            elif curr_key == '[.]': curr_key = '.'
            
            if occurences == 1: 
                if all_probs[i] >= prob_min:
                    pred_text.append(curr_key)
                
            
            elif occurences <= 4:
                if max(all_probs[i:j+1]) >= prob_min:
                    pred_text.append(curr_key)
                
            else:
                pred_text.append(curr_key)
                for _ in range((occurences - 4) // 4):
                    pred_text.append(curr_key)
        
        i = j + 1
    

    return ''.join(pred_text)

def test_video(trained_model, video_dir, ground_truth_dir, video_name, total_window=5):
    images = glob.glob(f'{video_dir}/{video_name}/*.jpg')
    frames = []
    for i in range(len(images)):
        image = torchvision.io.read_image(f"{video_dir}/{video_name}/frame_{i}.jpg")
        frames.append(image)
    frames = torch.stack(frames)

    keys = []
    frame_no = []
    probs = []
    trained_model.freeze()
    for i in range(0, len(frames)):
        video = trained_model.transforms(frames[i: i + total_window])
        video = video.permute(1, 0, 2, 3)
        out = F.softmax(trained_model.model(video.unsqueeze(0)), 1)[0]
        _id = torch.argmax(out).item()
        label = id2Label[_id]
        
        print(f"Frame {i};{label};{out[_id]}")
        frame_no.append(i)
        keys.append(label)
        probs.append(out[_id].item())
    
    df = pd.DataFrame({
        'Frame': frame_no,
        'Key': keys,
        'Prob': probs,
    })
    os.makedirs('./datasets/test-result', exist_ok=True)
    df.to_csv(f'./datasets/test-result/{video_name}.csv', sep=';', index=False)

    pred_text = process_prediction(df)
    print('pred_text: ', pred_text)
    
    if ground_truth_dir:
        ground_truth = open(f'{ground_truth_dir}/{video_name}.txt', 'r').read()
        similarity = similarity_percentage(pred_text, ground_truth)
        print('similarity: ', similarity)
    
    open(f"./datasets/test-result/{video_name}_processed.txt", 'w').write(pred_text)

trained_model = KeyClf.load_from_checkpoint("ckpts/resnet-epoch=22-step=111550.ckpt")

test_video(trained_model, video_dir = 'datasets/raw_frames', 
           video_name='video_37',
           ground_truth_dir = 'datasets/ground_truths')

/Users/haily/.pyenv/versions/3.10.4/lib/python3.10/site-packages/lightning/pytorch/utilities/migration/utils.py:56: The loaded checkpoint was produced with Lightning v2.3.3, which is newer than your current Lightning version: v2.3.0
  m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')


Frame 0;[i];0.9902792572975159
Frame 1;[i];0.9919143915176392
Frame 2;[i];0.990016758441925
Frame 3;[i];0.9899107813835144
Frame 4;[i];0.9924821257591248
Frame 5;[i];0.9955222606658936
Frame 6;[i];0.9942843317985535
Frame 7;[i];0.9969844222068787
Frame 8;[i];0.9920719265937805
Frame 9;[i];0.9966993927955627
Frame 10;[i];0.9978340268135071
Frame 11;[i];0.9973853230476379
Frame 12;[i];0.9934138655662537
Frame 13;[i];0.9949801564216614


In [13]:
class OldKeyClf(L.LightningModule):
    def __init__(self, img_size, num_classes, learning_rate, weights):
        super().__init__()
        self.model = resnet101(sample_size=img_size, 
                               sample_duration=8,
                               shortcut_type='B', 
                               num_classes=num_classes)
        
        self.loss_fn = torch.nn.CrossEntropyLoss(torch.tensor(weights).float())
        self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
        self.lr = learning_rate
        self.transforms = v2.Compose([
            v2.CenterCrop(img_size),
            v2.ToDtype(torch.float32, scale=True),
        ])
        
        self.test_preds = []
        self.test_targets = []
        self.save_hyperparameters()


    def test_step(self, batch):
        videos, targets = batch
        videos = self.transforms(videos)
        videos = videos.permute(0, 2, 1, 3, 4)
        preds = self.model(videos)

        pred_ids = torch.argmax(self.model(videos), dim=1).squeeze()
        pred_labels = [id2Label[_id] for _id in pred_ids]
        self.test_preds += pred_labels
        self.test_targets += [id2Label[_id] for _id in targets]
        
        loss = self.loss_fn(preds, targets.long())
        self.log_dict({'test_acc': self.accuracy(preds, targets), 'test_loss': loss})
    
    def on_test_end(self):
        print(classification_report(self.test_targets, self.test_preds))
        
    def training_step(self, batch):
        videos, targets = batch
        videos = self.transforms(videos)
        videos = videos.permute(0, 2, 1, 3, 4)
        preds = self.model(videos)
        loss = self.loss_fn(preds, targets.long())
        self.log_dict({"train_loss": loss, "train_acc": self.accuracy(preds, targets)})
        return loss

    def validation_step(self, batch):
        videos, targets = batch
        videos = self.transforms(videos)
        videos = videos.permute(0, 2, 1, 3, 4)
        preds = self.model(videos)
        loss = self.loss_fn(preds, targets.long())
        self.log_dict({"val_loss": loss, "val_acc": self.accuracy(preds, targets)})
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters(), lr=self.lr)
    
trained_model = OldKeyClf.load_from_checkpoint("ckpts/res-8-frames-epoch=7-step=34979-full.ckpt")

test_video(trained_model, video_dir = 'datasets/raw_frames', 
           video_name='video_37',
           ground_truth_dir = 'datasets/ground_truths',
           total_window = 8)

  m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')


Frame 0;[i];0.9882051944732666
Frame 1;[i];0.9879647493362427
Frame 2;[i];0.9872369766235352
Frame 3;[i];0.9889483451843262
Frame 4;[i];0.9912103414535522
Frame 5;[i];0.9886900782585144
Frame 6;[i];0.9908220767974854
Frame 7;[i];0.9924551248550415
Frame 8;[i];0.989829957485199
Frame 9;[i];0.9900252819061279
Frame 10;[i];0.9926048517227173
Frame 11;[i];0.9934038519859314
Frame 12;[i];0.9928359389305115
Frame 13;[i];0.9899583458900452
Frame 14;[i];0.9843432903289795
Frame 15;[i];0.9875625371932983
Frame 16;[i];0.98972088098526
Frame 17;[i];0.9913150668144226
Frame 18;[i];0.9855330586433411
Frame 19;[i];0.9746261239051819
Frame 20;[i];0.977071225643158
Frame 21;[i];0.9798228144645691
Frame 22;[i];0.9858381152153015
Frame 23;[i];0.9883211255073547
Frame 24;[i];0.9895350337028503
Frame 25;[i];0.9882431626319885
Frame 26;[i];0.9884276986122131
Frame 27;[i];0.9878906011581421
Frame 28;[i];0.9897312521934509
Frame 29;[i];0.9880654811859131
Frame 30;[i];0.9886268973350525
Frame 31;[i];0.9869325

FileNotFoundError: [Errno 2] No such file or directory: './test-result/video_37_processed.txt'

In [None]:
# pred_text = 'dr team, i op ouou al ha  fnnasatic time t our recent empohye pprcition ent  t s ta tryuuuly truy a plarsr to s eerne scome to ggehr nd nouy fte estivities . our entunnusism nd ositi enrgy md th ent a resounding succes   we nt ed to ta e  moment to eextend our  herrt  et grrduttidtde to ec of o r hrd ror and ddicttiion  cbent s e thes r sa sml tlon of pprcition or  thde incrdil eort uou put intol ouyourrr ros eer  d d s e mo forrd  s crrt forrdd ths sprit of cmrderies  nd temor  o r contriutions r inuatl to our commpny succs  and  loo orwrd to chingieing  eem greate moile satoonnes gtotogter once gain  , thnn uo fo bining an intergra  prt of our tem here s to continu d scscuc c es nd mn  more mmorae moments esd b st reggards t'
# ground_truth = open('/kaggle/input/keystroke/ground_truths/video_35.txt', 'r').read()
# similarity = similarity_percentage(pred_text, ground_truth)
# similarity

In [None]:
# corrected = "dr team, i hope you all had a fantastic time at our recent employee appreciation event. it was truly a pleasure to see everyone come together and enjoy the festivities. your enthusiasm and positive energy made the event a resounding success. we wanted to take a moment to extend our heartfelt gratitude to each of you for your hard work and dedication. events like these are a small token of appreciation for the incredible effort you put into your roles every day. we look forward to carrying forward this spirit of camaraderie and teamwork. your contributions are invaluable to our company's success, and we look forward to achieving even greater milestones together once again. thank you for being an integral part of our team. here's to continued success and many more memorable moments. best regards."
# similarity = similarity_percentage(corrected, ground_truth)
# similarity