In [1]:
!pip install lightning torchvision

Collecting lightning
  Downloading lightning-2.3.3-py3-none-any.whl.metadata (35 kB)
Downloading lightning-2.3.3-py3-none-any.whl (808 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m808.5/808.5 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning
Successfully installed lightning-2.3.3


In [1]:
import torch
import torch.nn as nn
import lightning as L
import pandas as pd
import torchvision
import numpy as np
import torch.nn.functional as F  
from torch.autograd import Variable
import math
from functools import partial
import pathlib
from torch.utils.data import DataLoader, Dataset
from torchvision.io import read_video
import lightning as L
from lightning.pytorch.loggers import CSVLogger
import torchmetrics
from lightning.pytorch.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from torchvision.transforms import CenterCrop, v2
from datetime import datetime
import csv
import glob
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
id2Label = ['[i]', 'BackSpace', ',', '[s]', '.', 
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 
            'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 
            'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 
            'y', 'z']
label2Id  = {label: i for i, label in enumerate(id2Label)}

NUM_WORKERS = 4
f_before = 3 # number of frames before
f_after = 4 # number of frames after
gap = 2 # gap between idle video segment and non-idle video segment
total_window = f_after + f_before + 1

# Resnet

In [3]:
#### RESNET 3D #### 
def conv3x3x3(in_planes, out_planes, stride=1):
    # 3x3x3 convolution with padding
    return nn.Conv3d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)


def downsample_basic_block(x, planes, stride):
    out = F.avg_pool3d(x, kernel_size=1, stride=stride)
    zero_pads = torch.Tensor(out.size(0), planes - out.size(1), out.size(2), out.size(3), out.size(4) ).zero_()
    
    if isinstance(out.data, torch.cuda.FloatStorage): zero_pads = zero_pads.cuda()
    out = Variable(torch.cat([out.data, zero_pads], dim=1))
    return out

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm3d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3x3(planes, planes)
        self.bn2 = nn.BatchNorm3d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm3d(planes)

        self.conv2 = nn.Conv3d(
            planes, planes, 
            kernel_size=3, stride=stride, padding=1, 
            bias=False)
        self.bn2 = nn.BatchNorm3d(planes)
        
        self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm3d(planes * 4)
        
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet(nn.Module):
    def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400):
        """
        block: basic block or bottle neck
        layers: define Resnet architecture 34, 101, 152 etc
        sample size: image size
        shortcut_type: 'A' or 'B'
        num_classes: ...
        """
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), padding=(3, 3, 3), bias=False)
        self.bn1 = nn.BatchNorm3d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type)
        self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2)
        
        
        last_duration = int(math.ceil(sample_duration / 16))
        last_size = int(math.ceil(sample_size / 32))
        self.avgpool = nn.AvgPool3d(
            (last_duration, last_size, last_size), stride=1)
        
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')
            elif isinstance(m, nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, shortcut_type, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            if shortcut_type == 'A':
                downsample = partial(
                    downsample_basic_block,
                    planes=planes * block.expansion,
                    stride=stride)
            else:
                downsample = nn.Sequential(
                    nn.Conv3d(self.inplanes,planes * block.expansion,kernel_size=1,stride=stride,bias=False), 
                    nn.BatchNorm3d(planes * block.expansion))

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)

        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

def resnet10(**kwargs): return ResNet(BasicBlock, [1, 1, 1, 1], **kwargs)
def resnet18(**kwargs): return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
def resnet34(**kwargs): return ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
def resnet50(**kwargs): return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
def resnet101(**kwargs): return ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
def resnet152(**kwargs): return ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
def resnet200(**kwargs): return ResNet(Bottleneck, [3, 24, 36, 3], **kwargs)

# Lightning dataset

In [7]:
class KeyDataset(torch.utils.data.Dataset):
    def __init__(self, video_name, labels_dir, videos_dir):
        segments = []
        # Infer idle frames.
        self.labels_dir = labels_dir
        self.videos_dir = videos_dir
        df = pd.read_csv(f'{self.labels_dir}/{video_name}.csv')
        for index, row in df.iterrows():
            key_frame = int(row['Frame'])  # Frame number where key was pressed
            key_value = row['Key']  # Key pressed
            if key_value not in id2Label:
                key_value = '[s]'
            
            is_idle_before = False
            if index == 0:
                pos_start = max(key_frame - f_before, 0)
                pos_end = key_frame + f_after
                neg_start = 0
                neg_end = pos_start - gap
                is_idle_before = True
            else:
                prev_key_frame = df.iloc[index - 1]['Frame']
                pos_start = max(key_frame - f_before, 0)
                pos_end = key_frame + f_after
                prev_pos_end = prev_key_frame + f_after
                if (pos_start - prev_pos_end) - 1 >= (f_after + f_before + 1 + gap * 2):
                    neg_start =  prev_pos_end + gap
                    neg_end = pos_start - gap
                    is_idle_before = True
            
            
            # Negative class video segments before
            if is_idle_before:
                j = neg_start
                while (j + total_window - 1) <= neg_end:
                    segments.append(([j, j + total_window - 1], "[i]"))
                    j += total_window
            
            # Current video with keystroke
            segments.append(([pos_start, pos_end], key_value))
        
        self.video_name = video_name
        self.segments = segments
    def __len__(self):
        return len(self.segments)

    def __getitem__(self, idx):
        (start, end), label = self.segments[idx]
        
        frames = []
        for i in range(start, end + 1):
            image = torchvision.io.read_image(f"{self.videos_dir}/{self.video_name}/frame_{i}.jpg")
            
            frames.append(image)
       
        return torch.stack(frames) / 255.0, label2Id[label]
    
    def get_class_counts(self):
        labels = [segment[1] for segment in self.segments]
        unique_elements, counts = np.unique(labels, return_counts=True)
        occurrences = dict(zip(unique_elements, counts))
        weights = np.zeros(len(id2Label))
        for label, count in occurrences.items():
            weights[label2Id[label]] = count
        return weights

class KeyDataModule(L.LightningDataModule):
    def __init__(self, batch_size, labels_dir, videos_dir, train_vids, val_vids, test_vids):
        super().__init__()
        self.batch_size = batch_size
        self.train_datasets = [KeyDataset(video_name, labels_dir, videos_dir) for video_name in train_vids]
        self.val_datasets = [KeyDataset(video_name, labels_dir, videos_dir) for video_name in val_vids]
        self.test_datasets = [KeyDataset(video_name, labels_dir, videos_dir) for video_name in test_vids]
        
        self.train_dataset = torch.utils.data.ConcatDataset(self.train_datasets)
        self.test_dataset = torch.utils.data.ConcatDataset(self.test_datasets)
        self.val_dataset = torch.utils.data.ConcatDataset(self.val_datasets)
        
        
        
        print(f"Train: {len(self.train_dataset)}; Val: {len(self.val_dataset)}; Test: {len(self.test_dataset)}")
        
        train_counts = np.array([d.get_class_counts() for d in self.train_datasets]).sum(axis=0)
        test_counts = np.array([d.get_class_counts() for d in self.test_datasets]).sum(axis=0)
    
        
        val_counts = np.array([d.get_class_counts() for d in self.val_datasets]).sum(axis=0)
        
        class_counts = pd.DataFrame({'letters': id2Label, 'train': train_counts, 'test': test_counts, 'val': val_counts})
        print('class_counts: ', class_counts)

        train_total_samples = np.array([len(d) for d in self.train_datasets]).sum(axis=0)
        self.train_weights = train_counts / (train_total_samples * len(id2Label))
                                        
    def train_dataloader(self):
        return DataLoader(self.train_dataset, 
                          batch_size=self.batch_size, 
                          persistent_workers=True,
                          num_workers=NUM_WORKERS)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset, 
                          batch_size=self.batch_size, 
                          num_workers=NUM_WORKERS,
                          persistent_workers=True,
                          shuffle=False)
    
    def test_dataloader(self):
        return DataLoader(self.test_dataset, 
                          batch_size=self.batch_size, 
                          num_workers=NUM_WORKERS,
                          persistent_workers=True,
                          shuffle=False)

class KeyClf(L.LightningModule):
    def __init__(self, img_size, num_classes, learning_rate, weights):
        super().__init__()
        self.model = resnet101(sample_size=img_size, 
                               sample_duration=total_window,
                               shortcut_type='B', 
                               num_classes=num_classes)
        
        self.loss_fn = torch.nn.CrossEntropyLoss(torch.tensor(weights).float())
        self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
        self.lr = learning_rate
        self.transforms = v2.Compose([
            v2.CenterCrop(img_size),
            v2.ToDtype(torch.float32, scale=True),
        ])
        
        self.test_preds = []
        self.test_targets = []
        self.save_hyperparameters()


    def test_step(self, batch):
        videos, targets = batch
        videos = self.transforms(videos)
        videos = videos.permute(0, 2, 1, 3, 4)
        preds = self.model(videos)

        pred_ids = torch.argmax(self.model(videos), dim=1)
        pred_labels = [id2Label[_id] for _id in pred_ids]
        self.test_preds += pred_labels
        self.test_targets += [id2Label[_id] for _id in targets]
        
        loss = self.loss_fn(preds, targets.long())
        self.log_dict({'test_loss': loss, 'test_acc': self.accuracy(preds, targets)})
       
    
    def on_test_end(self):
        print(classification_report(self.test_targets, self.test_preds))
        
    def training_step(self, batch):
        videos, targets = batch
        videos = self.transforms(videos)
        videos = videos.permute(0, 2, 1, 3, 4)
        preds = self.model(videos)
        loss = self.loss_fn(preds, targets.long())
        self.log_dict({'train_loss': loss, 'train_acc': self.accuracy(preds, targets)})
        return loss

    def validation_step(self, batch):
        videos, targets = batch
        videos = self.transforms(videos)
        videos = videos.permute(0, 2, 1, 3, 4)
        preds = self.model(videos)
        loss = self.loss_fn(preds, targets.long())
        self.log_dict({'val_loss': loss, 'val_acc': self.accuracy(preds, targets)})
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters(), lr=self.lr)

In [6]:
test_video_1 = KeyDataset('video_1', 
                          labels_dir='/kaggle/input/keystroke/labels',
                           videos_dir="/kaggle/input/keystroke/raw_frames_320")

## Train

In [8]:
dm = KeyDataModule(batch_size=16, 
                   labels_dir='datasets/labels',
                   videos_dir="datasets/raw_frames_320",
                   train_vids=[
                       'video_1', 'video_2', 'video_3', 'video_4', 'video_5', 
                        'video_6', 'video_7', 'video_8', 'video_9', 'video_10',
                       'video_11', 'video_12', 'video_13', 'video_14', 'video_15', 
                       'video_16', 'video_17', 'video_18', 'video_19',
                       'video_21', 'video_22', 'video_23', 'video_24', 'video_25', 
                       'video_26', 'video_27', 'video_28', 'video_29', 'video_30'], 
                   val_vids=['video_31', 'video_32', 'video_33'],
                   test_vids=['video_34','video_35', 'video_36', 'video_37'])

# module = KeyClf(img_size=320, num_classes=len(id2Label), learning_rate=0.001, weights=dm.train_weights)

# logger = CSVLogger("logs", name=f"resnet_101_320_8", flush_logs_every_n_steps=1)
# trainer = L.Trainer(
#     # deterministic=True,
#     devices=[0, 1],
#     accelerator='gpu',
#     fast_dev_run=False,
#     logger=logger,
#     callbacks=[EarlyStopping(monitor="val_loss", patience=5)]
# )
# trainer.fit(module, dm, ckpt_path="/kaggle/input/keyclf/pytorch/resne101-f8/1/resnet101-f8-epoch9-step19270.ckpt")


Train: 61653; Val: 5775; Test: 4963
class_counts:        letters    train   test     val
0         [i]  13424.0  709.0  1051.0
1   BackSpace   4182.0  367.0   429.0
2           ,    532.0   25.0    86.0
3         [s]   7038.0  634.0   700.0
4           .    376.0   48.0    71.0
5           a   2938.0  247.0   251.0
6           b    590.0   38.0    72.0
7           c   1222.0  114.0   123.0
8           d   1179.0  111.0   111.0
9           e   4348.0  391.0   329.0
10          f    682.0   60.0    85.0
11          g    734.0   63.0   108.0
12          h   1228.0   92.0    98.0
13          i   2825.0  260.0   272.0
14          j    407.0   20.0    64.0
15          k    363.0   36.0    74.0
16          l   1573.0  114.0   128.0
17          m    850.0   91.0   109.0
18          n   2352.0  203.0   183.0
19          o   2539.0  245.0   193.0
20          p    917.0   78.0   106.0
21          q    346.0   20.0    68.0
22          r   2248.0  221.0   174.0
23          s   1843.0  165.0   162.0

In [8]:
trainer.test(module, dm)

INFO: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
INFO: Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
INFO: ----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

INFO: LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:215: Using `DistributedSampler` with the dataloaders. During `trainer.test()`, it is recommended to use `Trainer(devices=1, num_nodes=1)` to ensure each sample/batch gets evaluated exactly once. Otherwise, multi-device settings use `DistributedSampler` that replicates some samples to make sure all devices have same batch size in case of uneven inputs.


Testing: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('test_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('test_acc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ,       1.00      0.75      0.86         4
           .       1.00      0.08      0.15        12
   BackSpace       0.95      0.98      0.96       140
         [i]       0.79      0.88      0.83       215
         [s]       0.78      0.98      0.87       264
           a       0.93      0.88      0.90        96
           b       1.00      0.82      0.90        11
           c       0.91      0.91      0.91        45
           d       0.97      0.86      0.91        37
           e       0.85      0.94      0.89       156
           f       1.00      0.57      0.73        14
           g       1.00      0.37      0.54        27
           h       0.96      0.69      0.81        39
           i       0.85      0.95      0.90       110
           j       0.00      0.00      0.00         2
           k       1.00      0.69      0.82        13
           l       1.00      0.89      0.94        46
           m       0.79    

              precision    recall  f1-score   support

           ,       0.88      0.64      0.74        11
           .       1.00      0.20      0.33        20
   BackSpace       0.96      0.99      0.98       141
         [i]       0.80      0.93      0.86       227
         [s]       0.75      0.97      0.85       213
           a       0.92      0.90      0.91       111
           b       1.00      0.56      0.71         9
           c       0.95      0.91      0.93        46
           d       0.98      0.96      0.97        46
           e       0.93      0.92      0.93       155
           f       1.00      0.52      0.68        27
           g       0.90      0.47      0.62        19
           h       0.96      0.77      0.86        31
           i       0.88      0.96      0.92        92
           j       1.00      0.50      0.67         2
           k       1.00      0.62      0.77         8
           l       0.97      0.84      0.90        43
           m       0.78    

[{'test_loss': 0.31299757957458496, 'test_acc': 0.8768898248672485}]

## Resume training state

In [9]:
# dm = KeyDataModule(batch_size=8, 
#                    labels_dir='/kaggle/input/keystroke/labels',
#                    videos_dir="/kaggle/input/keystroke/raw_frames_320",
#                    train_vids=[
#                        'video_1', 'video_2', 'video_3', 'video_4', 'video_5', 
#                         'video_6', 'video_7', 'video_8', 'video_9', 'video_10',
#                        'video_11', 'video_12', 'video_13', 'video_14', 'video_15', 
#                        'video_16', 'video_17', 'video_18', 'video_19',
#                        'video_21', 'video_22', 'video_23', 'video_24', 'video_25', 
#                        'video_26', 'video_27', 'video_28', 'video_29', 'video_30'], 
#                    val_vids=['video_31', 'video_32', 'video_33'], 
#                    test_vids=['video_34','video_35', 'video_36'])
# model = KeyClf.load_from_checkpoint("/kaggle/input/key-clf-smaller/pytorch/v1/1/smaller-epoch10-step53350.ckpt")
# trainer = L.Trainer(
#     # deterministic=True,
#     accelerator="tpu",
#     fast_dev_run=False,
#     max_time="00:11:00:00",
#     callbacks=[EarlyStopping(monitor="val_loss", patience=10)]
# )
# trainer.fit(model, dm)
# trainer.test(model, dm)

## Test

In [10]:
# pred_text = 'dear team, i hope this email finds you well. i wanted to provide you with an update on the ongoing it infrastructure upgrades. the project is progressing according to schedule, with significant milestones achieved in recent weeks. our focus has been on enhancing network security measures and improving system reliability. the it team has worked tirelessly to minimize disruption during the implementation phase. we appreciate your patience and cooperation during this period of transition. moving forward, we will continue to monitor the performance enhancements closely. please remain vigilant for any potential issues and report them promptly to the it helpdesk. your feedback is invaluable as we strive to create a more robust and efficient technology environment. thank you for your continued support and commitment to our shared goals. together, we are ensuring our it infrastructure meets the demands of our growing business. best regards, [name]'
# ground_truth = open('/kaggle/input/keystroke/ground_truths/video_35.txt', 'r').read()
# similarity = similarity_percentage(pred_text, ground_truth)
# similarity

In [11]:
# corrected = "dr team, i hope you all had a fantastic time at our recent employee appreciation event. it was truly a pleasure to see everyone come together and enjoy the festivities. your enthusiasm and positive energy made the event a resounding success. we wanted to take a moment to extend our heartfelt gratitude to each of you for your hard work and dedication. events like these are a small token of appreciation for the incredible effort you put into your roles every day. we look forward to carrying forward this spirit of camaraderie and teamwork. your contributions are invaluable to our company's success, and we look forward to achieving even greater milestones together once again. thank you for being an integral part of our team. here's to continued success and many more memorable moments. best regards."
# similarity = similarity_percentage(corrected, ground_truth)
# similarity