In [1]:
!pip install lightning torchvision

Collecting lightning
  Downloading lightning-2.3.3-py3-none-any.whl.metadata (35 kB)
Downloading lightning-2.3.3-py3-none-any.whl (808 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m808.5/808.5 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: lightning
Successfully installed lightning-2.3.3


In [2]:
import torch
import torch.nn as nn
import lightning as L
import pandas as pd
import torchvision
import numpy as np
import torch.nn.functional as F  
from torch.autograd import Variable
import math
from functools import partial
import pathlib
from torch.utils.data import DataLoader, Dataset
from torchvision.io import read_video
import lightning as L
from lightning.pytorch.loggers import CSVLogger
import torchmetrics
from lightning.pytorch.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from torchvision.transforms import CenterCrop, v2
from datetime import datetime
import csv
import glob
import os

In [3]:
id2Label = ['[i]', 'BackSpace', ',', '[s]', '.', 
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 
            'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 
            'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 
            'y', 'z']
label2Id  = {label: i for i, label in enumerate(id2Label)}

NUM_WORKERS = 4
f_after = 2 # number of frames after
f_before = 2 # number of frames before
gap = 2 # gap between idle video segment and non-idle video segment
total_window = f_after + f_before + 1

# Resnet

In [4]:
#### RESNET 3D #### 
def conv3x3x3(in_planes, out_planes, stride=1):
    # 3x3x3 convolution with padding
    return nn.Conv3d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)


def downsample_basic_block(x, planes, stride):
    out = F.avg_pool3d(x, kernel_size=1, stride=stride)
    zero_pads = torch.Tensor(out.size(0), planes - out.size(1), out.size(2), out.size(3), out.size(4) ).zero_()
    
    if isinstance(out.data, torch.cuda.FloatStorage): zero_pads = zero_pads.cuda()
    out = Variable(torch.cat([out.data, zero_pads], dim=1))
    return out

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm3d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3x3(planes, planes)
        self.bn2 = nn.BatchNorm3d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm3d(planes)

        self.conv2 = nn.Conv3d(
            planes, planes, 
            kernel_size=3, stride=stride, padding=1, 
            bias=False)
        self.bn2 = nn.BatchNorm3d(planes)
        
        self.conv3 = nn.Conv3d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm3d(planes * 4)
        
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class ResNet(nn.Module):
    def __init__(self, block, layers, sample_size, sample_duration, shortcut_type='B', num_classes=400):
        """
        block: basic block or bottle neck
        layers: define Resnet architecture 34, 101, 152 etc
        sample size: image size
        shortcut_type: 'A' or 'B'
        num_classes: ...
        """
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.conv1 = nn.Conv3d(3, 64, kernel_size=7, stride=(1, 2, 2), padding=(3, 3, 3), bias=False)
        self.bn1 = nn.BatchNorm3d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0], shortcut_type)
        self.layer2 = self._make_layer(block, 128, layers[1], shortcut_type, stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], shortcut_type, stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], shortcut_type, stride=2)
        
        
        last_duration = int(math.ceil(sample_duration / 16))
        last_size = int(math.ceil(sample_size / 32))
        self.avgpool = nn.AvgPool3d(
            (last_duration, last_size, last_size), stride=1)
        
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv3d):
                m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')
            elif isinstance(m, nn.BatchNorm3d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, shortcut_type, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            if shortcut_type == 'A':
                downsample = partial(
                    downsample_basic_block,
                    planes=planes * block.expansion,
                    stride=stride)
            else:
                downsample = nn.Sequential(
                    nn.Conv3d(self.inplanes,planes * block.expansion,kernel_size=1,stride=stride,bias=False), 
                    nn.BatchNorm3d(planes * block.expansion))

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)

        x = x.view(x.size(0), -1)
        x = self.fc(x)

        return x

def resnet10(**kwargs): return ResNet(BasicBlock, [1, 1, 1, 1], **kwargs)
def resnet18(**kwargs): return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)
def resnet34(**kwargs): return ResNet(BasicBlock, [3, 4, 6, 3], **kwargs)
def resnet50(**kwargs): return ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
def resnet101(**kwargs): return ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
def resnet152(**kwargs): return ResNet(Bottleneck, [3, 8, 36, 3], **kwargs)
def resnet200(**kwargs): return ResNet(Bottleneck, [3, 24, 36, 3], **kwargs)

# Lightning dataset

In [5]:
class KeyDataset(torch.utils.data.Dataset):
    def __init__(self, video_name, labels_dir, videos_dir):
        segments = []
        # Infer idle frames.
        self.labels_dir = labels_dir
        self.videos_dir = videos_dir
        df = pd.read_csv(f'{self.labels_dir}/{video_name}.csv')
        for index, row in df.iterrows():
            key_frame = int(row['Frame'])  # Frame number where key was pressed
            key_value = row['Key']  # Key pressed
            if key_value not in id2Label:
                key_value = '[s]'
            
            is_idle_before = False
            if index == 0:
                pos_start = max(key_frame - f_before, 0)
                pos_end = key_frame + f_after
                neg_start = 0
                neg_end = pos_start - gap
                is_idle_before = True
            else:
                prev_key_frame = df.iloc[index - 1]['Frame']
                pos_start = max(key_frame - f_before, 0)
                pos_end = key_frame + f_after
                prev_pos_end = prev_key_frame + f_after
                if (pos_start - prev_pos_end) - 1 >= (f_after + f_before + 1 + gap * 2):
                    neg_start =  prev_pos_end + gap
                    neg_end = pos_start - gap
                    is_idle_before = True
            
            
            # Negative class video segments before
            if is_idle_before:
                j = neg_start
                while (j + total_window - 1) <= neg_end:
                    segments.append(([j, j + total_window - 1], "[i]"))
                    j += total_window
            
            # Current video with keystroke
            segments.append(([pos_start, pos_end], key_value))
        
        self.video_name = video_name
        self.segments = segments
    def __len__(self):
        return len(self.segments)

    def __getitem__(self, idx):
        (start, end), label = self.segments[idx]
        
        frames = []
        for i in range(start, end + 1):
            image = torchvision.io.read_image(f"{self.videos_dir}/{self.video_name}/frame_{i}.jpg")
            
            frames.append(image)
       
        return torch.stack(frames), label2Id[label]
    
    def get_class_counts(self):
        labels = [segment[1] for segment in self.segments]
        unique_elements, counts = np.unique(labels, return_counts=True)
        occurrences = dict(zip(unique_elements, counts))
        weights = np.zeros(len(id2Label))
        for label, count in occurrences.items():
            weights[label2Id[label]] = count
        return weights

class KeyDataModule(L.LightningDataModule):
    def __init__(self, batch_size, labels_dir, videos_dir, train_vids, val_vids, test_vids):
        super().__init__()
        self.batch_size = batch_size
        self.train_datasets = [KeyDataset(video_name, labels_dir, videos_dir) for video_name in train_vids]
        self.val_datasets = [KeyDataset(video_name, labels_dir, videos_dir) for video_name in val_vids]
        self.test_datasets = [KeyDataset(video_name, labels_dir, videos_dir) for video_name in test_vids]
        
        self.train_dataset = torch.utils.data.ConcatDataset(self.train_datasets)
        self.test_dataset = torch.utils.data.ConcatDataset(self.test_datasets)
        self.val_dataset = torch.utils.data.ConcatDataset(self.val_datasets)
        
        
        
        print(f"Train: {len(self.train_dataset)}; Val: {len(self.val_dataset)}; Test: {len(self.test_dataset)}")
        
        train_counts = np.array(
            [d.get_class_counts() for d in self.train_datasets]).sum(axis=0)
        print(f"Train counts: {train_counts}")
        train_total_samples = np.array([len(d) for d in self.train_datasets]).sum(axis=0)
        self.train_weights = train_counts / (train_total_samples * len(id2Label))
                                        
    def train_dataloader(self):
        return DataLoader(self.train_dataset, 
                          batch_size=self.batch_size, 
                          num_workers=NUM_WORKERS)
    
    def val_dataloader(self):
        return DataLoader(self.val_dataset, 
                          batch_size=self.batch_size, 
                          num_workers=NUM_WORKERS,
                          shuffle=False)
    
    def test_dataloader(self):
        return DataLoader(self.test_dataset, 
                          batch_size=self.batch_size, 
                          num_workers=NUM_WORKERS,
                          shuffle=False)

class KeyClf(L.LightningModule):
    def __init__(self, img_size, num_classes, learning_rate, weights):
        super().__init__()
        self.model = resnet34(sample_size=img_size, 
                               sample_duration=total_window,
                               shortcut_type='B', 
                               num_classes=num_classes)
        
        self.loss_fn = torch.nn.CrossEntropyLoss(torch.tensor(weights).float())
        self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)
        self.lr = learning_rate
        self.transforms = v2.Compose([
            v2.CenterCrop(img_size),
            v2.ToDtype(torch.float32, scale=True),
        ])
        
        self.test_preds = []
        self.test_targets = []
        self.save_hyperparameters()


    def test_step(self, batch):
        videos, targets = batch
        videos = self.transforms(videos)
        videos = videos.permute(0, 2, 1, 3, 4)
        preds = self.model(videos)

        pred_ids = torch.argmax(self.model(videos), dim=1).squeeze()
        pred_labels = [id2Label[_id] for _id in pred_ids]
        self.test_preds += pred_labels
        self.test_targets += [id2Label[_id] for _id in targets]
        
        loss = self.loss_fn(preds, targets.long())
        self.log('test_loss', loss)
        self.log('test_acc', self.accuracy(preds, targets))
    
    def on_test_end(self):
        print(classification_report(self.test_targets, self.test_preds))
        
    def training_step(self, batch):
        videos, targets = batch
        videos = self.transforms(videos)
        videos = videos.permute(0, 2, 1, 3, 4)
        preds = self.model(videos)
        loss = self.loss_fn(preds, targets.long())
        self.log('train_loss', loss)
        self.log('train_acc', self.accuracy(preds, targets))
        return loss

    def validation_step(self, batch):
        videos, targets = batch
        videos = self.transforms(videos)
        videos = videos.permute(0, 2, 1, 3, 4)
        preds = self.model(videos)
        loss = self.loss_fn(preds, targets.long())
        self.log('val_loss', loss)
        self.log('val_acc', self.accuracy(preds, targets))
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.model.parameters(), lr=self.lr)

In [6]:
test_video_1 = KeyDataset('video_1', 
                          labels_dir='/kaggle/input/keystroke/labels',
                           videos_dir="/kaggle/input/keystroke/raw_frames_320")

## Train

In [7]:
# dm = KeyDataModule(batch_size=8, 
#                    labels_dir='/kaggle/input/keystroke/labels',
#                    videos_dir="/kaggle/input/keystroke/raw_frames_320",
#                    train_vids=[
#                        'video_1', 'video_2', 'video_3', 'video_4', 'video_5', 
#                         'video_6', 'video_7', 'video_8', 'video_9', 'video_10',
#                        'video_11', 'video_12', 'video_13', 'video_14', 'video_15', 
#                        'video_16', 'video_17', 'video_18', 'video_19',
#                        'video_21', 'video_22', 'video_23', 'video_24', 'video_25', 
#                        'video_26', 'video_27', 'video_28', 'video_29', 'video_30'], 
#                    val_vids=['video_31', 'video_32', 'video_33'], 
#                    test_vids=['video_34','video_35', 'video_36'])
# model = KeyClf(img_size=320, num_classes=len(id2Label), learning_rate=0.001, weights=dm.train_weights)
# trainer = L.Trainer(
#     # deterministic=True,
#     devices=[0, 1],
#     accelerator="gpu",
#     fast_dev_run=False,
#     max_time="00:05:00:00",
#     callbacks=[EarlyStopping(monitor="val_loss", patience=5)],
# )
# trainer.fit(model, dm)
# trainer.test(model, dm)

## Resume training state

In [8]:
dm = KeyDataModule(batch_size=8, 
                   labels_dir='/kaggle/input/keystroke/labels',
                   videos_dir="/kaggle/input/keystroke/raw_frames_320",
                   train_vids=[
                       'video_1', 'video_2', 'video_3', 'video_4', 'video_5', 
                        'video_6', 'video_7', 'video_8', 'video_9', 'video_10',
                       'video_11', 'video_12', 'video_13', 'video_14', 'video_15', 
                       'video_16', 'video_17', 'video_18', 'video_19',
                       'video_21', 'video_22', 'video_23', 'video_24', 'video_25', 
                       'video_26', 'video_27', 'video_28', 'video_29', 'video_30'], 
                   val_vids=['video_31', 'video_32', 'video_33'], 
                   test_vids=['video_34','video_35', 'video_36'])
model = KeyClf(img_size=320, num_classes=len(id2Label), learning_rate=0.001, weights=dm.train_weights)
trainer = L.Trainer(
    # deterministic=True,
    devices=[0, 1],
    accelerator="gpu",
    fast_dev_run=False,
    max_time="00:05:00:00",
    callbacks=[EarlyStopping(monitor="val_loss", patience=10)],
)
trainer.fit(model, dm,  ckpt_path = "/kaggle/input/key-clf-smaller/pytorch/v1/1/smaller-epoch10-step53350.ckpt")
trainer.test(model, dm)



Train: 77589; Val: 7168; Test: 4377
Train counts: [29360.  4182.   532.  7038.   376.  2938.   590.  1222.  1179.  4348.
   682.   734.  1228.  2825.   407.   363.  1573.   850.  2352.  2539.
   917.   346.  2248.  1843.  2776.  1417.   405.   587.   497.   779.
   456.]


  m.weight = nn.init.kaiming_normal(m.weight, mode='fan_out')
INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
INFO: Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
INFO: ----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

2024-07-10 05:27:52.852553: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-10 05:27:52.852690: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT wh

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

  self.pid = os.fork()
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('val_acc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
INFO: Initializing distributed: GLOBAL_RANK: 0, MEMBER: 1/2
INFO: Initializing distributed: GLOBAL_RANK: 1, MEMBER: 2/2
INFO: ----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

2024-07-10 05:28:18.

Testing: |          | 0/? [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ,       1.00      0.14      0.25         7
           .       0.00      0.00      0.00        11
   BackSpace       0.99      0.58      0.73       147
         [i]       0.53      0.98      0.69       549
         [s]       0.97      0.92      0.95       246
           a       0.97      0.36      0.52       109
           b       0.67      0.18      0.29        11
           c       0.89      0.82      0.86        51
           d       0.94      0.82      0.88        40
           e       0.96      0.54      0.69       148
           f       1.00      0.22      0.36        18
           g       0.95      0.81      0.88        26
           h       0.96      0.63      0.76        38
           i       1.00      0.69      0.82       113
           j       0.00      0.00      0.00         2
           k       0.67      0.25      0.36         8
           l       1.00      0.26      0.42        42
           m       0.94    

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('test_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/logger_connector/result.py:439: It is recommended to use `self.log('test_acc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           ,       1.00      0.38      0.55         8
           .       1.00      0.10      0.17        21
   BackSpace       0.98      0.63      0.76       134
         [i]       0.53      0.98      0.69       567
         [s]       0.96      0.94      0.95       231
           a       1.00      0.22      0.37        98
           b       0.67      0.22      0.33         9
           c       0.91      0.80      0.85        40
           d       0.97      0.88      0.93        43
           e       0.96      0.49      0.65       163
           f       1.00      0.39      0.56        23
           g       0.88      0.70      0.78        20
           h       0.94      0.53      0.68        32
           i       0.93      0.78      0.85        89
           j       1.00      0.50      0.67         2
           k       1.00      0.08      0.14        13
           l       0.94      0.32      0.48        47
           m       0.86    

[{'test_loss': 0.33599114418029785, 'test_acc': 0.7446322441101074}]

## Test

In [9]:
# device = torch.device('cuda')
# trained_model = KeyClf.load_from_checkpoint("/kaggle/input/keyclf/pytorch/v1/1/epoch7-step34979.ckpt")

In [10]:
# video = 'video_36'
# images = glob.glob(f'/kaggle/input/keycls-test/test_data/raw_frames/{video}/*.jpg')
# img = cv2.imread(os.path.join(input_dir, jpg_files[0]))
# height, width, _ = img.shape

# output_file = f'./{video}.mp4'
# # Define the codec and create VideoWriter object
# fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for MP4 format
# out = cv2.VideoWriter(output_file, fourcc, framerate=25, (width, height))

# # Iterate through JPG files and write to video
# for i in range(len(images)):
#     img = cv2.imread(''/kaggle/input/keycls-test/test_data/raw_frames/{video}/frame_{i}.jpg')
#     out.write(img)

# # Release VideoWriter and close all windows
# out.release()
# cv2.destroyAllWindows()

In [11]:
# preds = []

# is_gpu = torch.cuda.is_available()

# if is_gpu: 
#     trained_model.to(device)

# trained_model.freeze()
# i = 0
# recording = True
# window = []

# while recording:
#     # Less than 8 flen(window)rames => continue to collect frames....
#     if len(window) < total_window:
#         image = torchvision.io.read_image(f"/kaggle/input/keycls-test/test_data/raw_frames/{video}/frame_{i}.jpg")
#         window.append(image)
#     if len(window) == total_window:
#         frames = torch.stack(window)
#         frames = trained_model.transforms(frames)
#         frames = frames.permute(1, 0, 2, 3)
        
#         if is_gpu: frames.to(device)
#         out = F.softmax(trained_model.model(frames.unsqueeze(0)))[0]
#         _id = torch.argmax(out)
#         label = id2Label[_id]
#         print(f"{i - total_window - 1};{label};{out[_id]}")
    
#         image = torchvision.io.read_image(f"/kaggle/input/keycls-test/test_data/raw_frames/{video}/frame_{i}.jpg")
#         window.append(image)
#         window = window[1:]
    
#     i += 1
#     if i == len(images):
#         recording = False