# Setup

In [None]:
# Authenticate the Colab runtime with the user's Google account.
# (Presumably required so the gcloud/gsutil/gcsfuse commands below can reach
# the bucket -- confirm against the bucket's access settings.)
from google.colab import auth
auth.authenticate_user()

In [None]:
# Select the GCP project used by the following gcloud commands.
project_id = 'stairnet'
!gcloud config set project {project_id}

In [None]:
# Test to see if dataset location is correct
! gsutil ls -al gs://stairnet_bucket/StairNet

In [None]:
# Install gcsfuse from Google's apt repository.
# NOTE(review): the repository name is pinned to 'gcsfuse-bionic'; confirm it
# matches the Ubuntu release of the current runtime.
!echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
!apt -qq update
!apt -qq install gcsfuse

In [None]:
# Mount the bucket at ./data so the dataset can be read through regular file
# paths (e.g. data/StairNet).
# NOTE(review): `mkdir` reports an error if ./data already exists (re-run).
!mkdir data
!gcsfuse --implicit-dirs stairnet_bucket data

# Dataset preparation

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from PIL import Image

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
# Root folder of the frame images; the class folders are built below as
# FOLDER_NAME/<class name> for every entry of CLASS_NAMES.
FOLDER_NAME = 'data/StairNet'

# Output folder for the generated sequence files (one sub-folder per video
# is created inside it by create_video2frame_dict).
SEQUENCE_FOLDER = 'drive/MyDrive/StairNet/Sample_Sequences/'

# Names of the classification classes (also the class sub-folder names)
CLASS_NAMES = ['IS', 'ISLG', 'LG', 'LGIS']

# Number of frames in each sequence
SEQ_SIZE = 5

# How to pad a sequence whose first frames precede the start of the video.
# NOTE(review): this constant is not read anywhere in the visible code;
# construct_samples always pads by repeating the first frame.
PADDING_TYPE = 'copy'

In [None]:
def count_files(folder_name):
    '''
        Print the number of entries in every sub-folder of `folder_name`
        (skipping '.DS_Store'), followed by the overall total.
    '''
    total = 0
    for entry in os.listdir(folder_name):
        # macOS metadata file, not a class folder
        if entry == '.DS_Store':
            continue
        n_entries = len(os.listdir(os.path.join(folder_name, entry)))
        print('Number of files in folder {}: {}'.format(entry, n_entries))
        total += n_entries
    print('Total number of files: ', total)


def get_frame_samples(
        class_folder_names,
        filter_ds_store=True):
    '''
        Collect the entry names found in every folder of `class_folder_names`
        into one flat list; '.DS_Store' entries are dropped unless
        `filter_ds_store` is False.
    '''
    collected = []
    for class_dir in class_folder_names:
        collected.extend(os.listdir(class_dir))
    if not filter_ds_store:
        return collected
    return [name for name in collected if name != '.DS_Store']

def get_video_number(file_name):
    '''
        Extract the video identifier from a frame file name.
        Takes the first space-separated token and removes the list-literal
        decoration around it (the "['" prefix, the "']" suffix and any
        remaining single quotes).
        input: ['IMG_#_#'] frame # #CLASS#.jpg
        output: IMG_#_#
    '''
    video_token = file_name.split(' ')[0]
    for decoration in ("['", "']", "'"):
        video_token = video_token.replace(decoration, '')
    return video_token


def get_class(filename):
    '''
        Extract the class label from a frame file name: the last
        space-separated token, cut at its first '.jpg'.
        input: [IMG_#_#] frame # #CLASS#.jpg
        output: #CLASS#
    '''
    last_token = filename.rsplit(' ', 1)[-1]
    label, _, _ = last_token.partition('.jpg')
    return label

def get_frame_number(filename):
    '''
        Extract the frame number from a frame file name: the third
        space-separated token, converted to int.
        input: [IMG_#_#] frame # #CLASS#.jpg
        output: frame #
    '''
    tokens = filename.split(' ')
    frame_token = tokens[2]
    return int(frame_token)


def create_video2frame_dict(video_samples, data_folder, log=False):
    '''
        Group frame file names by the video they belong to.

        A folder named after each video is created inside `data_folder`
        (existing folders are left untouched). Returns a dict mapping each
        video identifier to its frame names ordered by frame number.
    '''
    frames_by_video = {}
    for sample in video_samples:
        frames_by_video[get_video_number(sample)] = list()

    # one output folder per video
    for video_id in frames_by_video:
        video_dir = os.path.join(data_folder, video_id)
        os.makedirs(video_dir, exist_ok=True)
        if log:
            print(f'\t Created video folder: {video_dir}')

    # distribute the frames in ascending frame-number order, so every
    # per-video list ends up sorted
    ordered_frames = list(video_samples)
    ordered_frames.sort(key=get_frame_number)
    for frame in ordered_frames:
        frames_by_video[get_video_number(frame)].append(frame)

    return frames_by_video

def get_video_episodes(video_seq, step=6, frame_number_fn=None):
    '''
        Split a video's frame list into episodes.

        An episode is a maximal run of consecutive entries of `video_seq`
        whose frame numbers increase by exactly `step`; any other gap
        starts a new episode.

        video_seq: frame names of one video, ordered by frame number
        step: expected frame-number distance between neighbouring frames
              (default 6, the stride previously hard-coded here)
        frame_number_fn: callable mapping a frame to its number; defaults
              to get_frame_number
        returns: list of non-empty lists of frames (empty list for empty
              input)

        Unlike the earlier counter-based walk, the last frame of the video
        is kept, a single-frame video yields one episode, and frames whose
        numbers are not multiples of `step` are no longer discarded.
    '''
    if frame_number_fn is None:
        frame_number_fn = get_frame_number

    episodes = list()
    curr_episode = list()
    prev_number = None
    for frame in video_seq:
        number = frame_number_fn(frame)
        # a gap other than `step` closes the running episode
        if prev_number is not None and number - prev_number != step:
            episodes.append(curr_episode)
            curr_episode = list()
        curr_episode.append(frame)
        prev_number = number
    if curr_episode:
        episodes.append(curr_episode)
    return episodes
    

In [None]:
# Print the per-folder and total number of files under the dataset root.
count_files(FOLDER_NAME)

In [None]:
# One folder per class: FOLDER_NAME/<class name>.
class_folder_names = [
    os.path.join(FOLDER_NAME, class_name) for class_name in CLASS_NAMES
]
print('Class folders:\n', class_folder_names)

In [None]:
# Flat list of all frame file names across the class folders.
samples = get_frame_samples(
    class_folder_names=class_folder_names
)
print('Number of samples: ', len(samples))

In [None]:
# Group the frames by video; this also creates one folder per video inside
# SEQUENCE_FOLDER.
video_dict = create_video2frame_dict(
    video_samples=samples, 
    data_folder=SEQUENCE_FOLDER
)
print('Number of videos: ', len(video_dict))

In [None]:
# Bar chart: number of frames per video (saved to video_len_distribution.png).
video_names = video_dict.keys()
video_length = [len(video_dict[name]) for name in video_names]

plt.figure(figsize=(20, 4))
plt.title('Number of frames in each video')
x = np.arange(len(video_names))
plt.bar(x, height=video_length)
plt.xticks(x, video_names, rotation = 45);
plt.savefig('video_len_distribution.png')

In [None]:
# min / max / mean / std of the number of frames per video
np.min(video_length), np.max(video_length), np.mean(video_length), np.std(video_length)

In [None]:
# Split every video's frame list into episodes (see get_video_episodes).
episode_dict = {key: get_video_episodes(value) for key, value in video_dict.items()}

In [None]:
# Bar chart: number of episodes per video (saved to video_episode_num.png).
video_names = episode_dict.keys()
episode_num = [len(episode_dict[name]) for name in video_names]

plt.figure(figsize=(20, 4))
plt.title('Number of episodes in each video')
x = np.arange(len(video_names))
plt.bar(x, height=episode_num)
plt.xticks(x, video_names, rotation = 45);
plt.savefig('video_episode_num.png')

In [None]:
# min / max / mean / std of the number of episodes per video
np.min(episode_num), np.max(episode_num), np.mean(episode_num), np.std(episode_num)

In [None]:
# Flatten the episodes of all videos: one name ('<video>_episode_<index>')
# and one length (number of frames) per episode.
episode_names = list()
episode_len = list()

for vid_name in tqdm(episode_dict.keys()):
  for e_idx, episode in enumerate(episode_dict[vid_name]):
    episode_names.append(f'{vid_name}_episode_{e_idx}')
    episode_len.append(len(episode))

# Bar chart: number of frames per episode (saved to
# episode_len_distribution.png); tick labels are turned off.
plt.figure(figsize=(20, 4))
plt.title('Number of frames in each episodes')
x = np.arange(len(episode_names))
plt.bar(x, height=episode_len)
#plt.xticks(x, episode_names, rotation = 45);
plt.xticks([])
plt.savefig('episode_len_distribution.png')

In [None]:
# min / max / mean / std of the number of frames per episode
np.min(episode_len), np.max(episode_len), np.mean(episode_len), np.std(episode_len)

In [None]:
def construct_samples(video, seq_len):
    '''
        Build one fixed-length sequence per frame of `video`.

        The sample for frame i is the window of the last `seq_len` frames
        ending at frame i. Near the start of the video, where fewer than
        `seq_len` frames are available, the window is left-padded with
        copies of the first frame.

        video: list of frames (one episode)
        seq_len: number of frames per sample, must be >= 1
        returns: list with len(video) samples, each a list of `seq_len`
                 frames (empty list for an empty video)
        raises: ValueError if seq_len < 1
    '''
    if seq_len < 1:
        raise ValueError(f'seq_len must be >= 1, got {seq_len}')
    if not video:
        return list()

    first_frame = video[0]
    samples = list()
    for end in range(1, len(video) + 1):
        # slice only the frames that are needed instead of copying the whole
        # prefix and trimming it afterwards
        window = list(video[max(0, end - seq_len):end])
        padding = [first_frame] * (seq_len - len(window))
        samples.append(padding + window)
    return samples

def generate_seq_dataset(video_dict, data_folder, save_folder, seq_size):
    '''
        Write one text file per sequence sample.

        For every video, every episode is expanded into samples of
        `seq_size` frames (see construct_samples). Each sample is saved as
        <save_folder>/<video name>/sample_<n>.txt, where n restarts at 0 for
        each video, and contains one frame path per line, built as
        <data_folder>/<class>/<frame file name>.

        video_dict: mapping video name -> list of episodes (lists of frame
                    file names)
        data_folder: root folder of the frame images
        save_folder: root folder for the generated sample files
        seq_size: number of frames per sample
        returns: total number of files written across all videos
    '''
    total_written = 0
    # iterate over video in dataset
    for video_name, episodes in tqdm(video_dict.items()):
        video_folder = os.path.join(save_folder, video_name)
        # do not rely on an earlier step having created the folder
        os.makedirs(video_folder, exist_ok=True)

        # per-video index used in the file name
        counter = 0
        # for each episode in video construct samples of length `seq_size`
        for episode in episodes:
            for sample in construct_samples(episode, seq_len=seq_size):
                frame_paths = [
                    os.path.join(data_folder, get_class(el), el) for el in sample
                ]
                # 'w' (the former 'rw' is not a valid mode and raises ValueError)
                with open(os.path.join(video_folder, f'sample_{counter}.txt'), 'w') as f:
                    f.write('\n'.join(frame_paths))
                counter += 1
        total_written += counter
    return total_written

In [None]:
 num_files = generate_seq_dataset(
    video_dict=episode_dict,
    data_folder=FOLDER_NAME,
    save_folder=SEQUENCE_FOLDER,
    seq_size=SEQ_SIZE
)
print('Number of files: ', num_files)

# Train, Val and Test splits

In [None]:
# Load the train / validation / test split definitions from the mounted
# bucket.
train = pd.read_csv(
    '/content/data/StairNet_Split_CSV/StairNet_Train.csv')
val = pd.read_csv(
    '/content/data/StairNet_Split_CSV/StairNet_Validation.csv')
test = pd.read_csv(
    '/content/data/StairNet_Split_CSV/StairNet_Test.csv')

In [None]:
train.shape, val.shape, test.shape

In [None]:
train.head()

In [None]:
# Frame identifiers of each split, taken from the 'filename' column.
# NOTE(review): the routing loop below compares these against bare file names
# (last path component) -- confirm the column holds names, not full paths.
train_frames = train['filename'].values
val_frames = val['filename'].values
test_frames = test['filename'].values

In [None]:
# Destination folders for the sequence files of each split.
TRAIN_FOLDER = 'drive/MyDrive/StairNet/Splits/Train'
VAL_FOLDER = 'drive/MyDrive/StairNet/Splits/Val'
TEST_FOLDER = 'drive/MyDrive/StairNet/Splits/Test'

In [None]:
os.makedirs(TRAIN_FOLDER, exist_ok=True)
os.makedirs(VAL_FOLDER, exist_ok=True)
os.makedirs(TEST_FOLDER, exist_ok=True)

In [None]:
def get_frame_name(filename):
    ''' Return the part of `filename` after its last '/' (the whole string if there is none) '''
    _, _, frame_name = filename.rpartition('/')
    return frame_name

def save_file(new_path, data):
    '''
        Write `data` (an iterable of strings) to `new_path`, one item per
        line, creating the parent folder first when it is missing.

        new_path: destination file path
        data: lines to write; joined with newlines, no trailing newline
    '''
    curr_folder = os.path.dirname(new_path)
    # a bare file name has no parent folder; os.makedirs('') would raise
    if curr_folder:
        # exist_ok avoids the race between checking and creating the folder
        os.makedirs(curr_folder, exist_ok=True)
    with open(new_path, 'w') as f:
        f.write('\n'.join(data))

In [None]:
# Copy every sequence file into the Train / Val / Test folder whose split
# contains the file name of the sequence's LAST line; files that match no
# split (or are empty) have their path printed instead.
samples = list()

# NOTE(review): the sequences are read from 'Sequences/' while
# SEQUENCE_FOLDER points at 'Sample_Sequences/' -- confirm which is intended.
sequences_root = 'drive/MyDrive/StairNet/Sequences/'

# sets give O(1) membership tests instead of scanning the arrays per file
split_targets = (
    (set(train_frames), TRAIN_FOLDER),
    (set(val_frames), VAL_FOLDER),
    (set(test_frames), TEST_FOLDER),
)

for video_name in os.listdir(sequences_root):
    if video_name == '.DS_Store':
        continue
    for seq in tqdm(os.listdir(os.path.join(sequences_root, video_name))):
        if seq == 'frames.txt':
            continue
        curr_filename = os.path.join(sequences_root, video_name, seq)
        with open(curr_filename, 'r') as f:
            data = f.read().splitlines()
        if not data:
            # an empty file has no last frame to classify; report it like
            # any other unassigned sequence
            print(curr_filename)
            continue
        curr_sample = get_frame_name(data[-1])
        # first matching split wins: train, then val, then test
        for split_frames, split_folder in split_targets:
            if curr_sample in split_frames:
                save_file(os.path.join(split_folder, video_name, seq), data)
                break
        else:
            print(curr_filename)


# (Optional) Numpy sequences

In [None]:
import os 
import numpy as np
import cv2
import time

import multiprocessing
from PIL import Image
from tqdm import tqdm

In [None]:
# Base folder name of the frame images.
# NOTE(review): IMG_FOLDER is not referenced in the visible code below.
IMG_FOLDER = 'data/StairNet/'

#SAMPLE_SPLIT_TRAIN = 'data/StairNet_Seq_5/Splits_Random/Train'
#SAMPLE_SPLIT_VAL = 'data/StairNet_Seq_5/Splits_Random/Val'
#SAMPLE_SPLIT_TEST = 'data/StairNet_Seq_5/Splits_Random/Test'

# Folder holding the sequence text files (one sub-folder per video) that are
# converted to .npy below.
SAMPLE_SEQUENCES = '/content/data/StairNet_Seq_5/Sample_Sequences'


In [None]:
#os.makedirs('data/StairNet_Seq_5/SplitsVideo_numpy/Train')
#os.makedirs('data/StairNet_Seq_5/SplitsVideo_numpy/Val')
#os.makedirs('data/StairNet_Seq_5/SplitsVideo_numpy/Test')

In [None]:
def load_image(filename: str, img_load: str = 'pil', img_size=(256, 256)):
  '''
      Load an image file as an RGB ndarray resized to `img_size`.

      filename: path of the image file
      img_load: backend to use, 'pil' (default) or 'cv2'
      img_size: target size as (width, height)
      returns: ndarray of shape (height, width, 3) in RGB channel order
      raises: ValueError for an unknown `img_load`;
              FileNotFoundError if the cv2 backend cannot read the file
  '''
  if img_load == 'cv2':
    img = cv2.imread(filename)
    # cv2.imread signals failure by returning None instead of raising
    if img is None:
      raise FileNotFoundError(f'Could not read image: {filename}')
    # OpenCV loads BGR; convert so both backends return the same channel order
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # ndarray.resize only reshapes/truncates the buffer; cv2.resize rescales
    return cv2.resize(img, tuple(img_size))
  if img_load == 'pil':
    # the context manager closes the underlying file handle
    with Image.open(filename) as img:
      # force 3 channels so grayscale/RGBA files give the same array shape
      # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter
      resized = img.convert('RGB').resize(tuple(img_size), Image.LANCZOS)
      return np.array(resized)
  raise ValueError(f"Unknown img_load '{img_load}', expected 'pil' or 'cv2'")

def read_sample(sample_seq, img_size):
  '''
      Load a sequence of frames into one ndarray.

      sample_seq: list of image paths
      img_size: side length (pixels) of the square frames
      returns: (buffer, labels) where buffer has shape
               (len(sample_seq), img_size, img_size, 3) and labels holds, for
               each path, the last space-separated token of the file name up
               to its first '.'
  '''
  # np.zeros default dtype (float64) is kept so saved arrays do not change
  buffer = np.zeros(shape=(len(sample_seq), img_size, img_size, 3))
  labels = list()
  for idx, img_path in enumerate(sample_seq):
      # forward the requested size; otherwise load_image falls back to its
      # 256x256 default and the assignment fails for any other img_size
      buffer[idx] = load_image(
          img_path, img_load='pil', img_size=(img_size, img_size))
      labels.append(img_path.split('/')[-1].split(' ')[-1].split('.')[0])
  return buffer, labels

def read_seq_file(filename):
  ''' Return the lines of a sequence file with surrounding whitespace stripped '''
  with open(filename, "r") as handle:
    return [raw_line.strip() for raw_line in handle]

def save_seq(arr, labels, idx, folder_name):
  '''
      Save `arr` as <folder_name>/idx_<idx>:<labels>.npy, where <labels> is
      the '_'-joined class tag of every entry in `labels` (last path
      component, last space-separated token, text before the first '.').
  '''
  class_tags = []
  for entry in labels:
    basename = entry.split('/')[-1]
    last_token = basename.split(' ')[-1]
    class_tags.append(last_token.split('.')[0])
  label = '_'.join(class_tags)
  target = folder_name + f'/idx_{idx}:' + label + '.npy'
  np.save(target, arr)

def sample_processing(sample):
  '''
      Convert one sequence file into a saved .npy array.
      `sample` is a (sequence file path, sample index, output folder) tuple;
      the frames listed in the file are loaded at 256x256, stacked and saved.
      Returns the sample index.
  '''
  seq_file, sample_idx, out_folder = sample
  frame_paths = read_seq_file(seq_file)
  stacked, frame_labels = read_sample(frame_paths, 256)
  save_seq(stacked, frame_labels, sample_idx, out_folder)
  return sample_idx

In [None]:
# Names of the videos (sub-folders of SAMPLE_SEQUENCES) whose sequences are
# converted to .npy below.
video_list = [
'IMG_02_1', 'IMG_02_4', 'IMG_05_1', 'IMG_11_1', 'IMG_14_2', 'IMG_20_1']

In [None]:
def process_data_split(folder_name, save_folder_name, video_list=None):
  '''
      Build the list of work items for a data split.

      folder_name: folder with one sub-folder of sequence files per video
      save_folder_name: folder passed through to every work item
      video_list: names of the video sub-folders to include; when None,
                  every entry of `folder_name` except '.DS_Store' is used
      returns: list of (sequence file path, running index, save_folder_name)
               tuples, indexed from 0 in traversal order
  '''
  if video_list is None:
    # the declared default used to crash in tqdm(None); fall back to all videos
    video_list = [
        name for name in os.listdir(folder_name) if name != '.DS_Store'
    ]

  samples = list()
  for video_name in tqdm(video_list):
    for sample in os.listdir(os.path.join(folder_name, video_name)):
      # len(samples) is the running index of the item being added
      samples.append(
          (os.path.join(folder_name, video_name, sample), len(samples), save_folder_name)
      )
  return samples

In [None]:
# Work items for the selected videos; the arrays are written to
# NUMPY_SAVE_FOLDER.
# NOTE(review): the variable is called samples_val but the target folder is
# '.../Test' -- confirm which split this run is meant to produce.
NUMPY_SAVE_FOLDER = 'data/StairNet_Seq_5/SplitsVideo_numpy/Test'
# np.save does not create missing folders, so make sure the target exists
os.makedirs(NUMPY_SAVE_FOLDER, exist_ok=True)
samples_val = process_data_split(SAMPLE_SEQUENCES, NUMPY_SAVE_FOLDER, video_list)

In [None]:
# Convert all work items in a pool of 16 worker processes; imap yields the
# returned indices in input order, which lets tqdm show progress.
with multiprocessing.Pool(processes = 16) as p:
    res = list(tqdm(p.imap(sample_processing, samples_val), total=len(samples_val)))