# Imports

In [None]:
import cv2
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
import tensorflow as tf

# Apply seaborn's default theme to the matplotlib figures drawn below.
sns.set()

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Data Exploration

<p>We will begin by exploring the data and preparing arrays of RGB and grayscale frames to be used throughout the notebook.</p>

In [None]:
# Decode every frame of the video once, keeping an RGB copy and a grayscale
# copy of each frame. OpenCV decodes to BGR, hence the explicit conversions.
video_path = 'drive/MyDrive/video.mp4'

frames = []
gray_frames = []
video = cv2.VideoCapture(video_path)
try:
    # Fail loudly here rather than with an obscure IndexError further down
    # when the file is missing, unreadable, or the path is wrong.
    if not video.isOpened():
        raise OSError(f"Could not open video file: {video_path!r}")
    while True:
        ret, frame = video.read()
        if not ret:
            # No more frames could be read.
            break
        gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        gray_frames.append(gray_frame)
        frames.append(rgb)
finally:
    # Always free the capture handle, even if decoding raised.
    video.release()

if not frames:
    raise ValueError(f"No frames could be decoded from {video_path!r}")

# Stack the per-frame images into single arrays indexed by frame number.
frames = np.array(frames)
gray_frames = np.array(gray_frames)

In [None]:
# Sanity-check the loaded frames: they must all share the shape and dtype
# of the first frame before being treated as one homogeneous dataset.
image_nb = len(frames)
image_shape = frames[0].shape
assert all(x.shape == image_shape for x in frames)
assert all(x.dtype == frames[0].dtype for x in frames)
print(f"{image_nb} frames found, each of type {frames[0].dtype} and shape {image_shape}.")

In [None]:
def show(frames, figsize=(20,100)):
    """Display images in a 3-column grid, titling each one with its index.

    Args:
        frames: Sequence or array of images, indexed along its first axis.
            Each item is passed to ``imshow``.
        figsize: Figure size forwarded to ``plt.subplots``.

    Prints a short message and returns without plotting when ``frames``
    is empty.
    """
    # Restore matplotlib's original rc parameters (undoes sns.set()).
    sns.reset_orig()
    n_cols = 3
    n_frames = len(frames)
    if n_frames == 0:
        # plt.subplots rejects a grid with zero rows.
        print("No frames to show.")
        return
    n_rows = int(np.ceil(n_frames / n_cols))
    # squeeze=False keeps `axs` 2-D even when there is a single row;
    # otherwise a selection of <= 3 frames yields a 1-D array of axes.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=figsize, squeeze=False)
    for i, ax in enumerate(axs.flat):
        if i < n_frames:
            ax.set_title(f"frame {i}")
            ax.imshow(frames[i])
        else:
            # Hide the unused cells of the last row.
            ax.axis("off")
    # Lay out after drawing so the titles are taken into account.
    fig.tight_layout()
    plt.show()

In [None]:
# Display every extracted frame in a grid for a visual overview.
show(frames)

# Outlier Detection

## PCA

In [None]:
# Project each flattened grayscale frame onto its first two principal
# components so the frames can be plotted and clustered in 2-D.
# random_state pins the result when scikit-learn selects a randomized SVD
# solver (it is ignored by the exact solver), so the distance thresholds
# tuned on this projection stay valid from run to run.
pca = PCA(n_components=2, random_state=0)
transformed_data = pca.fit_transform(gray_frames.reshape(gray_frames.shape[0], -1))
print(f"explained variace ratio: {np.sum(pca.explained_variance_ratio_)}")

In [None]:
def scatter_plot_2d(data, figsize=(16,8), annotate=False, labels=None):
    """Scatter-plot the first two columns of ``data``.

    Args:
        data: 2-D array; column 0 is plotted on x and column 1 on y.
        figsize: Figure size forwarded to ``plt.subplots``.
        annotate: When true, write each point's row index next to it.
        labels: Optional integer cluster label per row. Negative labels
            (e.g. DBSCAN noise, -1) are drawn in red; non-negative labels
            cycle through yellow, green, blue and black.
    """
    sns.set()
    fig, ax = plt.subplots(figsize=figsize)

    if labels is not None:
        labels = np.asarray(labels)
        cluster_colors = np.array(['yellow', 'green', 'blue', 'black'])
        # Wrap around the palette so any number of clusters can be drawn,
        # and reserve red for noise so no real cluster shares its colour.
        point_colors = np.where(
            labels < 0,
            'red',
            cluster_colors[labels % len(cluster_colors)],
        )
        ax.scatter(data[:, 0], data[:, 1], marker="o", c=point_colors)
    else:
        ax.scatter(data[:, 0], data[:, 1], marker="o")
    if annotate:
        for i, xy in enumerate(data):
            ax.annotate(str(i), xy=(xy[0], xy[1]), fontsize=10)
    plt.show()

In [None]:
# Plot the frames in the 2-D PCA space.
scatter_plot_2d(transformed_data)

## DBSCAN

In [None]:
# Histogram of the Euclidean distance between every ordered pair of frames
# in the PCA space (self-distances included), built by broadcasting the
# data against itself.
plt.hist(
    np.sqrt(
        np.sum(np.square(transformed_data - transformed_data[:, np.newaxis]), axis=-1)
    ).ravel(),
    bins=30,
)
plt.show()

In [None]:
# Cluster the 2-D PCA projection with DBSCAN; points it cannot assign to
# any cluster receive the label -1.
clustering = DBSCAN(eps=10000, n_jobs=-1)
clustering.fit(transformed_data)
labels = clustering.labels_
unique_labels = np.unique(labels)

In [None]:
# Summarise the DBSCAN result. The reported count is the number of distinct
# labels, so it includes the noise label -1 whenever an outlier is present.
print(f"""We found {unique_labels.shape[0]} clusters, where outliers get the cluster -1.
The clusters indices are: {tuple(unique_labels)}.
There are {np.sum(labels==-1)} outliers.""")

In [None]:
# Visually inspect the frames labelled -1 (noise) by DBSCAN.
show(frames[labels==-1], figsize=(20,10))

In [None]:
# Keep only the rows whose DBSCAN label is not -1, applying the same
# selection to the RGB frames, the grayscale frames and the PCA projection.
clean_frames, clean_gray_frames, clean_transformed_data = (
    arr[labels != -1] for arr in (frames, gray_frames, transformed_data)
)

In [None]:
# Plot the PCA projection again with the outlier frames removed.
scatter_plot_2d(clean_transformed_data)

# Plot the cumulative variance w.r.t. the number of components

In [None]:
# Fit a PCA that keeps every component on the flattened grayscale frames,
# then accumulate the per-component explained-variance ratios.
# NOTE(review): this uses gray_frames (outliers included), not
# clean_gray_frames - confirm that is intended.
pca = PCA().fit(gray_frames.reshape(len(gray_frames), -1))

cumsum = pca.explained_variance_ratio_.cumsum()

In [None]:
# Plot the cumulative explained-variance ratio against the number of
# principal components kept.
sns.set()
plt.figure(figsize=(12,8))
plt.title("Cumulative variance ratio w.r.t to components numer")
plt.xlabel("Components")
plt.ylabel("variance ratio")
# cumsum[k - 1] is the ratio explained by the first k components, so the
# x-axis starts at 1. Its length follows cumsum itself, since PCA keeps
# min(n_samples, n_features) components.
plt.plot(np.arange(1, len(cumsum) + 1), cumsum)
plt.show()

## Dimensionality Reduction with Pretrained Network

---



In [None]:
# Build an ImageNet-pretrained ConvNeXt-XLarge as a fixed feature extractor
# that takes the video's frame shape as its input shape.
image_shape = frames[0].shape
extractor = tf.keras.applications.ConvNeXtXLarge(
    input_shape=image_shape,
    weights="imagenet",
    # Drop the classification head and average-pool the last feature map.
    include_top=False,
    pooling='avg',
    # Input preprocessing is handled inside the model.
    include_preprocessing=True,
    model_name="convnext_xlarge",
)

In [None]:
# Run the RGB frames through the extractor, one frame per batch.
output = extractor.predict(frames, batch_size=1)

In [None]:
# Inspect the shape of the extracted feature array.
output.shape

In [None]:
# Histogram of the Euclidean distance between every ordered pair of frames
# in the extracted feature space (self-distances included).
plt.hist(
    np.sqrt(
        np.sum(np.square(output - output[:, np.newaxis]), axis=-1)
    ).ravel(),
    bins=30,
)
plt.show()

In [None]:
# Cluster the extracted features with DBSCAN and display the frames it
# labels -1 (noise). This rebinds `clustering` and `labels`, replacing the
# values from the PCA-based run.
clustering = DBSCAN(eps=10, n_jobs=-1)
clustering.fit(output)
labels = clustering.labels_
show(frames[labels == -1], figsize=(20, 10))