## Import data

In [1]:
!unzip drive/MyDrive/INF8770_TP3_A2023.zip

Archive:  drive/MyDrive/INF8770_TP3_A2023.zip
   creating: src/
  inflating: src/evaluate.py         
   creating: data/
   creating: data/challenge/
   creating: data/challenge/jpeg/
  inflating: data/challenge/jpeg/i001.jpeg  
  inflating: data/challenge/jpeg/i002.jpeg  
  inflating: data/challenge/jpeg/i003.jpeg  
  inflating: data/challenge/jpeg/i004.jpeg  
  inflating: data/challenge/jpeg/i005.jpeg  
  inflating: data/challenge/jpeg/i006.jpeg  
  inflating: data/challenge/jpeg/i007.jpeg  
  inflating: data/challenge/jpeg/i008.jpeg  
  inflating: data/challenge/jpeg/i009.jpeg  
  inflating: data/challenge/jpeg/i010.jpeg  
  inflating: data/challenge/jpeg/i011.jpeg  
  inflating: data/challenge/jpeg/i012.jpeg  
  inflating: data/challenge/jpeg/i013.jpeg  
  inflating: data/challenge/jpeg/i014.jpeg  
  inflating: data/challenge/jpeg/i015.jpeg  
  inflating: data/challenge/jpeg/i016.jpeg  
  inflating: data/challenge/jpeg/i017.jpeg  
  inflating: data/challenge/jpeg/i018.jpeg  
  infl

## Data Exploration

In [2]:
import os
import pandas as pd
import cv2
import numpy as np
from tqdm import tqdm

In [3]:
os.listdir('data/test/')

['test_gt.csv', 'jpeg', 'png']

In [4]:
test = pd.read_csv('data/test/test_gt.csv')
test.head()

Unnamed: 0,image,video,minutage
0,i001,v001,1.75
1,i002,v050,17.017
2,i003,out,
3,i004,v042,11.378033
4,i005,v063,5.338667


## Q.2

### Algorithme de Carol

In [5]:
# Load a video and store the histograms of trames according to target_fps frequence
def load_video(filename,target_fps,hist_bins=128,hist_range=[0,256]):
  cap = cv2.VideoCapture(f"data/mp4/{filename}")
  fps = cap.get(cv2.CAP_PROP_FPS)
  if target_fps is None:
    target_fps = fps
  i = 0
  assert(target_fps<=fps)
  interval = fps//target_fps
  list_hist = []
  # Look through the frames
  while(True):
    ret, frame = cap.read()
    if ret:
      if i%interval == 0:
        hist = np.histogram(frame.ravel(),bins=hist_bins,range=hist_range)
        list_hist.append(hist[0])
      i+=1
    if cv2.waitKey(1) & 0xFF == ord('q') or ret==False :
        cap.release()
        cv2.destroyAllWindows()
        break
  time_array = np.arange(len(list_hist))/target_fps
  return np.array(list_hist), time_array

In [6]:
# Build histograms dict
video_dic = dict()
for filename in tqdm(os.listdir('data/mp4')):
  video_dic[filename[:-4]] = load_video(filename,target_fps=1,hist_bins=128,hist_range=[0,256])

100%|██████████| 100/100 [03:56<00:00,  2.36s/it]


In [7]:
# Function that returns the closest video and time
def get_video(image_filename,out_threshold=np.inf):
  img = cv2.imread(f'data/test/{image_filename}')
  img_hist = np.histogram(img,128,[0,256])[0]
  global_min = np.inf
  for video in video_dic:
    local_min = np.linalg.norm(np.array(video_dic[video][0])-img_hist,axis=1).min()
    if local_min < global_min:
      global_min = local_min
      best_video = video
      best_index = np.linalg.norm(np.array(video_dic[video][0])-img_hist,axis=1).argmin()
      best_time = video_dic[best_video][1][best_index]
      if global_min==0:
        return best_video,best_index
  if global_min>out_threshold:
    return None,None
  else:
    return best_video,best_time

get_video('jpeg/i045.jpeg')

('v054', 3.0)

In [8]:
# Loads test dataset
test=pd.read_csv('/content/data/test/test_gt.csv')

In [9]:
# Creates prediction on png
res_png = pd.DataFrame()
video_pred=[]
minutage_pred=[]
for image in tqdm(test.image):
  image_filename='png/'+image+'.png'
  video,minutage = get_video(image_filename)
  video_pred.append(video)
  minutage_pred.append(minutage)
res_png['image']=test.image
res_png['video_pred']=video_pred
res_png['minutage_pred']=minutage_pred
# Stores results in file
res_png.to_csv("results/test_png.csv",index=False)

100%|██████████| 300/300 [00:18<00:00, 15.93it/s]


In [10]:
# Creates prediction on jpeg
res_jpeg = pd.DataFrame()
video_pred=[]
minutage_pred=[]
for image in tqdm(test.image):
  image_filename='jpeg/'+image+'.jpeg'
  video,minutage = get_video(image_filename)
  video_pred.append(video)
  minutage_pred.append(minutage)
res_jpeg['image']=test.image
res_jpeg['video_pred']=video_pred
res_jpeg['minutage_pred']=minutage_pred
# Stores results in file
res_jpeg.to_csv("results/test_jpeg.csv",index=False)

100%|██████████| 300/300 [00:15<00:00, 19.52it/s]


In [11]:
!python src/evaluate.py --file=results/test_png.csv --file_gt=data/test/test_gt.csv

Taux de bonnes réponses : 78.0% (234/300)
Ecart temporel moyen : 0.58 sec


In [12]:
!python src/evaluate.py --file=results/test_jpeg.csv --file_gt=data/test/test_gt.csv

Taux de bonnes réponses : 67.7% (203/300)
Ecart temporel moyen : 2.59 sec


In [13]:
# Upper bound sur la précision
(test.video!='out').mean()

0.7866666666666666

## Q3.

### Méthode avancée

In [14]:
!pip install torchmetrics
!pip install annoy
!pip install decord

Collecting torchmetrics
  Downloading torchmetrics-1.2.0-py3-none-any.whl (805 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/805.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.9/805.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m450.6/805.2 kB[0m [31m6.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/805.2 kB[0m [31m8.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.10.0 torchmetrics-1.2.0
Collecting 

In [15]:
import os
from PIL import Image
import torch
import torch.nn as nn
from torchvision import models, transforms
from torchmetrics.image import StructuralSimilarityIndexMeasure
from torchmetrics.regression import MeanSquaredError
from annoy import AnnoyIndex
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from decord import VideoReader
from decord import cpu
import matplotlib.pyplot as plt

MP4_DIR = "data/mp4/"

model_transform = transforms.Compose([
    transforms.ToPILImage("RGB"),
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

to_gray_scale_transform = transforms.Compose([
    transforms.Grayscale()
])

"""# Find all images"""
if torch.cuda.is_available():
    dev = "cuda:0"
else:
    dev = "cpu"
device = torch.device(dev)

# Charge le réseau préentrainé
resnet50_weights = models.ResNet50_Weights.IMAGENET1K_V1

ResNet50_model = models.resnet50(weights=resnet50_weights)

ResNet50_model.fc = nn.Identity()

# Utilise si possible le GPU
ResNet50_model.to(device)

# # Pour ne pas entraîner le modèle
ResNet50_model.eval()

files_dict = {}

# Pour analyser des photos de même taille


"""Trouver des images similaires"""

def generate_dict():
    video_files_names: list = os.listdir(MP4_DIR)
    temp_frames = {}
    id = 0
    # We use a global index that indentify a frame in all the videos
    for video_name in video_files_names:
        path = os.path.join(MP4_DIR, video_name)
        vr = VideoReader(path)
        total_frames = len(vr)
        fps = vr.get_avg_fps()
        for frame in range(total_frames):
            temp_frames[id] = {
                "video": video_name,
                "path": path,
                "index": frame,
                "fps": fps,
                "total_frames": total_frames,
                "timestamp": frame / fps,
                "global_index": id,
            }
            id += 1
    return temp_frames

frames = generate_dict()

def video_process():
    resnet_annoy_index_20 = AnnoyIndex(2048, "angular")

    videos_files_names: list = os.listdir(MP4_DIR)

    frame_id = 0
    for video_index in tqdm(range(len(videos_files_names))):
        video_path = MP4_DIR + videos_files_names[video_index]
        vr = VideoReader(video_path, ctx=cpu(0))
        fps = vr.get_avg_fps()
        total_frames = len(vr)
        for frames_index in (range((int(total_frames)))):
            frame = vr[frames_index]

            input_tensor = model_transform(frame.asnumpy()).unsqueeze(0).to(device)
            if input_tensor.size()[1] == 3:
                resnet_output_tensor = ResNet50_model(input_tensor)
                resnet_out_np = resnet_output_tensor[0].data.cpu().numpy()

                files_dict[frame_id] = {
                    "video": videos_files_names[video_index],
                    "frame": frames_index,
                    "time": frames_index / fps}
                # Ajoute la représentation de la frame fournie par le modèle ResNet50 à l'index
                resnet_annoy_index_20.add_item(frame_id, resnet_out_np)
                frame_id += 1

    resnet_annoy_index_20.build(20)
    resnet_annoy_index_20.save("resnet_annoy_index_20.ann")


def cosine_similarity(tensor_hist_1, tensor_hist_2):
    return torch.dot(tensor_hist_1, tensor_hist_2) / (torch.norm(tensor_hist_1) * torch.norm(tensor_hist_2))


def show_plot(im1, im2, name, ratio=(12,16), cmap=None):
    fig, axs = plt.subplots(2, 1, figsize=ratio)
    axs[0].set_title("Original")
    axs[1].set_title("Predicted")
    if not cmap:
        axs[0].imshow(im1)
        axs[1].imshow(im2)
    else:
        axs[0].imshow(im1, cmap=cmap)
        axs[1].imshow(im2, cmap=cmap)
    plt.title(name)
    plt.show()

def test_dir(directory, csv_name):

    annoy_annex = AnnoyIndex(2048, "angular")
    annoy_annex.load("resnet_annoy_index_20.ann")

    image, video_pred, minutage_pred = [], [], []
    images_files = os.listdir(directory)

    n_bins = 25

    for image_file in tqdm(images_files, desc=csv_name):
        img = Image.open(directory + image_file)
        np_img = np.array(img)

        model_input_tensor = model_transform(np.array(img)).unsqueeze(0).to(device)

        tensor_original = torch.from_numpy(np_img).permute((2, 0, 1)).to(device).float()
        tensor_luminance_original = to_gray_scale_transform(tensor_original)[0]
        tensor_original_gauss1 = transforms.functional.gaussian_blur(tensor_luminance_original.unsqueeze(0), (9, 9),3).squeeze(0).to(device)
        tensor_original_gauss2 = transforms.functional.gaussian_blur(tensor_luminance_original.unsqueeze(0), (9, 9),6).squeeze(0).to(device)
        tensor_original_dif_gauss = torch.absolute(tensor_original_gauss1 - tensor_original_gauss2)
        tensor_original_dif_gauss = tensor_original_dif_gauss * 255 / torch.max(tensor_original_dif_gauss).item()
        hist_luminance_original = torch.histc(tensor_luminance_original, bins=n_bins)

        tensor_original_rgb = tensor_original.unbind(0)
        hist_red_original = torch.histc(tensor_original_rgb[0], bins=n_bins)
        hist_green_original = torch.histc(tensor_original_rgb[1], bins=n_bins)
        hist_blue_original = torch.histc(tensor_original_rgb[2], bins=n_bins)


        if model_input_tensor.size()[1] == 3:

            model_output_tensor = ResNet50_model(model_input_tensor)
            # Retrouve la trame la plus proche dans l'index Annoy
            voisin_proche = annoy_annex.get_nns_by_vector(model_output_tensor[0], 1)

            video_name = frames[voisin_proche[0]]["video"]
            video_frame = frames[voisin_proche[0]][("index")]
            video_timestamp = frames[voisin_proche[0]]["timestamp"]
            predicted_video = VideoReader(MP4_DIR + video_name)
            np_predicted = predicted_video[video_frame].asnumpy()
            tensor_predicted = torch.from_numpy(np_predicted).permute((2, 0, 1)).to(device).float()

            tensor_luminance_predicted = to_gray_scale_transform(tensor_predicted)[0].to(device)

            tensor_predicted_gauss1 = transforms.functional.gaussian_blur(tensor_luminance_predicted.unsqueeze(0), (9, 9),3).squeeze(0).to(device)
            tensor_predicted_gauss2 = transforms.functional.gaussian_blur(tensor_luminance_predicted.unsqueeze(0), (9, 9), 6).squeeze(0).to(device)
            tensor_predicted_dif_gauss = torch.absolute(tensor_predicted_gauss1-tensor_predicted_gauss2)
            tensor_predicted_dif_gauss = tensor_predicted_dif_gauss * 255/torch.max(tensor_predicted_dif_gauss).item()

            # unbind(0) seperate each channel of the image in three tensors R, G and B
            tensor_predicted_rgb = tensor_predicted.unbind(0)

            hist_luminance_predicted = torch.histc(tensor_luminance_predicted, bins=n_bins)
            # Cosine Similarity of luminance histogram
            cos_l = cosine_similarity(hist_luminance_original, hist_luminance_predicted).detach().cpu().item()

            hist_red_predicted = torch.histc(tensor_predicted_rgb[0], bins=n_bins)
            hist_green_predicted = torch.histc(tensor_predicted_rgb[1], bins=n_bins)
            hist_blue_predicted = torch.histc(tensor_predicted_rgb[2], bins=n_bins)

            # Cosine Similarity of r, g and b histograms
            cos_r = cosine_similarity(hist_red_original, hist_red_predicted)
            cos_g = cosine_similarity(hist_green_original, hist_green_predicted)
            cos_b = cosine_similarity(hist_blue_original, hist_blue_predicted)
            cos_rgb = torch.mean(torch.cat((cos_r.unsqueeze(0), cos_g.unsqueeze(0), cos_b.unsqueeze(0)))).detach().cpu().item()

            # SSIM
            ssim_fct = StructuralSimilarityIndexMeasure(data_range=255).to(device)
            ssim = ssim_fct(tensor_original.unsqueeze(0), tensor_predicted.unsqueeze(0)).detach().cpu().item()

            # MSE
            mean_squared_error_fct = MeanSquaredError(squared=False).to(device)
            mse_l = mean_squared_error_fct(tensor_luminance_original, tensor_luminance_predicted).detach().cpu().item()
            mse_b = mean_squared_error_fct(tensor_original_dif_gauss, tensor_predicted_dif_gauss).detach().cpu().item()

            image.append(image_file[:4])
            if ssim > 0.8 or cos_rgb > 0.9 or cos_l > 0.92 or mse_l < 40 or mse_b < 12:
                video_pred.append(video_name[:4])
                minutage_pred.append(video_timestamp)
            else:
                video_pred.append("out")
                minutage_pred.append("")

    df_csv = pd.DataFrame({"image": image,
                           "video_pred": video_pred,
                           "minutage_pred": minutage_pred
                           })
    df_csv.to_csv(f"{csv_name}.csv", index=False)


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:02<00:00, 44.8MB/s]


In [16]:
# Génère le fichier annoy
# Le fichier généré est disponible ici : https://drive.google.com/file/d/1A-W9iZ_XbYTEcuQ9wRHCjACNxSmEvvcr/view?usp=sharing
video_process()

100%|██████████| 100/100 [14:05<00:00,  8.46s/it]


In [21]:
# Recherche sur les requêts tests
test_dir("data/test/png/", "results/test_png")
test_dir("data/test/jpeg/", "results/test_jpeg")

results/test_png: 100%|██████████| 300/300 [02:17<00:00,  2.18it/s]
results/test_jpeg: 100%|██████████| 300/300 [02:03<00:00,  2.43it/s]


In [22]:
!python src/evaluate.py --file=results/test_png.csv --file_gt=data/test/test_gt.csv

Taux de bonnes réponses : 99.3% (298/300)
Ecart temporel moyen : 0.00 sec


In [28]:
!python src/evaluate.py --file=results/test_jpeg.csv --file_gt=data/test/test_gt.csv

Taux de bonnes réponses : 99.7% (299/300)
Ecart temporel moyen : 0.31 sec


## Q4.

In [25]:
# Recherche sur les requêts challenge
test_dir("data/challenge/png/", "results/challenge_png")
test_dir("data/challenge/jpeg/", "results/challenge_jpeg")

results/challenge_png: 100%|██████████| 300/300 [02:15<00:00,  2.21it/s]
results/challenge_jpeg: 100%|██████████| 300/300 [02:02<00:00,  2.45it/s]
