In [3]:
import os
import cv2
import torch
from ultralytics import YOLO


# PROCESS VIDEO

# Module-level YOLO model shared by process_frame(); it is built once when the
# cell runs. The trailing comment lists alternative weight files that can be
# substituted for 'yolov8n.pt'.
model = YOLO('yolov8n.pt')  # yolov8s.pt yolov8m.pt yolov8l.pt yolov8x.pt

def process_frame(frame):
    """Run the module-level ``model`` on one frame and return its rendering.

    Args:
        frame: A single image as read from ``cv2.VideoCapture``.

    Returns:
        The image produced by calling ``plot()`` on the first result that
        ``model`` returns for ``frame``.
    """
    detections = model(frame)
    first_result = detections[0]
    return first_result.plot()

def process_video(video_path, output_path):
    """Annotate every frame of a video and write the result to a new file.

    Each frame read from ``video_path`` is passed through ``process_frame``
    and the image it returns is appended to an ``mp4v``-encoded file at
    ``output_path``. One progress line is printed per frame.

    Args:
        video_path: Path of the video to read.
        output_path: Path of the annotated video to create. A missing parent
            directory is created first.

    Raises:
        FileNotFoundError: If ``video_path`` cannot be opened.
        RuntimeError: If the video writer for ``output_path`` cannot be
            opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise FileNotFoundError(f"Не удается открыть файл {video_path}")

    out = None
    try:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Reported by the container; used only for the progress message.
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Make sure the destination directory exists before opening the
        # writer, otherwise nothing would be written.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            # Without this check a failed writer drops every frame silently
            # and the function would still report success.
            raise RuntimeError(f"Cannot open video writer for {output_path}")

        current_frame = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            current_frame += 1

            annotated_frame = process_frame(frame)

            out.write(annotated_frame)

            print(f"Frame num {current_frame} / {total_frames}")
    finally:
        # Release the handles even when inference or writing raises.
        cap.release()
        if out is not None:
            out.release()

    print(f"Finished. Saved into {output_path}")

# Input clip and the destination of its annotated copy (paths are relative to
# the notebook's working directory).
video_path = "data/videos/house_4.mov"
output_path = "data/videos/annotation_videos/house_4_1_annotated.mp4"

# Runs the whole pipeline when the cell executes; one line is printed per frame.
process_video(video_path, output_path)


0: 640x352 4 bottles, 448.8ms
Speed: 41.9ms preprocess, 448.8ms inference, 77.8ms postprocess per image at shape (1, 3, 640, 352)
Frame num 1 из 302

0: 640x352 5 bottles, 1 refrigerator, 104.7ms
Speed: 11.9ms preprocess, 104.7ms inference, 22.0ms postprocess per image at shape (1, 3, 640, 352)
Frame num 2 из 302

0: 640x352 6 bottles, 1 refrigerator, 64.0ms
Speed: 2.0ms preprocess, 64.0ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 352)
Frame num 3 из 302

0: 640x352 4 bottles, 1 refrigerator, 60.0ms
Speed: 1.0ms preprocess, 60.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 352)
Frame num 4 из 302

0: 640x352 6 bottles, 1 refrigerator, 66.0ms
Speed: 1.0ms preprocess, 66.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 352)
Frame num 5 из 302

0: 640x352 6 bottles, 1 refrigerator, 73.0ms
Speed: 2.0ms preprocess, 73.0ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 352)
Frame num 6 из 302

0: 640x352 5 bottles, 1 refrigerato

In [1]:
import cv2
import numpy as np

# OPEN VIDEO
#
# Plays two videos side by side in one OpenCV window. Each video restarts
# from its first frame when it runs out of frames; press "q" to stop.

video_path_1 = 'data/videos/house_1.MOV'
video_path_2 = 'data/videos/annotation_videos/house_1_1_annotated.mp4'

# Delay between displayed frames, in milliseconds (also the key-poll interval).
FRAME_DELAY_MS = 50

cap1 = cv2.VideoCapture(video_path_1)
cap2 = cv2.VideoCapture(video_path_2)

try:
    # Fail early with a clear message instead of crashing later on
    # ``None.shape``; raising (rather than exit()) keeps the kernel alive.
    if not cap1.isOpened():
        raise FileNotFoundError(f"No path 1 - '{video_path_1}'.")

    if not cap2.isOpened():
        raise FileNotFoundError(f"No path 2 - '{video_path_2}'.")

    frame_count1 = int(cap1.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_count2 = int(cap2.get(cv2.CAP_PROP_FRAME_COUNT))

    while True:
        ret1, frame1 = cap1.read()
        ret2, frame2 = cap2.read()

        # A failed read is treated as end of stream: rewind and try once more.
        if not ret1:
            cap1.set(cv2.CAP_PROP_POS_FRAMES, 0)
            ret1, frame1 = cap1.read()

        if not ret2:
            cap2.set(cv2.CAP_PROP_POS_FRAMES, 0)
            ret2, frame2 = cap2.read()

        # If a stream still yields nothing after rewinding, there is no frame
        # to show; stop instead of dereferencing a missing frame.
        if not (ret1 and ret2):
            print("Could not read a frame from one of the videos; stopping playback.")
            break

        # Scale both frames to the smaller height, keeping each aspect ratio,
        # so they can be stacked horizontally.
        height = min(frame1.shape[0], frame2.shape[0])
        frame1 = cv2.resize(frame1, (int(frame1.shape[1] * height / frame1.shape[0]), height))
        frame2 = cv2.resize(frame2, (int(frame2.shape[1] * height / frame2.shape[0]), height))

        combined_frame = np.hstack((frame1, frame2))

        cv2.imshow('Combined Video', combined_frame)

        if cv2.waitKey(FRAME_DELAY_MS) & 0xFF == ord('q'):
            break
finally:
    # Always free the captures and close the window, even after an error.
    cap1.release()
    cap2.release()
    cv2.destroyAllWindows()

In [7]:
import requests

# DOWNLOAD THE MODEL FILE

def download_file(url, local_filename, timeout=30):
    """Stream the content at ``url`` into ``local_filename``.

    The body is written in 8192-byte chunks to ``<local_filename>.part`` and
    moved to ``local_filename`` only after the whole response has been
    received, so an interrupted download never leaves a truncated file under
    the final name.

    Args:
        url: Address of the file to download.
        local_filename: Destination path on disk.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).
            Without it a stalled server would block the call indefinitely.

    Returns:
        ``local_filename``, unchanged.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the destination cannot be written.
    """
    import os  # local import: the defining cell only imports ``requests``

    partial_path = f"{local_filename}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, local_filename)
    finally:
        # After a successful os.replace the partial file no longer exists;
        # on any failure this removes the incomplete download.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    return local_filename

# Source URL and local destination of the checkpoint file.
# NOTE(review): the URL uses plain http — confirm whether an https endpoint is available.
model_url = 'http://places2.csail.mit.edu/models_places365/resnet18_places365.pth.tar'
model_path = 'resnet18_places365.pth.tar'
# Performs the download when the cell runs; as the cell's last expression, the
# returned path is displayed as the cell output.
download_file(model_url, model_path)


'resnet18_places365.pth.tar'

In [1]:
import os
import cv2
import torch
import requests
from ultralytics import YOLO
from torchvision import models, transforms
from PIL import Image
from collections import Counter

# Object detector used by process_frame().
model_yolo = YOLO('yolov8n.pt')

# Scene classifier used by classify_scene(): a ResNet-18 with 365 output
# classes whose weights come from the checkpoint downloaded below.
model_places = models.resnet18(num_classes=365)
# NOTE(review): the checkpoint is fetched over plain http and deserialized by
# torch; only use it with a source you trust.
checkpoint = torch.hub.load_state_dict_from_url(
    'http://places2.csail.mit.edu/models_places365/resnet18_places365.pth.tar',
    map_location='cpu'
)
# Drop the 'module.' prefix from the checkpoint keys so they match the
# parameter names of the freshly built model.
state_dict = {k.replace('module.', ''): v for k, v in checkpoint['state_dict'].items()}
model_places.load_state_dict(state_dict)
model_places.eval()

# Preprocessing applied to every frame before scene classification.
centre_crop = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

LABELS_URL = 'https://raw.githubusercontent.com/csailvision/places365/master/categories_places365.txt'
# Fail loudly on a network or HTTP error: otherwise the body of an error page
# would be parsed into meaningless category names.
labels_response = requests.get(LABELS_URL, timeout=30)
labels_response.raise_for_status()
# Per non-empty line: take the first space-separated token and drop its first
# three characters.
categories = [
    line.strip().split(' ')[0][3:]
    for line in labels_response.text.split('\n')
    if line
]

def classify_scene(frame):
    """Return the scene category predicted for a single frame.

    The frame is converted from BGR to RGB, preprocessed with ``centre_crop``
    and passed through ``model_places``; the index of the largest logit is
    looked up in ``categories``.

    Args:
        frame: Image array as returned by ``cv2.VideoCapture.read`` (treated
            as BGR by the colour conversion).

    Returns:
        The entry of ``categories`` at the predicted class index.
    """
    img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    input_img = centre_crop(img).unsqueeze(0)  # add the batch dimension
    # Inference only: disabling autograd avoids building a graph per frame.
    with torch.no_grad():
        logit = model_places(input_img)
    predicted = torch.argmax(logit, dim=1)
    return categories[predicted.item()]

def process_frame(frame):
    """Detect objects in one frame with ``model_yolo``.

    Args:
        frame: A single image as read from ``cv2.VideoCapture``.

    Returns:
        A ``(annotated_frame, labels)`` tuple: the image produced by
        ``plot()`` on the first result, and the list of class names (looked
        up in ``model_yolo.names``) for every detected box.
    """
    detection = model_yolo(frame)[0]
    plotted = detection.plot()
    class_ids = detection.boxes.cls.tolist()
    return plotted, [model_yolo.names[int(class_id)] for class_id in class_ids]

def process_video(video_path, output_path):
    """Annotate a video, then print a summary of its scene and objects.

    Each frame read from ``video_path`` is passed to ``process_frame`` (whose
    annotated image is written to ``output_path`` with the ``mp4v`` codec)
    and to ``classify_scene``. After the last frame a report is printed with
    the most frequently predicted scene and the number of detections per
    object class, summed over all frames.

    Args:
        video_path: Path of the video to read.
        output_path: Path of the annotated video to create. A missing parent
            directory is created first.

    Raises:
        FileNotFoundError: If ``video_path`` cannot be opened.
        RuntimeError: If the video writer for ``output_path`` cannot be
            opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise FileNotFoundError(f"Не удается открыть файл {video_path}")

    out = None
    current_frame = 0
    # Counters keep first-seen order, so the report lists objects in the
    # order they were first detected.
    object_counts = Counter()
    scene_counts = Counter()

    try:
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        fps = cap.get(cv2.CAP_PROP_FPS)
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        # Reported by the container; used only for the progress message.
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # Make sure the destination directory exists before opening the
        # writer, otherwise nothing would be written.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            # Without this check a failed writer drops every frame silently
            # and the function would still report success.
            raise RuntimeError(f"Cannot open video writer for {output_path}")

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            current_frame += 1

            annotated_frame, objects = process_frame(frame)
            object_counts.update(objects)

            scene_counts[classify_scene(frame)] += 1

            out.write(annotated_frame)

            print(f"Frame num {current_frame} / {total_frames}")
    finally:
        # Release the handles even when inference or writing raises.
        cap.release()
        if out is not None:
            out.release()

    print(f"Finished. Saved into {output_path}")

    if not scene_counts:
        # No frame was decoded, so there is no scene or object to report
        # (max() on an empty mapping would raise ValueError).
        print("No frames could be read from the video; nothing to summarise.")
        return

    common_scene = max(scene_counts, key=scene_counts.get)
    detected_objects = ', '.join(f"{obj} ({count})" for obj, count in object_counts.items())

    print("\n" + "="*50)
    print("Hello!")
    print(f"My data indicates that the video most likely shows: {common_scene}")
    print("I have detected the following objects:")
    print(detected_objects)
    print("="*50 + "\n")
    
# Input clip and the destination of its annotated copy (paths are relative to
# the notebook's working directory).
# NOTE(review): another cell opens 'data/videos/house_1.MOV' (upper-case
# extension) — confirm the real file name; it matters on case-sensitive
# filesystems.
video_path = "data/videos/house_1.mov"
output_path = "data/videos/annotation_videos/house_1_1_annotated.mp4"

# Runs detection, scene classification and the final report when the cell
# executes; one line is printed per frame.
process_video(video_path, output_path)


0: 640x352 1 bottle, 2 sinks, 73.8ms
Speed: 3.0ms preprocess, 73.8ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 352)
Frame num 1 / 313

0: 640x352 1 bottle, 1 sink, 57.9ms
Speed: 2.0ms preprocess, 57.9ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 352)
Frame num 2 / 313

0: 640x352 1 bottle, 1 sink, 59.8ms
Speed: 1.0ms preprocess, 59.8ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 352)
Frame num 3 / 313

0: 640x352 1 bottle, 2 sinks, 58.8ms
Speed: 1.0ms preprocess, 58.8ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 352)
Frame num 4 / 313

0: 640x352 1 bottle, 1 sink, 63.8ms
Speed: 3.0ms preprocess, 63.8ms inference, 2.0ms postprocess per image at shape (1, 3, 640, 352)
Frame num 5 / 313

0: 640x352 1 bottle, 1 sink, 71.0ms
Speed: 2.0ms preprocess, 71.0ms inference, 1.6ms postprocess per image at shape (1, 3, 640, 352)
Frame num 6 / 313

0: 640x352 1 bottle, 1 sink, 64.0ms
Speed: 1.5ms preprocess, 64.0ms inference, 1.0ms 