# LLaVa


In [None]:
import torch
from datasets import load_dataset
from PIL import Image
import requests
import json
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np
import re

from transformers import LlavaProcessor, LlavaForConditionalGeneration

from typing import List, Dict, Any
import pandas as pd
from sklearn.metrics import accuracy_score

from io import BytesIO

In [None]:
class LLaVATester:
    def __init__(self, model_name="llava-hf/llava-1.5-7b-hf", device=torch.device("cuda:0")):
        self.processor = LlavaProcessor.from_pretrained(model_name, use_fast=True)
        self.model = LlavaForConditionalGeneration.from_pretrained(
            model_name,
            dtype=torch.float16,
            low_cpu_mem_usage=True
        ).to(device)
        self.device = device

    def prepare_prompt(self, question, choices):
        choices_text = "\n".join([f"{chr(65+i)}. {choice}" for i, choice in enumerate(choices)])

        prompt = f"""USER: <image>Вопрос: {question} Варианты ответа: {choices_text} Выбери правильный вариант и объясни свой выбор. ASSISTANT:"""
        return prompt

    def extract_answer(self, response):
        """Извлечение ответа из текста модели"""
        response_clean = response.upper().strip()

        patterns = [
            r'ANSWER[:\s]*([A-Z])',
            r'CORRECT[:\s]*([A-Z])',
            r'OPTION[:\s]*([A-Z])',
            r'CHOICE[:\s]*([A-Z])',
            r'\b([A-Z])\b',
            r'\(([A-Z])\)',
            r'\[([A-Z])\]',
        ]

        for pattern in patterns:
            match = re.search(pattern, response)
            if match:
                return match.group(1)
        return None

    def run_inference(self, example):
        """Запуск инференса для одного примера"""
        prompt = self.prepare_prompt(example['question'], example['choices'])
        inputs = self.processor(
            text=prompt,
            images=example['image'],
            return_tensors="pt",
            padding=True
        )
        inputs = {k: v.to(self.device) if isinstance(v, torch.Tensor) else v
         for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=512,
                do_sample=True,
                temperature=0.3,
                pad_token_id=self.processor.tokenizer.eos_token_id
            )

        response = self.processor.decode(outputs[0], skip_special_tokens=True)

        if "ASSISTANT:" in response:
            assistant_response = response.split("ASSISTANT:")[-1].strip()
        else:
            assistant_response = response

        predicted_answer = self.extract_answer(assistant_response)

        correct_answer = chr(65 + example['answer'])  # 0->A, 1->B, etc.

        return {
            'question': example['question'],
            'choices': example['choices'],
            'correct_answer': correct_answer,
            'predicted_answer': predicted_answer,
            'model_response': assistant_response,
            'is_correct': predicted_answer == correct_answer if predicted_answer else False
        }

In [None]:
class ScienceQAExperiment:
    def __init__(self, model_name="llava-hf/llava-1.5-7b-hf", device=torch.device("cuda:0")):
        self.processor = LlavaProcessor.from_pretrained(model_name, use_fast=True)
        self.model = LlavaForConditionalGeneration.from_pretrained(
            model_name,
            dtype=torch.float16,
            low_cpu_mem_usage=True
        ).to(device)
        self.device = device
        self.model.eval()

    def load_scienceqa_data(self, split="test", subject=None):
        """Загрузка и подготовка данных ScienceQA"""
        dataset = load_dataset("derek-thomas/ScienceQA", split=split)

        processed_data = []
        for item in dataset:
            # Фильтрация по предмету если указан
            if subject and item['subject'] != subject:
                continue

            # Загрузка изображения если есть
            image = None
            if item['image']:
                try:
                    if item['image'].startswith('http'):
                        response = requests.get(item['image'])
                        image = Image.open(BytesIO(response.content))
                    else:
                        # Для локальных путей
                        image = Image.open(item['image'])
                except:
                    image = None

            processed_data.append({
                'id': item['id'],
                'question': item['question'],
                'choices': item['choices'],
                'answer': item['answer'],
                'image': image,
                'subject': item['subject'],
                'topic': item['topic'],
                'hint': item['hint'],
                'has_image': item['image'] is not None
            })

        return processed_data

    def prepare_prompt(self, question: str, choices: List[str], hint: str = None,
                      prompt_template: str = "scienceqa") -> str:
        """Подготовка промпта для ScienceQA с учетом специфики"""
        choices_text = "\n".join([f"{chr(65+i)}. {choice}" for i, choice in enumerate(choices)])

        templates = {
            "scienceqa": f"USER: <image>\nQuestion: {question}\n{hint + ' ' if hint else ''}Options: {choices_text}\nChoose the correct answer and explain your reasoning. ASSISTANT:",
            "direct": f"USER: <image>\n{question}\nOptions: {choices_text}\nAnswer: ASSISTANT:",
            "reasoning": f"USER: <image>\nQuestion: {question}\n{hint + ' ' if hint else ''}Options: {choices_text}\nThink step by step and explain your reasoning before giving the final answer. ASSISTANT:",
            "cot": f"USER: <image>\nQuestion: {question}\n{hint + ' ' if hint else ''}Options: {choices_text}\nLet's think through this step by step: ASSISTANT:"
        }

        return templates.get(prompt_template, templates["scienceqa"])

    def run_inference(self, example: Dict, generation_config: Dict = None) -> Dict:
        """Запуск инференса для ScienceQA примера"""
        if generation_config is None:
            generation_config = {
                "max_new_tokens": 512,
                "do_sample": True,
                "temperature": 0.3,
                "top_p": 0.9,
                "pad_token_id": self.processor.tokenizer.eos_token_id
            }

        prompt = self.prepare_prompt(
            example['question'],
            example['choices'],
            example.get('hint')
        )

        try:
            # Если нет изображения, используем текстовый промпт без <image>
            if not example['image']:
                prompt = prompt.replace("<image>", "")

            inputs = self.processor(
                text=prompt,
                images=example['image'] if example['image'] else None,
                return_tensors="pt",
                padding=True
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model.generate(**inputs, **generation_config)

            response = self.processor.decode(outputs[0], skip_special_tokens=True)

            # Извлечение ответа ассистента
            if "ASSISTANT:" in response:
                assistant_response = response.split("ASSISTANT:")[-1].strip()
            else:
                assistant_response = response.split("ASSISTANT")[-1].strip() if "ASSISTANT" in response else response

            predicted_answer = self.extract_answer(assistant_response, len(example['choices']))
            correct_answer = chr(65 + example['answer'])

            return {
                'id': example['id'],
                'question': example['question'],
                'choices': example['choices'],
                'correct_answer': correct_answer,
                'predicted_answer': predicted_answer,
                'model_response': assistant_response,
                'is_correct': predicted_answer == correct_answer if predicted_answer else False,
                'subject': example['subject'],
                'topic': example['topic'],
                'has_image': example['has_image']
            }

        except Exception as e:
            print(f"Error processing example {example['id']}: {e}")
            return None

    def extract_answer(self, response: str, choices_count: int) -> str:
        """Извлечение ответа для ScienceQA"""
        response_upper = response.upper()

        patterns = [
            r'(?:ANSWER|CORRECT|CHOICE|OPTION)[\s:]*([A-Z])',
            r'FINAL[\s\w]*ANSWER[\s:]*([A-Z])',
            r'\[([A-Z])\]',
            r'\(([A-Z])\)',
            r'\b([A-Z])\b(?!.*\b([A-Z])\b)'
        ]

        for pattern in patterns:
            matches = re.findall(pattern, response_upper)
            if matches:
                candidate = matches[-1] if isinstance(matches[-1], str) else matches[-1][-1]
                if ord(candidate) - 65 < choices_count:
                    return candidate

        return None

    def run_comprehensive_evaluation(self, data: List[Dict], experiment_configs: Dict) -> pd.DataFrame:
        """Запуск комплексной оценки с разными конфигурациями"""
        results = []

        for exp_name, config in experiment_configs.items():
            print(f"Running experiment: {exp_name}")

            exp_results = []
            for example in tqdm(data[:100], desc=exp_name):  # Ограничиваем для теста
                result = self.run_inference(example, config.get('generation_config'))
                if result:
                    result['experiment'] = exp_name
                    result['prompt_template'] = config.get('prompt_template', 'scienceqa')
                    exp_results.append(result)

            # Анализ результатов
            if exp_results:
                accuracy = self.calculate_accuracy(exp_results)
                subject_analysis = self.analyze_by_subject(exp_results)

                results.append({
                    'experiment': exp_name,
                    'accuracy': accuracy,
                    'total_samples': len(exp_results),
                    'valid_responses': sum(1 for r in exp_results if r['predicted_answer']),
                    'subject_analysis': subject_analysis,
                    'config': config
                })

        return pd.DataFrame(results)

    def calculate_accuracy(self, results: List[Dict]) -> float:
        """Расчет точности"""
        valid_results = [r for r in results if r['predicted_answer']]
        if not valid_results:
            return 0.0
        return sum(1 for r in valid_results if r['is_correct']) / len(valid_results)

    def analyze_by_subject(self, results: List[Dict]) -> Dict:
        """Анализ точности по предметам"""
        subjects = {}
        for result in results:
            if result['predicted_answer']:
                subject = result['subject']
                if subject not in subjects:
                    subjects[subject] = {'correct': 0, 'total': 0}
                subjects[subject]['total'] += 1
                if result['is_correct']:
                    subjects[subject]['correct'] += 1

        return {subj: data['correct'] / data['total'] if data['total'] > 0 else 0
                for subj, data in subjects.items()}

# Конфигурация экспериментов
def get_experiment_configs():
    return {
        "baseline": {
            "prompt_template": "scienceqa",
            "generation_config": {
                "max_new_tokens": 512,
                "temperature": 0.3,
                "top_p": 0.9,
                "do_sample": True
            }
        },
        "low_temp": {
            "prompt_template": "scienceqa",
            "generation_config": {
                "max_new_tokens": 512,
                "temperature": 0.1,
                "top_p": 0.9,
                "do_sample": True
            }
        },
        "high_temp": {
            "prompt_template": "scienceqa",
            "generation_config": {
                "max_new_tokens": 512,
                "temperature": 0.7,
                "top_p": 0.9,
                "do_sample": True
            }
        },
        "cot_reasoning": {
            "prompt_template": "cot",
            "generation_config": {
                "max_new_tokens": 512,
                "temperature": 0.3,
                "top_p": 0.9,
                "do_sample": True
            }
        },
        "greedy": {
            "prompt_template": "direct",
            "generation_config": {
                "max_new_tokens": 512,
                "do_sample": False,
                "num_beams": 1
            }
        }
    }

experiment = ScienceQAExperiment()

# Загрузка данных
print("Loading ScienceQA data...")
test_data = experiment.load_scienceqa_data(split="validation")  # Начните с validation

# Запуск экспериментов
configs = get_experiment_configs()
results_df = experiment.run_comprehensive_evaluation(test_data, configs)

# Сохранение результатов
results_df.to_csv("scienceqa_experiment_results.csv", index=False)

print("\n=== Experiment Results ===")
print(results_df[['experiment', 'accuracy', 'valid_responses', 'total_samples']])

# Детальный анализ лучшего эксперимента
best_exp = results_df.loc[results_df['accuracy'].idxmax()]
print(f"\nBest experiment: {best_exp['experiment']} (Accuracy: {best_exp['accuracy']:.3f})")
print("Subject-wise performance:", best_exp['subject_analysis'])



Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.18G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Loading ScienceQA data...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-1028f23e353fbe(…):   0%|          | 0.00/377M [00:00<?, ?B/s]

data/validation-00000-of-00001-6c7328ff6(…):   0%|          | 0.00/126M [00:00<?, ?B/s]

data/test-00000-of-00001-f0e719df791966f(…):   0%|          | 0.00/122M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/12726 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4241 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4241 [00:00<?, ? examples/s]

KeyError: 'id'

In [None]:
def evaluate_on_scienceqa(tester, dataset, num_samples=300):
    image_examples = [dataset[i] for i in range(num_samples) if dataset[i]['image'] is not None]
    # print(image_examples)
    results = []
    correct_count = 0

    for example in tqdm(image_examples, desc="Обработка примеров"):
        result = tester.run_inference(example)
        results.append(result)
        if result['is_correct']:
            correct_count += 1

    # Расчет метрик
    accuracy = correct_count / len(results) if results else 0

    return {
        'results': results,
        'accuracy': accuracy,
        'total_tested': len(results),
        'correct_count': correct_count
    }

In [None]:
dataset = load_dataset("derek-thomas/ScienceQA", split="validation"[:400])
tester = LLaVATester()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
eval_results = evaluate_on_scienceqa(tester, dataset)
print(eval_results['accuracy'])

Обработка примеров: 100%|██████████| 154/154 [02:59<00:00,  1.16s/it]


In [None]:
eval_results['accuracy']

0.5194805194805194

# PoseLLM

In [None]:
import torch
from datasets import load_dataset
from PIL import Image
import requests
import json
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import numpy as np
import re
from transformers import (
    CLIPProcessor,
    CLIPModel
)

классификация действий человека по изображениям на классы: "walking", "running", "sitting", "standing", "jumping", "dancing", "lifting", "bending", "crouching", "pointing", "cycling", "swimming", "playing sports", "exercising"

In [None]:
dataset = load_dataset("clip-benchmark/wds_objectnet")

Resolving data files:   0%|          | 0/74 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/73 [00:00<?, ?files/s]

test/0.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/1.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/10.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/11.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/12.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/13.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/14.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/15.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/16.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/17.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/18.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/19.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/2.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/20.tar:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

test/21.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/22.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/23.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/24.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/25.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/26.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/27.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/28.tar:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

test/29.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/3.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/30.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/31.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/32.tar:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

test/33.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

test/34.tar:   0%|          | 0.00/1.00G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
import cv2
from transformers import (
    AutoProcessor,
    AutoModelForCausalLM,
    AutoImageProcessor,
    AutoModel
)
import mediapipe as mp
from typing import Dict, List, Optional

class CLIPPose:
    def __init__(self, model_name="openai/clip-vit-base-patch32", device="cuda:0"):
        """
        Инициализация PoseLLM эксперимента

        PoseLLM - мультимодальная модель для понимания человеческих поз и действий
        """
        self.device = torch.device(device if torch.cuda.is_available() else "cpu")
        # Загрузка процессора и модели
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model = CLIPModel.from_pretrained(
            model_name,
            dtype=torch.float16,
            device_map="auto",
            trust_remote_code=True
        )
        self.actions = [
            "walking", "running", "sitting", "standing", "jumping",
            "dancing", "lifting", "bending", "crouching", "pointing",
            "cycling", "swimming", "playing sports", "exercising"
        ]

    def analyze_with_clip(self, image, question):
        """Правильный анализ с помощью CLIP модели"""
        # Создаем текстовые варианты для классификации действий
        text_descriptions = [f"a person {action}" for action in self.actions]
        text_descriptions.append("no person in the image")

        # Обрабатываем через CLIP
        inputs = self.processor(
            text=text_descriptions,
            images=image,
            return_tensors="pt",
            padding=True
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model(**inputs)

        # Получаем similarity scores между изображением и текстовыми описаниями
        logits_per_image = outputs.logits_per_image  # [1, num_text_descriptions]
        probs = logits_per_image.softmax(dim=1)

        # Находим наиболее вероятное действие
        best_action_idx = probs.argmax().item()
        confidence = probs[0][best_action_idx].item()

        if best_action_idx < len(self.actions):
            predicted_action = self.actions[best_action_idx]
        else:
            predicted_action = "no person"

        # Получаем вероятности для всех действий
        action_probs = {}
        for i, action in enumerate(self.actions):
            action_probs[action] = probs[0][i].item()

        return {
            'predicted_action': predicted_action,
            'confidence': confidence,
            'all_probs': action_probs,
            'method': 'CLIP_zeroshot'
        }


class PoseEvaluationBenchmark:
    def __init__(self):
        """
        Бенчмарк для оценки PoseLLM на различных датасетах
        """
        self.datasets = {
            'human_action': "imagenet-1k",  # для действий человека
            'pose_estimation': "mpii-human-pose",  # для оценки поз
            'sports': "stanford_40_actions",  # спортивные действия
        }

    def load_pose_dataset(self, dataset_name: str, split: str = "validation", num_samples: int = 100):
        """
        Загрузка датасета для оценки поз
        """
        dataset_map = {
            'human_action': "anonymous1/Human-Action-Recognition",
            'pose_estimation': "trpakov/mpii-human-pose",
            'sports': "daveni/sports-actions"
        }

        if dataset_name in dataset_map:
            dataset = load_dataset(dataset_map[dataset_name], split=f"{split}[:{num_samples}]")
            return dataset

        # Fallback - создаем синтетический датасет
        return self.create_synthetic_pose_dataset(num_samples)

    def create_synthetic_pose_dataset(self, num_samples: int = 50):
        """
        Создание синтетического датасета для тестирования
        """
        synthetic_data = []
        actions = ['walking', 'running', 'sitting', 'standing', 'jumping', 'dancing']

        for i in range(num_samples):
            action = actions[i % len(actions)]
            synthetic_data.append({
                'image_id': f'synth_{i}',
                'image': None,  # Заглушка для реального изображения
                'question': f"What action is the person performing?",
                'ground_truth': action,
                'action_category': action
            })

        return synthetic_data

def evaluate_pose_understanding(experiment: PoseLLMExperiment,
                              dataset: List[Dict],
                              task_type: str = "action_recognition") -> Dict:
    """
    Оценка понимания поз на датасете
    """

    results = []
    metrics = {
        'total_tested': 0,
        'correct_actions': 0,
        'pose_detected': 0,
        'confident_responses': 0
    }

    for example in tqdm(dataset, desc="Анализ поз"):
        if example.get('image') is None:
            continue

        # Запускаем анализ
        result = experiment.run_pose_analysis(
            image=example['image'],
            question=example['question']
        )
        if result:
            metrics['total_tested'] += 1
            # Анализ результатов
            if result['has_pose']:
                metrics['pose_detected'] += 1

            # Проверяем правильность распознавания действия
            if task_type == "action_recognition":
                predicted_action = extract_action_from_response(result['answer'])
                ground_truth = example.get('ground_truth', '')

                if is_action_correct(predicted_action, ground_truth):
                    metrics['correct_actions'] += 1

            # Проверяем уверенность ответа
            if is_confident_response(result['answer']):
                metrics['confident_responses'] += 1

            results.append({
                'example_id': example.get('image_id', 'unknown'),
                'result': result,
                'ground_truth': example.get('ground_truth', ''),
                'is_correct': is_action_correct(
                    extract_action_from_response(result['answer']),
                    example.get('ground_truth', '')
                )
            })

    # Расчет финальных метрик
    if metrics['total_tested'] > 0:
        metrics['action_accuracy'] = metrics['correct_actions'] / metrics['total_tested']
        metrics['pose_detection_rate'] = metrics['pose_detected'] / metrics['total_tested']
        metrics['confidence_rate'] = metrics['confident_responses'] / metrics['total_tested']

    return {
        'results': results,
        'metrics': metrics,
        'task_type': task_type
    }

def extract_action_from_response(response: str) -> str:
    """
    Извлечение действия из текстового ответа
    """
    actions = ['walking', 'running', 'sitting', 'standing', 'jumping', 'dancing',
               'lifting', 'bending', 'crouching', 'pointing']

    response_lower = response.lower()
    for action in actions:
        if action in response_lower:
            return action

    return "unknown"

def is_action_correct(predicted: str, ground_truth: str) -> bool:
    """
    Проверка правильности распознанного действия
    """
    return predicted.lower() == ground_truth.lower()

def is_confident_response(response: str) -> bool:
    """
    Проверка уверенности ответа по ключевым словам
    """
    confidence_indicators = ['confident', 'certain', 'definitely', 'clearly', 'obviously']
    return any(indicator in response.lower() for indicator in confidence_indicators)

def visualize_pose_results(evaluation_results: Dict):
    """
    Визуализация результатов анализа поз
    """
    metrics = evaluation_results['metrics']
    results = evaluation_results['results']

    # Создаем графики
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # 1. Overall Metrics
    metric_names = ['Action Accuracy', 'Pose Detection', 'Confidence Rate']
    metric_values = [
        metrics.get('action_accuracy', 0),
        metrics.get('pose_detection_rate', 0),
        metrics.get('confidence_rate', 0)
    ]

    axes[0,0].bar(metric_names, metric_values, color=['skyblue', 'lightcoral', 'lightgreen'])
    axes[0,0].set_title('Overall Performance Metrics')
    axes[0,0].set_ylabel('Score')

    # 2. Action Distribution
    actions = [extract_action_from_response(r['result']['answer']) for r in results]
    action_counts = {action: actions.count(action) for action in set(actions)}

    axes[0,1].pie(action_counts.values(), labels=action_counts.keys(), autopct='%1.1f%%')
    axes[0,1].set_title('Predicted Actions Distribution')

    # 3. Confidence Analysis
    confidence_levels = ['High' if is_confident_response(r['result']['answer']) else 'Low'
                        for r in results]
    conf_counts = {level: confidence_levels.count(level) for level in set(confidence_levels)}

    axes[1,0].bar(conf_counts.keys(), conf_counts.values(), color=['gold', 'silver'])
    axes[1,0].set_title('Response Confidence Levels')
    axes[1,0].set_ylabel('Count')

    # 4. Accuracy by Action Type
    action_accuracies = {}
    for action in set(actions):
        action_results = [r for r in results if extract_action_from_response(r['result']['answer']) == action]
        if action_results:
            accuracy = sum(1 for r in action_results if r['is_correct']) / len(action_results)
            action_accuracies[action] = accuracy

    axes[1,1].barh(list(action_accuracies.keys()), list(action_accuracies.values()))
    axes[1,1].set_title('Accuracy by Action Type')
    axes[1,1].set_xlabel('Accuracy')

    plt.tight_layout()
    plt.show()

    # Вывод метрик
    print(f"\n📊 РЕЗУЛЬТАТЫ ЭКСПЕРИМЕНТА:")
    print(f"Точность распознавания действий: {metrics.get('action_accuracy', 0):.2%}")
    print(f"Детекция поз: {metrics.get('pose_detection_rate', 0):.2%}")
    print(f"Уверенные ответы: {metrics.get('confidence_rate', 0):.2%}")
    print(f"Всего протестировано: {metrics.get('total_tested', 0)} примеров")

def run_complete_pose_experiment():
    """
    Полный пайплайн эксперимента с PoseLLM
    """
    print("🚀 ЗАПУСК ПОЛНОГО ЭКСПЕРИМЕНТА POSEELLM")

    # 1. Инициализация
    experiment = PoseLLMExperiment()
    benchmark = PoseEvaluationBenchmark()

    # 2. Загрузка датасетов
    print("📥 Загрузка датасетов...")
    action_dataset = benchmark.load_pose_dataset('human_action', num_samples=50)
    pose_dataset = benchmark.load_pose_dataset('pose_estimation', num_samples=30)

    # 3. Запуск оценки
    print("🔍 Оценка понимания действий...")
    action_results = evaluate_pose_understanding(
        experiment,
        action_dataset,
        task_type="action_recognition"
    )

    print("🔍 Оценка детекции поз...")
    pose_results = evaluate_pose_understanding(
        experiment,
        pose_dataset,
        task_type="pose_detection"
    )

    # 4. Визуализация результатов
    print("📈 Визуализация результатов...")
    visualize_pose_results(action_results)
    visualize_pose_results(pose_results)

    # 5. Сохранение результатов
    results = {
        'action_recognition': action_results,
        'pose_detection': pose_results,
        'experiment_config': {
            'model': 'microsoft/posellm-7b',
            'datasets': list(benchmark.datasets.keys()),
            'timestamp': str(np.datetime64('now'))
        }
    }

    with open('posellm_experiment_results.json', 'w') as f:
        json.dump(results, f, indent=2)

    print("💾 Результаты сохранены в posellm_experiment_results.json")
    return results


In [None]:
experiment = PoseLLMExperiment()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Downloading model to /usr/local/lib/python3.12/dist-packages/mediapipe/modules/pose_landmark/pose_landmark_heavy.tflite


In [None]:


# Тест на одном примере
test_image = Image.new('RGB', (224, 224), color='blue')  # Заглушка
test_question = "What is the person doing in this image?"

result = experiment.run_pose_analysis(test_image, test_question)
print("🧪 Тестовый результат:", result)

# Полный эксперимент (раскомментируйте для запуска)
# full_results = run_complete_pose_experiment()

AttributeError: 'CLIPModel' object has no attribute 'generate'

# ViT

In [1]:
import torch
import torch.nn as nn
from transformers import ViTForImageClassification, ViTConfig
from torchvision import datasets, transforms
from tqdm import tqdm
from functools import partial

In [2]:
@torch.no_grad()
def quantize_weight_per_channel_absmax(w, n_bits=8):
    """
    Args:
        w: (out_features, in_features)
    """
    scales = torch.max(torch.abs(w), dim=1, keepdim=True).values / (2 ** (n_bits - 1) - 1)

    # Квантование и деквантование
    w_quant = torch.where(w != 0, torch.clamp(torch.round(w / scales), -2**(n_bits-1), 2**(n_bits-1)-1), 0)
    w_dequant = w_quant * scales
    return w_dequant


@torch.no_grad()
def quantize_activation_per_token_absmax(t, n_bits=8):
    """
    Args:
        t: (d0, ..., dn, token_embedding_dim)
    """
    original_shape = t.shape

    t_flat = t.view(-1, t.shape[-1])

    t_dequant = quantize_weight_per_channel_absmax(t_flat, n_bits)

    return t_dequant.view(original_shape)


class W8A8Linear(nn.Module):
    # This class implements fake quantization.
    # It performs all computations in FP16 but simulates the effects of quantization
    def __init__(
        self,
        in_features,
        out_features,
        bias=True,
        act_quant="per_token",
        quantize_output=False,
    ):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        self.register_buffer(
            "weight",
            torch.randn(
                self.out_features,
                self.in_features,
                dtype=torch.float16,
                requires_grad=False,
            ),
        )
        if bias:
            self.register_buffer(
                "bias",
                torch.zeros(
                    (1, self.out_features), dtype=torch.float16, requires_grad=False
                ),
            )
        else:
            self.register_buffer("bias", None)

        if act_quant == "per_token":
            self.act_quant_name = "per_token"
            self.act_quant = partial(quantize_activation_per_token_absmax, n_bits=8)
        else:
            raise ValueError(f"Invalid act_quant: {act_quant}")

        if quantize_output:
            self.output_quant_name = self.act_quant_name
            self.output_quant = self.act_quant
        else:
            self.output_quant_name = "None"
            self.output_quant = lambda x: x

    def to(self, *args, **kwargs):
        super(W8A8Linear, self).to(*args, **kwargs)
        self.weight = self.weight.to(*args, **kwargs)
        if self.bias is not None:
            self.bias = self.bias.to(*args, **kwargs)
        return self

    @torch.no_grad()
    def forward(self, x):
        q_x = self.act_quant(x)
        y = torch.functional.F.linear(q_x, self.weight, self.bias)
        q_y = self.output_quant(y)
        return q_y

    @staticmethod
    def from_float(
        module, weight_quant="per_channel", act_quant="per_token", quantize_output=False
    ):
        assert isinstance(module, torch.nn.Linear)
        new_module = W8A8Linear(
            module.in_features,
            module.out_features,
            module.bias is not None,
            act_quant=act_quant,
            quantize_output=quantize_output,
        )
        if weight_quant == "per_channel":
            new_module.weight = quantize_weight_per_channel_absmax(
                module.weight, n_bits=8
            )  # use 8-bit integer for weight
        else:
            raise ValueError(f"Invalid weight_quant: {weight_quant}")

        new_module.weight_quant_name = weight_quant
        if module.bias is not None:
            new_module.bias = module.bias
        return new_module

    def __repr__(self):
        return f"W8A8Linear({self.in_features}, {self.out_features}, bias={self.bias is not None}, weight_quant={self.weight_quant_name}, act_quant={self.act_quant_name}, output_quant={self.output_quant_name})"

In [3]:
def quantize_vit_like(
    model,
    weight_quant="per_channel",
    act_quant="per_token",
    quantize_ff=True,
    quantize_attention=True
):
    """
    Квантизация ViT модели аналогично BERT примеру
    """
    for name, module in model.named_children():
        if len(list(module.children())) > 0:
            quantize_vit_like(module, weight_quant, act_quant, quantize_ff, quantize_attention)

        if quantize_attention and isinstance(module, nn.Linear) and any(x in name for x in ['query', 'key', 'value', 'attention.output']):
            print(f"Quantizing attention layer: {name}")
            new_layer = W8A8Linear.from_float(
                module,
                weight_quant=weight_quant,
                act_quant=act_quant,
                quantize_output=False
            )
            setattr(model, name, new_layer)

        elif quantize_ff and isinstance(module, nn.Linear) and any(x in name for x in ['intermediate', 'output']):
            print(f"Quantizing FFN layer: {name}")
            new_layer = W8A8Linear.from_float(
                module,
                weight_quant=weight_quant,
                act_quant=act_quant,
                quantize_output=False
            )
            setattr(model, name, new_layer)

    return model

In [4]:
def evaluate_model(model, test_loader, device):
    """Оценка точности модели"""
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in tqdm(test_loader, desc="Evaluating"):
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            _, predicted = torch.max(outputs.logits, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    return accuracy

def measure_memory_usage(model):
    """Измерение использования памяти"""
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    buffer_size = 0
    for buffer in model.buffers():
        buffer_size += buffer.nelement() * buffer.element_size()

    size_all_mb = (param_size + buffer_size) / 1024**2
    return size_all_mb

In [7]:
def setup_cifar10():
    """Подготовка CIFAR-10 датасета"""
    transform = transforms.Compose([
        transforms.Resize((224, 224)),  # ViT ожидает 224x224
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                           std=[0.229, 0.224, 0.225])
    ])

    test_dataset = datasets.CIFAR10(
        root='./data',
        train=False,
        download=True,
        transform=transform
    )

    test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=32,
        shuffle=False
    )

    return test_loader

class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
               'dog', 'frog', 'horse', 'ship', 'truck']

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = ViTForImageClassification.from_pretrained(
    "google/vit-base-patch16-224",
    num_labels=10,  # для CIFAR-10
    ignore_mismatched_sizes=True  # игнорируем несовпадение размеров head
).to(device)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([10]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([10, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
test_loader = setup_cifar10()

print("Evaluating original model...")
original_accuracy = evaluate_model(model, test_loader, device)
original_memory = measure_memory_usage(model)

print(f"Original Model - Accuracy: {original_accuracy:.2f}%, Memory: {original_memory:.2f} MB")

Evaluating original model...


Evaluating: 100%|██████████| 313/313 [02:17<00:00,  2.28it/s]

Original Model - Accuracy: 7.79%, Memory: 327.33 MB





In [9]:
# 3. Применяем квантизацию
print("Applying quantization...")
quantized_model = quantize_vit_like(model)
quantized_model.to(device)

# 4. Замеряем после квантизации
print("Evaluating quantized model...")
quantized_accuracy = evaluate_model(quantized_model, test_loader, device)
quantized_memory = measure_memory_usage(quantized_model)

print(f"Quantized Model - Accuracy: {quantized_accuracy:.2f}%, Memory: {quantized_memory:.2f} MB")

# 5. Сравниваем результаты
print("\n=== RESULTS ===")
print(f"Accuracy drop: {original_accuracy - quantized_accuracy:.2f}%")
print(f"Memory reduction: {original_memory - quantized_memory:.2f} MB ({((original_memory - quantized_memory) / original_memory * 100):.1f}%)")


Applying quantization...
Quantizing attention layer: query
Quantizing attention layer: key
Quantizing attention layer: value
Quantizing attention layer: query
Quantizing attention layer: key
Quantizing attention layer: value
Quantizing attention layer: query
Quantizing attention layer: key
Quantizing attention layer: value
Quantizing attention layer: query
Quantizing attention layer: key
Quantizing attention layer: value
Quantizing attention layer: query
Quantizing attention layer: key
Quantizing attention layer: value
Quantizing attention layer: query
Quantizing attention layer: key
Quantizing attention layer: value
Quantizing attention layer: query
Quantizing attention layer: key
Quantizing attention layer: value
Quantizing attention layer: query
Quantizing attention layer: key
Quantizing attention layer: value
Quantizing attention layer: query
Quantizing attention layer: key
Quantizing attention layer: value
Quantizing attention layer: query
Quantizing attention layer: key
Quantizin

Evaluating: 100%|██████████| 313/313 [02:15<00:00,  2.30it/s]

Quantized Model - Accuracy: 7.76%, Memory: 327.33 MB

=== RESULTS ===
Accuracy drop: 0.03%
Memory reduction: 0.00 MB (0.0%)



