In [None]:
# vLLM基本機能テスト - 修正版
import torch
from vllm import LLM, SamplingParams
from transformers import AutoConfig

def check_model_config(model_name):
    """Print the length-related fields of a model's Hugging Face config.

    Args:
        model_name: Identifier passed to ``AutoConfig.from_pretrained``.

    Returns:
        The config's ``max_position_embeddings`` value, or ``None`` when the
        attribute is missing or the config could not be loaded (the error is
        printed, not raised).
    """
    try:
        config = AutoConfig.from_pretrained(model_name)
        max_pos = getattr(config, 'max_position_embeddings', None)
        # `config.max_length` is what is read here, so it is printed under its
        # own name; the previous label "model_max_length" named a different
        # (tokenizer-side) setting and made the value look like a context limit.
        gen_max_len = getattr(config, 'max_length', None)
        print(f"Model: {model_name}")
        print(f"  max_position_embeddings: {max_pos}")
        print(f"  max_length: {gen_max_len}")
        return max_pos
    except Exception as e:
        # Best-effort probe: report the problem and let the caller carry on.
        print(f"設定確認エラー: {e}")
        return None

def test_vllm_with_model(model_name, max_len=512, gpu_util=0.7):
    """指定されたモデルでvLLMテスト"""
    print(f"\n=== Testing {model_name} ===")
    
    # モデル設定確認
    max_pos = check_model_config(model_name)
    if max_pos and max_len > max_pos:
        max_len = min(max_len, max_pos)
        print(f"max_model_lenを{max_len}に調整")
    
    try:
        # vLLMモデル初期化
        llm = LLM(
            model=model_name,
            trust_remote_code=True,
            max_model_len=max_len,
            max_num_seqs=1,
            gpu_memory_utilization=gpu_util,  # GPU使用率を下げる
            enforce_eager=True,  # メモリ効率を改善
        )
        print(f"✅ {model_name} ロード成功")
        
        # テキスト生成テスト
        prompts = ["Hello, how are you?"]
        sampling_params = SamplingParams(
            temperature=0.2,
            max_tokens=32,
            top_p=0.9
        )
        
        outputs = llm.generate(prompts, sampling_params)
        result = outputs[0].outputs[0].text
        print(f"✅ テキスト生成成功: {result.strip()}")
        
        # メモリクリーンアップ
        del llm
        torch.cuda.empty_cache()
        
        return True
        
    except Exception as e:
        print(f"❌ {model_name} テスト失敗: {e}")
        # メモリクリーンアップ
        torch.cuda.empty_cache()
        return False

# Report the total memory of CUDA device 0, when a CUDA device is present
if not torch.cuda.is_available():
    print("CUDA not available")
else:
    gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU Memory: {gpu_memory:.1f} GB")

# Staged test plan; each entry is (model name, max_model_len, gpu_memory_utilization)
models_to_test = [
    # ("gpt2", 512, 0.6),  # lightest
    # ("microsoft/DialoGPT-small", 512, 0.7),  # small
    ("microsoft/DialoGPT-medium", 1024, 0.7),  # medium (revised)
]

print("=== vLLM段階的テスト開始 ===")
success_count = 0

# Run every planned model; a failure is reported and the loop moves on
for model_name, max_len, gpu_util in models_to_test:
    if not test_vllm_with_model(model_name, max_len, gpu_util):
        print(f"⚠️ {model_name}をスキップして次のモデルをテスト")
        continue
    success_count += 1

print(f"\n=== テスト結果: {success_count}/{len(models_to_test)} 成功 ===")

GPU Memory: 12.0 GB
=== vLLM段階的テスト開始 ===

=== Testing microsoft/DialoGPT-medium ===
Model: microsoft/DialoGPT-medium
  max_position_embeddings: 1024
  model_max_length: 20
INFO 07-23 17:38:15 [config.py:841] This model supports multiple tasks: {'reward', 'embed', 'generate', 'classify'}. Defaulting to 'generate'.
ERROR 07-23 17:38:16 [config.py:130] Error retrieving safetensors: 'microsoft/DialoGPT-medium' is not a safetensors repo. Couldn't find 'model.safetensors.index.json' or 'model.safetensors' files., retrying 1 of 2
ERROR 07-23 17:38:18 [config.py:128] Error retrieving safetensors: 'microsoft/DialoGPT-medium' is not a safetensors repo. Couldn't find 'model.safetensors.index.json' or 'model.safetensors' files.
INFO 07-23 17:38:18 [config.py:3368] Downcasting torch.float32 to torch.bfloat16.
INFO 07-23 17:38:18 [config.py:1472] Using max model len 1024
INFO 07-23 17:38:18 [config.py:2285] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 07-23 17:38:23 [__init__.py

Loading pt checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:02<00:00,  2.02s/it]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:02<00:00,  2.02s/it]



INFO 07-23 17:39:00 [default_loader.py:272] Loading weights took 2.03 seconds
INFO 07-23 17:39:00 [gpu_model_runner.py:1801] Model loading took 0.6611 GiB and 33.998652 seconds
INFO 07-23 17:39:01 [gpu_worker.py:232] Available KV cache memory: 7.48 GiB
INFO 07-23 17:39:02 [kv_cache_utils.py:716] GPU KV cache size: 81,696 tokens
INFO 07-23 17:39:02 [kv_cache_utils.py:720] Maximum concurrency for 1,024 tokens per request: 79.78x
INFO 07-23 17:39:02 [core.py:172] init engine (profile, create kv cache, warmup model) took 1.79 seconds
✅ microsoft/DialoGPT-medium ロード成功


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

✅ テキスト生成成功: 





=== テスト結果: 1/1 成功 ===


In [None]:
# Hugging Face モデルキャッシュテスト
from transformers import AutoModel, AutoTokenizer
from huggingface_hub import hf_hub_download
import torch

def test_model_cache():
    """Smoke-test the Hugging Face download cache with ``bert-base-uncased``.

    Loads the model and tokenizer, prints the local path that
    ``hf_hub_download`` returns for the repo's ``config.json``, and runs one
    forward pass on a short sentence.

    Returns:
        ``True`` when every step completes, ``False`` if any step raises
        (the exception message is printed).
    """
    try:
        print("=== Hugging Face モデルキャッシュテスト ===")

        # A small model keeps the test light
        model_name = "bert-base-uncased"
        print(f"モデル '{model_name}' をダウンロード中...")

        # from_pretrained loads (and caches) the weights and tokenizer files
        bert = AutoModel.from_pretrained(model_name)
        tok = AutoTokenizer.from_pretrained(model_name)

        # Show where config.json lives locally
        print(f"✅ モデルキャッシュパス: {hf_hub_download(repo_id=model_name, filename='config.json')}")

        # One forward pass with gradients disabled
        encoded = tok("Hello, this is a test.", return_tensors="pt")
        with torch.no_grad():
            forward = bert(**encoded)
        hidden_shape = forward.last_hidden_state.shape
        print(f"✅ 推論テスト成功: 出力形状 {hidden_shape}")

        # Release the objects, then the CUDA cache if a GPU is present
        del bert, tok
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return True

    except Exception as err:
        print(f"❌ モデルキャッシュテスト失敗: {err}")
        return False

# Run the test; as the last expression in the cell, the returned bool is what the cell evaluates to
test_model_cache()

In [None]:
from vllm.assets.image import ImageAsset
from vllm import LLM, SamplingParams
from transformers import AutoProcessor
from PIL import Image
import requests

# --- ImageAsset smoke test ---------------------------------------------------
print("Testing ImageAsset...")
image_asset = ImageAsset("cherry_blossom")
print(f"Image loaded successfully: {image_asset.pil_image.size}")

# --- Configuration -----------------------------------------------------------
model_id = "microsoft/Phi-3.5-vision-instruct"
MAX_MODEL_LEN = 2048   # kept small to limit memory use
NUM_CROPS = 4          # model card: num_crops=4 for multi-frame, 16 for single-frame
# The original loop fetched 19 slides, which cannot fit in a 2048-token context.
# NOTE(review): each image is assumed to cost several hundred prompt tokens at
# num_crops=4 — confirm the real budget, and raise MAX_MODEL_LEN together with
# NUM_SLIDES if memory allows.
NUM_SLIDES = 2
MAX_NEW_TOKENS = 256   # leaves room for the image tokens inside MAX_MODEL_LEN
REQUEST_TIMEOUT_S = 30

# --- Load the model ----------------------------------------------------------
print("Loading model...")
llm = None
try:
    llm = LLM(
        model=model_id,
        trust_remote_code=True,
        max_model_len=MAX_MODEL_LEN,
        max_num_seqs=1,  # one sequence at a time
        limit_mm_per_prompt={"image": NUM_SLIDES},  # allow several images per prompt
        mm_processor_kwargs={"num_crops": NUM_CROPS},
    )
    print("Model loaded successfully")
except Exception as e:
    # Only genuine load failures are reported under this message now.
    print(f"Model loading failed: {e}")

# --- Multi-image generation --------------------------------------------------
if llm is not None:
    try:
        images = []
        placeholder = ""

        # Download the slides and build the matching <|image_N|> placeholders.
        for i in range(1, NUM_SLIDES + 1):
            url = f"https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-{i}-2048.jpg"
            reply = requests.get(url, stream=True, timeout=REQUEST_TIMEOUT_S)
            reply.raise_for_status()  # fail on HTTP errors instead of decoding an error page
            # convert() forces the image to be read now, while the stream is open
            images.append(Image.open(reply.raw).convert("RGB"))
            placeholder += f"<|image_{i}|>\n"

        messages = [
            {"role": "user", "content": placeholder + "Summarize the deck of slides."},
        ]

        # Render the chat template with the tokenizer owned by the vLLM engine.
        prompt = llm.get_tokenizer().apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # temperature=0.0 replaces the former do_sample=False setting.
        sampling_params = SamplingParams(temperature=0.0, max_tokens=MAX_NEW_TOKENS)

        # The previous code called `model.generate(...)` on an undefined `model`
        # (Hugging Face API); with vLLM the images travel with the prompt and
        # only the completion text is returned.
        outputs = llm.generate(
            {"prompt": prompt, "multi_modal_data": {"image": images}},
            sampling_params,
        )
        response = outputs[0].outputs[0].text

        print(response)

    except Exception as e:
        print(f"Generation failed: {e}")



Testing ImageAsset...
Image loaded successfully: (1770, 1180)
Loading model...
INFO 07-23 17:21:43 [config.py:240] Replacing legacy 'type' key with 'rope_type'
INFO 07-23 17:21:43 [config.py:841] This model supports multiple tasks: {'generate', 'embed', 'classify', 'reward'}. Defaulting to 'generate'.
INFO 07-23 17:21:43 [config.py:1472] Using max model len 2048
INFO 07-23 17:21:43 [config.py:2285] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 07-23 17:21:51 [__init__.py:244] Automatically detected platform cuda.
INFO 07-23 17:21:54 [core.py:526] Waiting for init message from front-end.
INFO 07-23 17:21:54 [core.py:69] Initializing a V1 LLM engine (v0.9.2) with config: model='microsoft/Phi-3.5-vision-instruct', speculative_config=None, tokenizer='microsoft/Phi-3.5-vision-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config={}, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=2048, download

Process EngineCore_0:
Traceback (most recent call last):
  File "/home/lius/miniconda3/envs/agent_ragenv/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/lius/miniconda3/envs/agent_ragenv/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/lius/miniconda3/envs/agent_ragenv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 590, in run_engine_core
    raise e
  File "/home/lius/miniconda3/envs/agent_ragenv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 577, in run_engine_core
    engine_core = EngineCoreProc(*args, **kwargs)
  File "/home/lius/miniconda3/envs/agent_ragenv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 404, in __init__
    super().__init__(vllm_config, executor_class, log_stats,
  File "/home/lius/miniconda3/envs/agent_ragenv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 75, in __init__
    self.model_executor =

Model loading failed: Engine core initialization failed. See root cause above. Failed core proc(s): {}
Trying alternative approach...
Testing basic vLLM functionality...
INFO 07-23 17:22:03 [config.py:841] This model supports multiple tasks: {'generate', 'embed', 'classify', 'reward'}. Defaulting to 'generate'.
ERROR 07-23 17:22:03 [config.py:130] Error retrieving safetensors: 'microsoft/DialoGPT-medium' is not a safetensors repo. Couldn't find 'model.safetensors.index.json' or 'model.safetensors' files., retrying 1 of 2
ERROR 07-23 17:22:05 [config.py:128] Error retrieving safetensors: 'microsoft/DialoGPT-medium' is not a safetensors repo. Couldn't find 'model.safetensors.index.json' or 'model.safetensors' files.
INFO 07-23 17:22:05 [config.py:3368] Downcasting torch.float32 to torch.bfloat16.
INFO 07-23 17:22:05 [config.py:1472] Using max model len 512
INFO 07-23 17:22:05 [config.py:2285] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 07-23 17:22:10 [__init__.py:24

Process EngineCore_0:
Traceback (most recent call last):
  File "/home/lius/miniconda3/envs/agent_ragenv/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/lius/miniconda3/envs/agent_ragenv/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/lius/miniconda3/envs/agent_ragenv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 590, in run_engine_core
    raise e
  File "/home/lius/miniconda3/envs/agent_ragenv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 577, in run_engine_core
    engine_core = EngineCoreProc(*args, **kwargs)
  File "/home/lius/miniconda3/envs/agent_ragenv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 404, in __init__
    super().__init__(vllm_config, executor_class, log_stats,
  File "/home/lius/miniconda3/envs/agent_ragenv/lib/python3.10/site-packages/vllm/v1/engine/core.py", line 75, in __init__
    self.model_executor =

Text model also failed: Engine core initialization failed. See root cause above. Failed core proc(s): {}
vLLM may require GPU or specific model configurations
