In [5]:
# Install the notebook's runtime dependencies into the kernel's environment.
# NOTE(review): versions are unpinned, so each run resolves to whatever is
# current (the recorded run below replaced torchvision 0.17.2 with 0.21.0);
# pin versions once a working combination is known.
%pip install transformers torch accelerate torchvision

INFO: pip is looking at multiple versions of torchvision to determine which version is compatible with other requirements. This could take a while.
Collecting torchvision
  Obtaining dependency information for torchvision from https://files.pythonhosted.org/packages/29/88/00c69db213ee2443ada8886ec60789b227e06bb869d85ee324578221a7f7/torchvision-0.21.0-cp311-cp311-macosx_11_0_arm64.whl.metadata
  Downloading torchvision-0.21.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.1 kB)
Downloading torchvision-0.21.0-cp311-cp311-macosx_11_0_arm64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: torchvision
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.17.2
    Uninstalling torchvision-0.17.2:
      Successfully uninstalled torchvision-0.17.2
Successfully installed torchvision-0.21.0

[1m[[0m[34;49mnotice[0m[1;

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoConfig

class ConversationMemory:
    """Ordered transcript of ``(speaker, text)`` turns for one chat session."""

    def __init__(self):
        # Turns are stored oldest-first as (speaker, text) tuples.
        self.conversation = list()

    def add_message(self, speaker: str, text: str):
        """Append a single turn to the end of the transcript."""
        turn = (speaker, text)
        self.conversation.append(turn)

    def get_context(self) -> str:
        """Render every stored turn as ``speaker: text``, one turn per line."""
        rendered_turns = []
        for speaker, text in self.conversation:
            rendered_turns.append(f"{speaker}: {text}")
        return "\n".join(rendered_turns)

    def clear_memory(self):
        """Drop all turns by rebinding the transcript to a fresh empty list."""
        self.conversation = list()

def generate_response(model, tokenizer, prompt, max_new_tokens=128, device='cpu'):
    """Sample a continuation of ``prompt`` and return the decoded text.

    Args:
        model: Object exposing ``generate(input_ids, **kwargs)``.
        tokenizer: Object exposing ``encode``, ``decode`` and ``eos_token_id``.
        prompt: Text to encode and feed to the model.
        max_new_tokens: Upper bound on generated tokens, forwarded to ``generate``.
        device: Device the encoded prompt is moved to before generation.

    Returns:
        The decoded first sequence returned by ``model.generate`` with special
        tokens skipped. Whatever ``generate`` returns is decoded as-is, so the
        prompt is not stripped here.
    """
    encoded_prompt = tokenizer.encode(prompt, return_tensors='pt')
    encoded_prompt = encoded_prompt.to(device)

    # Inference only: no gradients are needed while sampling.
    with torch.no_grad():
        sampling_options = dict(
            max_new_tokens=max_new_tokens,
            do_sample=True,
            top_p=0.95,
            top_k=50,
            temperature=0.7,
            # The EOS id doubles as the padding id for generation.
            pad_token_id=tokenizer.eos_token_id,
        )
        generated = model.generate(encoded_prompt, **sampling_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def _extract_bot_reply(full_response, prompt):
    """Return only the assistant's newest turn from a decoded generation.

    ``full_response`` is the decoded output of ``generate_response`` and
    normally starts with ``prompt``. The prompt is removed, and the reply is cut
    at the first ``User:``/``Bot:`` line the model may have invented after its
    own answer, so made-up follow-up turns never reach the transcript.
    """
    if full_response.startswith(prompt):
        reply = full_response[len(prompt):]
    else:
        # Decoding did not reproduce the prompt verbatim; skip as many "Bot:"
        # markers as the prompt contained and keep everything after them.
        reply = full_response.split("Bot:", prompt.count("Bot:"))[-1]

    cut = len(reply)
    for marker in ("\nUser:", "\nBot:"):
        position = reply.find(marker)
        if position != -1:
            cut = min(cut, position)
    return reply[:cut].strip()


def main(model_name="deepseek-ai/DeepSeek-V3", device=None):
    """Run an interactive console chat against a causal language model.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        device: Device string for the model and inputs. When ``None`` the
            device is chosen automatically: ``"mps"`` if the MPS backend is
            available, otherwise ``"cpu"``.

    The loop reads user input until the user types ``quit`` (any case) or the
    input stream ends / is interrupted. Blank input is ignored.

    NOTE(review): the saved run of this notebook printed the device line and
    then stopped with ``RuntimeError: No GPU found. A GPU is needed for FP8
    quantization.`` before the chat banner appeared. Presumably the default
    checkpoint cannot be loaded without a supported GPU; pass a different
    ``model_name`` on such machines -- TODO confirm.
    """
    # Decide which device to use (MPS if available, else CPU) unless the
    # caller forced one explicitly.
    if device is None:
        device = "mps" if torch.backends.mps.is_available() else "cpu"
    print(f"Using device: {device}")

    # 1) Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # 2) Load model on CPU first to avoid quantization errors
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        quantization_config=QuantoConfig(weights="int4")
    )

    # 3) Move model to the selected device
    model = model.to(device)

    memory = ConversationMemory()

    print("Deepseek Chat. Type 'quit' to exit.\n")
    while True:
        try:
            user_input = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted prompt ends the session cleanly.
            print("Bot: Goodbye!")
            break

        if not user_input:
            # Nothing to answer; do not pollute the transcript with empty turns.
            continue
        if user_input.lower() == "quit":
            print("Bot: Goodbye!")
            break

        memory.add_message("User", user_input)

        conversation_context = memory.get_context()
        prompt = (
            "The following is a conversation between a user and an AI assistant. "
            "The AI assistant is helpful, polite, and knowledgeable.\n\n"
            f"{conversation_context}\nBot:"
        )

        full_response = generate_response(model, tokenizer, prompt, device=device)
        # Keep just the bot's new text, without the prompt or invented turns.
        bot_response = _extract_bot_reply(full_response, prompt)
        memory.add_message("Bot", bot_response)

        print(f"Bot: {bot_response}")

In [4]:
# Start the interactive chat; this blocks on input() until the user types 'quit'.
main()

Using device: cpu


RuntimeError: No GPU found. A GPU is needed for FP8 quantization.