In [1]:
import gradio as gr
import soundfile
import json
import os
from datetime import datetime
from transformers import AutoProcessor, AutoModelForCausalLM, GenerationConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class SpeechTranslatorApp:
    def __init__(self):
        model_path = 'Phi-4-multimodal-instruct'
        
        # Initialize models
        self.processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_path,
            trust_remote_code=True,
            torch_dtype='auto',
            _attn_implementation='flash_attention_2',
        ).cuda()
        
        self.generation_config = GenerationConfig.from_pretrained(model_path, 'generation_config.json')

        # Define prompts
        self.user_prompt = '<|user|>'
        self.assistant_prompt = '<|assistant|>'
        self.prompt_suffix = '<|end|>'
        self.system_prompt = '<|system|>'
        
        # Define supported languages with their codes and native names
        self.languages = {
            "English": {
                "code": "en",
                "native": "English"
            },
            "Chinese": {
                "code": "zh",
                "native": "中文"
            },
            "German": {
                "code": "de",
                "native": "Deutsch"
            },
            "French": {
                "code": "fr",
                "native": "Français"
            },
            "Italian": {
                "code": "it",
                "native": "Italiano"
            },
            "Japanese": {
                "code": "ja",
                "native": "日本語"
            },
            "Spanish": {
                "code": "es",
                "native": "Español"
            },
            "Portuguese": {
                "code": "pt",
                "native": "Português"
            }
        }
        
        # Initialize storage
        self.translations_dir = "translations"
        os.makedirs(self.translations_dir, exist_ok=True)
        self.translations = self.load_translations()
        
    def get_translation_file_path(self, lang_code):
        """Get path for language-specific translation file"""
        return os.path.join(self.translations_dir, f"translations_{lang_code}.json")

    def load_translations(self):
        """Load translations for all languages"""
        translations = {}
        for lang_info in self.languages.values():
            file_path = self.get_translation_file_path(lang_info["code"])
            if os.path.exists(file_path):
                with open(file_path, 'r', encoding='utf-8') as f:
                    translations[lang_info["code"]] = json.load(f)
            else:
                translations[lang_info["code"]] = []
        return translations

    def save_translation(self, lang_code, translation):
        """Save translation for specific language"""
        file_path = self.get_translation_file_path(lang_code)
        with open(file_path, 'w', encoding='utf-8') as f:
            json.dump(translation, f, ensure_ascii=False, indent=2)

    def transcribe_audio(self, audio_input, source_lang="English"):
        """Transcribe audio to text"""
        speech_prompt = f"Transcribe this {source_lang} audio to text."
        
        prompt = f'{self.user_prompt}<|audio_1|>{speech_prompt}{self.prompt_suffix}{self.assistant_prompt}'
        audio = soundfile.read(audio_input)
        
        inputs = self.processor(text=prompt, audios=[audio], return_tensors='pt').to('cuda')
        
        generate_ids = self.model.generate(
            **inputs,
            max_new_tokens=2000,
            generation_config=self.generation_config,
        )
        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
        
        transcription = self.processor.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        
        return transcription.strip()

    def translate_text(self, text, source_lang, target_lang):
        """Translate text between languages"""
        if not text:
            return "No text to translate"
        
        translation_prompt = f"Translate the following {source_lang} text to {target_lang}. Provide only the translation without any additional text or explanation:"
        
        prompt = f'{self.system_prompt}You are a professional translator.{self.prompt_suffix}{self.user_prompt}{translation_prompt}\n\n{text}{self.prompt_suffix}{self.assistant_prompt}'

        inputs = self.processor(prompt, images=None, return_tensors='pt').to('cuda')

        try:
            generate_ids = self.model.generate(
                **inputs,
                max_new_tokens=2000,
                generation_config=self.generation_config,
            )
            generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
            translation = self.processor.batch_decode(
                generate_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False
            )[0]
            return translation.strip()
        except Exception as e:
            print(f"Translation error: {str(e)}")
            return f"Translation failed: {str(e)}"

    def process_translation(self, audio, source_lang, target_lang):
        """Process audio input and generate translation"""
        # Transcribe audio to text
        source_text = self.transcribe_audio(audio, source_lang)
        
        # Translate to target language
        translation = self.translate_text(source_text, source_lang, target_lang)
        
        # Create translation entry
        translation_entry = {
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            "source_language": source_lang,
            "target_language": target_lang,
            "source_text": source_text,
            "translated_text": translation
        }
        
        # Save translation
        source_code = self.languages[source_lang]["code"]
        target_code = self.languages[target_lang]["code"]
        
        if source_code not in self.translations:
            self.translations[source_code] = []
        if target_code not in self.translations:
            self.translations[target_code] = []
            
        self.translations[source_code].append(translation_entry)
        self.translations[target_code].append(translation_entry)
        
        self.save_translation(source_code, self.translations[source_code])
        self.save_translation(target_code, self.translations[target_code])
        
        return self.format_translation_display(translation_entry)

    def format_translation_display(self, entry):
        """Format translation for display"""
        output = f"""Timestamp: {entry['timestamp']}\n\n"""
        output += f"""Source Language ({entry['source_language']}):\n{entry['source_text']}\n\n"""
        output += f"""Target Language ({entry['target_language']}):\n{entry['translated_text']}\n"""
        return output

    def list_translations(self, lang_code):
        """List translations for specific language"""
        if lang_code not in self.translations or not self.translations[lang_code]:
            return "No translations found"
        
        return "\n\n---\n\n".join([
            self.format_translation_display(entry) 
            for entry in self.translations[lang_code]
        ])

    def create_interface(self):
        """Create Gradio interface"""
        with gr.Blocks(theme=gr.themes.Soft()) as interface:
            gr.Markdown("# Multilingual Speech Translation Hub")
            gr.Markdown("Record speech or upload audio file for translation between multiple languages")
            
            with gr.Row():
                source_lang = gr.Dropdown(
                    choices=list(self.languages.keys()),
                    value="English",
                    label="Source Language"
                )
                target_lang = gr.Dropdown(
                    choices=list(self.languages.keys()),
                    value="Chinese",
                    label="Target Language"
                )
            
            with gr.Row():
                audio_input = gr.Audio(
                    sources=["microphone", "upload"],
                    type="filepath",
                    label="Record or Upload Audio"
                )
            
            with gr.Row():
                translate_btn = gr.Button("Translate")
            
            with gr.Row():
                output = gr.Textbox(
                    label="Translation Results",
                    lines=10
                )
            
            # History viewer
            with gr.Accordion("Translation History", open=False):
                lang_select = gr.Dropdown(
                    choices=list(self.languages.keys()),
                    value="English",
                    label="Select Language"
                )
                history_output = gr.Textbox(
                    label="Translation History",
                    lines=20
                )
            
            # Event handlers
            translate_btn.click(
                fn=self.process_translation,
                inputs=[audio_input, source_lang, target_lang],
                outputs=output
            )
            
            lang_select.change(
                fn=lambda x: self.list_translations(self.languages[x]["code"]),
                inputs=[lang_select],
                outputs=history_output
            )
            
            return interface

In [4]:
def run_app():
    # Create app instance
    app = SpeechTranslatorApp()
    
    # Launch Gradio interface
    interface = app.create_interface()
    interface.launch(
        share=True,
        server_name="0.0.0.0"
    )

In [8]:
run_app()

ImportError: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.

In [1]:
import torch
print("CUDA disponível?", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "Nenhuma GPU detectada.")


CUDA disponível? True
GPU: NVIDIA GeForce RTX 2070 SUPER


In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="deepseek-ai/deepseek-llm-7b-base", trust_remote_code=True)
pipe(messages)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
import requests

url = "http://localhost:1234/v1/chat/completions"

headers = {
    "Content-Type": "application/json"
}

payload = {
    "model": "deepseek-r1-distill-qwen-7b",
    "messages": [
        {
            "role": "system",
            "content": "Organize the content, in any way possible based on the subject, add content if possible, give ideas, insights, and suggestions"
        }
    ],
    "temperature": 0.7,
    "max_tokens": -1,
    "stream": False
}

response = requests.post(url, headers=headers, json=payload)

print("Status Code:", response.status_code)
print("Response:")
print(response.json())

Status Code: 200
Response:
{'id': 'chatcmpl-nk0pcrsda5986916rocs9b', 'object': 'chat.completion', 'created': 1746757836, 'model': 'deepseek-r1-distill-qwen-7b', 'choices': [{'index': 0, 'logprobs': None, 'finish_reason': 'stop', 'message': {'role': 'assistant', 'content': '<think>\nOkay, so I need to figure out how to organize this text about sustainable living. The user has given a pretty detailed response with various sections like introduction, reducing waste, energy efficiency, water conservation, carbon footprint, mental health, and future trends.\n\nHmm, first, I should probably read through the existing content to understand what\'s covered and where there might be gaps or opportunities for expansion. Let me start by skimming each section.\n\nThe introduction is good; it sets up sustainable living as a way to live in harmony with nature. It mentions reducing waste, conserving energy, water, and protecting the environment. That\'s solid. Maybe I can add more about why sustainabil

: 

In [1]:
# Use a pipeline as a high-level helper
from transformers import pipeline

messages = [
    {"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="deepseek-ai/DeepSeek-R1-Distill-Qwen-14B", trust_remote_code=True)
pipe(messages)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 4 files: 100%|██████████| 4/4 [22:44<00:00, 341.19s/it]   
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.
Loading checkpoint shards:  25%|██▌       | 1/4 [02:01<06:02, 120.92s/it]

: 

In [11]:
import accelerate
print(accelerate.__version__)

1.6.0
