In [1]:
from pyannote.audio import Pipeline
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token="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")

  from .autonotebook import tqdm as notebook_tqdm
  if ismodule(module) and hasattr(module, '__file__'):
  if ismodule(module) and hasattr(module, '__file__'):


In [2]:
import torch
import torchaudio
from pydub.utils import mediainfo
import subprocess
import tempfile
import os

# Step 1: Define the helper
def ensure_wav_format(input_path: str, sample_rate: int = 16000, channels: int = 1) -> str:
    try:
        info = mediainfo(input_path)
        if info.get('format_name') == 'wav' and info.get('codec_name') == 'pcm_s16le':
            return input_path
    except Exception:
        pass  # If mediainfo fails, fallback to ffmpeg

    fd, output_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    command = [
        "ffmpeg", "-y", "-i", input_path,
        "-ar", str(sample_rate),
        "-ac", str(channels),
        "-c:a", "pcm_s16le",
        output_path
    ]
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if result.returncode != 0:
        raise RuntimeError(f"FFmpeg failed:\n{result.stderr.decode()}")
    return output_path

# Step 2: Use the helper in your pipeline
pipeline.to(torch.device("cuda"))

original_audio_path = "3.mp3"  # or mp3, amr, etc.
cleaned_audio_path = ensure_wav_format(original_audio_path)

audio, sample_rate = torchaudio.load(cleaned_audio_path)

diarization = pipeline({"waveform": audio, "sample_rate": sample_rate}, num_speakers=2)

segments = []
for turn, _, speaker in diarization.itertracks(yield_label=True):
    segement_part = {
        "start": turn.start,
        "end": turn.end,
        "speaker": speaker
    }
    segments.append(segement_part)

# Optional: delete temp file if it was converted
if cleaned_audio_path != original_audio_path:
    os.remove(cleaned_audio_path)


It can be re-enabled by calling
   >>> import torch
   >>> torch.backends.cuda.matmul.allow_tf32 = True
   >>> torch.backends.cudnn.allow_tf32 = True
See https://github.com/pyannote/pyannote-audio/issues/1370 for more details.

  std = sequences.std(dim=-1, correction=1)
  std = sequences.std(dim=-1, correction=1)


In [3]:
from pydub import AudioSegment
import torch
import torchaudio
import io
import os

# Step 1: Ensure it's a proper WAV file (use the helper from previous cell)
audio_path = "3.mp3"
cleaned_audio_path = ensure_wav_format(audio_path)

# Step 2: Load using AudioSegment
full_audio = AudioSegment.from_wav(cleaned_audio_path)

# Step 3: Process diarized segments
chunk_tensors = []

for i, seg in enumerate(segments):
    start_ms = int(seg["start"] * 1000)
    end_ms = int(seg["end"] * 1000)
    speaker = seg["speaker"]

    # Slice the chunk
    audio_chunk = full_audio[start_ms:end_ms]

    # Export to in-memory buffer
    buffer = io.BytesIO()
    audio_chunk.export(buffer, format="wav")
    buffer.seek(0)

    # Load waveform from buffer
    waveform, sr = torchaudio.load(buffer)

    if sr != 16000:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)

    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)

    chunk_tensors.append({
        "waveform": waveform,
        "sampling_rate": 16000,
        "speaker": speaker,
        "index": i
    })

# Step 4 (Optional): Clean up temp if converted
if cleaned_audio_path != audio_path:
    os.remove(cleaned_audio_path)


In [4]:
from transformers import SeamlessM4Tv2ForSpeechToText, SeamlessM4TTokenizer, SeamlessM4TFeatureExtractor

access_token ="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"

model = SeamlessM4Tv2ForSpeechToText.from_pretrained("ai4bharat/indic-seamless", token=access_token).to("cuda")
processor = SeamlessM4TFeatureExtractor.from_pretrained("ai4bharat/indic-seamless", token=access_token)
tokenizer = SeamlessM4TTokenizer.from_pretrained("ai4bharat/indic-seamless", token=access_token)


Instantiating a decoder SeamlessM4Tv2Attention without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.62it/s]



In [5]:
transcriptions = []

for chunk in chunk_tensors:
    waveform = chunk["waveform"]
    speaker = chunk["speaker"]
    index = chunk["index"]

    waveform = waveform.squeeze(0).cpu()

    # Skip too-short waveforms
    if waveform.numel() < 3200:  # Less than 0.2 seconds at 16kHz
        print(f"Skipping chunk {index} from {speaker}: too short ({waveform.numel()} samples)")
        continue

    # Prepare model inputs
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt").to("cuda")

    with torch.no_grad():
        generated_tokens = model.generate(**inputs, tgt_lang="hin")[0].cpu().numpy().squeeze()

    text = tokenizer.decode(generated_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    transcriptions.append({
        "speaker": speaker,
        "index": index,
        "text": text
    })

Skipping chunk 12 from SPEAKER_01: too short (1632 samples)
Skipping chunk 36 from SPEAKER_01: too short (256 samples)
Skipping chunk 36 from SPEAKER_01: too short (256 samples)
Skipping chunk 50 from SPEAKER_01: too short (2160 samples)
Skipping chunk 51 from SPEAKER_01: too short (544 samples)
Skipping chunk 50 from SPEAKER_01: too short (2160 samples)
Skipping chunk 51 from SPEAKER_01: too short (544 samples)
Skipping chunk 103 from SPEAKER_00: too short (1616 samples)
Skipping chunk 103 from SPEAKER_00: too short (1616 samples)
Skipping chunk 108 from SPEAKER_01: too short (544 samples)
Skipping chunk 108 from SPEAKER_01: too short (544 samples)
Skipping chunk 124 from SPEAKER_00: too short (2704 samples)
Skipping chunk 124 from SPEAKER_00: too short (2704 samples)
Skipping chunk 148 from SPEAKER_01: too short (544 samples)
Skipping chunk 148 from SPEAKER_01: too short (544 samples)
Skipping chunk 195 from SPEAKER_01: too short (2432 samples)
Skipping chunk 195 from SPEAKER_01: too

In [6]:
transcriptions = sorted(transcriptions, key=lambda x: x["index"])

for t in transcriptions:
    print(f"[{t['speaker']}]: {t['text']}")

[SPEAKER_01]: और
[SPEAKER_01]: सर, श्री कृष्ण दहित कुमार, ऑनलाइन दे दो।
[SPEAKER_01]: ऑनलाइन ले दो? ले लो आप ऑनलाइन ले लो
[SPEAKER_00]: तो, यह क्या है
[SPEAKER_01]: बताओ नंबर 8 के
[SPEAKER_01]: एक मिनट
[SPEAKER_01]: ओपन कर लिया है किसी भी डेस्क
[SPEAKER_01]: हम्म
[SPEAKER_01]: धन्यवाद
[SPEAKER_01]: यह बहुत महत्वपूर्ण है
[SPEAKER_01]: क्रिया हमार है
[SPEAKER_01]: हाँ यह सही है
[SPEAKER_01]: तो, यह 972 है
[SPEAKER_01]: 7 4 1
[SPEAKER_01]: 741
[SPEAKER_01]: डबल 6 1
[SPEAKER_01]: डबल 6 1
[SPEAKER_01]: सर इसमें एक चीज ही देखें सर
[SPEAKER_01]: सर यह जकीश चंद दलित कुमार का ये ट्रेडिंग अकाउंट खोल रहा हूँ ना मैं
[SPEAKER_01]: धन्यवाद
[SPEAKER_01]: बिना किसी संरचना के लेरोस्कोप
[SPEAKER_01]: यह सर यह वाला आएगा ना सर
[SPEAKER_01]: तो यह इसमें है ना सर यह अब देखें सर यह जो चगावड़ा है ना
[SPEAKER_01]: जो मैंने आपको परसों फ़ोन लगाई थी सर
[SPEAKER_01]: एक मिनट रुको
[SPEAKER_01]: भुगतान होगा।
[SPEAKER_01]: पांच हजार
[SPEAKER_01]: तो नंबर लिगाओ तो यह है
[SPEAKER_01]: 972
[SPEAKER_01]: सात चार एक
[SPEA

In [8]:
import requests
import json

OLLAMA_URL = "http://localhost:11434/api/chat"
MODEL = "mistral:7b-instruct"

system_prompt = """
You are an assistant designed to analyze conversations between a customer and a customer service agent. You will receive a raw transcript of a conversation, often informal, fragmented, and potentially in a mixture of Hindi and English. Your task is to analyze the conversation carefully and generate a detailed, structured report with specific insights.

Your response MUST contain all four of the following sections, in this exact format:

Conversation Summary:
[Provide a detailed and businesslike summary of the entire conversation in English. Include specific item names, references to timestamps or repeated attempts, relevant actions taken, and any follow-up instructions or confusion discussed.]

Identified Issues:
[List each issue the customer faced, using specific details from the conversation. Include exact item names, invoice details, missing data points, software/system issues, etc.]
[Avoid generic phrases. Be precise and descriptive.]

Resolution Status:
[Select only one of the following options:
Resolved
Partially Resolved – Follow-up Required
Unresolved
Base your judgment on the conversation. Do not add any extra commentary. If multiple issues are discussed, base your choice on the overall status.]

Customer Sentiment:
[Briefly describe the customer's overall emotional tone during the conversation — e.g., calm, frustrated, confused, impatient, cooperative, etc. This should be 1-2 lines and reflect the customer's behavior, urgency, or satisfaction level.]

Important Instructions
Always include all four sections.
Use bullet points in Identified Issues.
Keep the tone professional and businesslike.
Be as specific and detailed as possible, especially with product names, time references, or transactional data.
Do not speculate — only summarize based on what is explicitly stated.
"""

conversation = "\n".join([f"[{t['speaker']}]: {t['text']}" for t in transcriptions])

# Concatenate system prompt and conversation as a single user message
user_message = system_prompt.strip() + "\n\n" + conversation

payload = {
    "model": MODEL,
    "messages": [
        {"role": "user", "content": user_message}
    ]
}

response = requests.post(OLLAMA_URL, json=payload)
response.raise_for_status()

contents = []
for line in response.text.strip().splitlines():
    try:
        obj = json.loads(line)
        if "message" in obj and "content" in obj["message"]:
            contents.append(obj["message"]["content"])
    except Exception:
        pass

full_response = "".join(contents)
print(full_response)

 It appears that the text you've provided is a conversation between two individuals, possibly a system and a user. The exact meaning isn't entirely clear without context, but I can try to break down some parts of it for better understanding:

1. "You are breaking up check it out" - This line suggests that there might be an issue with the connection or the system itself.
2. "This is my system" - The system owner is stating that what's happening belongs to their system.
3. "My system will stop working completely, please make sure everything you do is fully saved and let me know if anything isn't clear." - This line indicates that the system owner is going to shut down their system for some reason, possibly due to maintenance or an update, and they are asking the user to ensure all work is saved.
4. "Check it out" - In this context, it might mean that the user should verify the information or process again since there seems to be a problem with the connection or system.
5. "What is it?" -