In [4]:
import gc
import re
import json
import pandas as pd
from pydub import AudioSegment

from TTS.api import TTS
from gliner import GLiNER
from diffusers import DiffusionPipeline
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

from utils import *

device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [2]:
gn = GLiNER.from_pretrained("urchade/gliner_mediumv2.1").to(device)
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
classifier = pipeline("sentiment-analysis", model="michellejieli/emotion_text_classifier",  device=device)

Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]


 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts


  self.speakers = torch.load(speaker_file_path)
  return torch.load(f, map_location=map_location, **kwargs)
GPT2InferenceModel has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [18]:
def load_dialogue_data(file_path):
    with open(file_path, 'r') as file:
        result = json.load(file)
    
    quotes_data = []
    for entry in result:
        for quote in entry['quotes']:
            quotes_data.append({
                "speaker": quote['speaker']['name'],
                "content": quote['quote']
            })
    
    df = pd.DataFrame(quotes_data)
    df['content'] = df['content'].str.replace('\n', ' ', regex=False)
    df['emotion'] = None
    df['char'] = None
    
    return df

def analyze_dialogues(df, classifier, gn_model, labels=["Male", "Female", "Other"]):
    """emotions and character types"""
    for idx in range(df.shape[0]):
        sentence_with_name = f"{df['speaker'][idx]} : {df['content'][idx]}"
        sentence_only = df['content'][idx]
        
        classify_result = classifier(sentence_only)
        gn_result = gn_model.predict_entities(sentence_with_name, labels, threshold=0.2)
        
        df.loc[idx, 'emotion'] = classify_result[0]['label']
        df.loc[idx, 'char'] = gn_result[0]['label']
    
    return df

def setup_voice_mapping():
    """voice file mappings"""
    voice_files = [f"Voice/Neutral/03-01-01-01-01-01-{str(i).zfill(2)}.wav" 
                   for i in range(6, 17)]
    default_speaker = "Voice/Neutral/03-01-01-01-01-01-02.wav"
    
    return voice_files, default_speaker

def generate_audio(df, tts_model, results_dir):
    """Generate audio files"""
    import os
    
    # Create results directory
    os.makedirs(results_dir, exist_ok=True)
    
    # Setup voice mapping
    voice_files, default_speaker = setup_voice_mapping()
    unique_speakers = df['speaker'].unique()
    speaker_mapping = {speaker: voice_files[i % len(voice_files)] 
                      for i, speaker in enumerate(unique_speakers)}
    
    # Generate audio files
    for idx in range(df.shape[0]):
        speaker_wav = speaker_mapping.get(df['speaker'][idx], default_speaker)
        
        tts_model.tts_to_file(
            text=df['content'][idx],
            speaker_wav=speaker_wav,
            language="en",
            emotion=df['emotion'][idx],
            file_path=f"{results_dir}/output{idx}.wav"
        )

def combine_audio_files(results_dir):
    """Combine all generated wav files into a single audio file"""
    try:
        # Get list of wav files in order
        wav_files = sorted(
            [f for f in os.listdir(results_dir) if f.startswith("output") and f.endswith(".wav")],
            key=lambda x: int(x.replace("output", "").replace(".wav", ""))
        )
        
        if not wav_files:
            print(f"No wav files found in {results_dir}")
            return
        
        # Initialize with first file
        combined_audio = AudioSegment.from_wav(os.path.join(results_dir, wav_files[0]))
        
        # Add remaining files
        for wav_file in wav_files[1:]:
            next_audio = AudioSegment.from_wav(os.path.join(results_dir, wav_file))
            combined_audio += next_audio
        
        # Export combined file
        output_path = os.path.join(results_dir, "combined_audio.wav")
        combined_audio.export(output_path, format="wav")
        print(f"Combined audio saved to: {output_path}")
        
    except Exception as e:
        print(f"Error combining audio files: {str(e)}")

In [19]:
def main():
    # Process files from output_25.json to output_65.json
    for file_num in range(25, 66):
        file_path = f'./output_char/output_{file_num}.json'
        
        try:
            # Load and process data
            print(f"\nProcessing {file_path}...")
            df = load_dialogue_data(file_path)
            
            # Analyze dialogues
            df = analyze_dialogues(df, classifier, gn)
            
            # Print unique speakers for verification
            print(f"File {file_num} - Unique speakers:", df['speaker'].unique())
            
            # Generate audio files in a new directory for each JSON file
            results_dir = f"Results/Results_{file_num}"
            generate_audio(df, tts, results_dir)
            
            # Combine all generated audio files
            combine_audio_files(results_dir)
            
            print(f"Completed processing file {file_num}")
            
        except FileNotFoundError:
            print(f"Warning: File {file_path} not found, skipping...")
            continue
        except Exception as e:
            print(f"Error processing file {file_path}: {str(e)}")
            continue

if __name__ == "__main__":
    main()


Processing ./output_char/output_25.json...
File 25 - Unique speakers: ['Wolf-Wind' 'Unknown' 'Glooskap' 'the children' 'The Wolf']
 > Text splitted to sentences.
['I am Wolf-Wind, the giant,']
 > Processing time: 1.3489201068878174
 > Real-time factor: 0.43181893665616106
 > Text splitted to sentences.
['cross not my path, for I will kill all the people I meet, and eat them all up.']
 > Processing time: 2.350579261779785
 > Real-time factor: 0.4518610747859208
 > Text splitted to sentences.
['I will catch you and kill you all and eat you and bleach your bones upon the sand.']
 > Processing time: 2.907226800918579
 > Real-time factor: 0.4535856373843447
 > Text splitted to sentences.
['I will come back and catch you yet.', 'You cannot escape from me.']
 > Processing time: 2.901244878768921
 > Real-time factor: 0.4352577943123688
 > Text splitted to sentences.
['You cannot harm us; we are strong, for we came at first from the Night-Night Land in the far north country, and over us the Ch

Token indices sequence length is longer than the specified maximum sequence length for this model (529 > 512). Running this sequence through the model will result in indexing errors


Error processing file ./output_char/output_55.json: The expanded size of the tensor (529) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [1, 529].  Tensor sizes: [1, 514]

Processing ./output_char/output_56.json...
File 56 - Unique speakers: ['Hudden and Dudden' 'Hudden' 'Jack' 'Donald' 'The Tailor' 'Unknown'
 'I have no money to pay you with' "I'm not going to be a hero"
 "I'll be hanged, I'll be hanged, I'll be hanged!"
 "You can't have a kingdom without a king,"
 'I am not afraid of you, for you are but a woman' 'They_' 'The Miller'
 "Hudden's Wife" 'Tanner' "I'll give you three chances to guess"
 'The Three Blind Mice' 'the cobbler' 'Dudden' 'Trembling' 'The Farmer'
 "The Miller's Daughter"
 "I'll be waiting for you, and I'll be watching you" "Donald O'Neary"
 'I have no right to do that' "You've been a good friend to me, Donald"]
 > Text splitted to sentences.
["If only we could get that vagabond Donald O'Neary out of the country."]
 > Processing t

In [5]:
import re #讀原始故事

with open("./fairy_tales/1.txt", 'r',  encoding='utf-8') as file:
    content = " ".join(file.read().split())

cleaned_content = re.sub(r"\\'", "'", content)

##---再來讀quote_analysis_result.json 變成df(不變) ->原本你寫的---##

dialogue_count = 0
dialogues = []
lines = cleaned_content.split('"')
# print(lines)

lines = [line for line in lines if line.strip()]
ori = pd.DataFrame(lines, columns=['content'])
merged = ori.merge(df, 
                  left_on='content', 
                  right_on='content', 
                  how='left')
merged['speaker'] = merged['char'].fillna('Other')
merged['speaker'] = merged['emotion'].fillna('neutral')


# ##-- 之後把所有df 改成merged: - -##

# speaker_mapping = {
#     'Female': "Voice/03-01-01-01-01-01-06.wav",
#     'Male': "Voice/03-01-01-01-01-01-05.wav",
#     'Other': "Voice/03-01-01-01-01-01-11.wav"
# }
# default_speaker = "Voice/03-01-01-01-01-01-11.wav"

# # 文字到語音
# for d in range(merged.shape[0]):
#     speaker_wav = speaker_mapping.get(merged['char'][d], default_speaker)
    
#     tts.tts_to_file(
#         text=merged['content'][d],
#         speaker_wav=speaker_wav,
#         # speaker_wav='Voice/03-01-01-01-01-01-06.wav',
#         language="en",
#         emotion=merged['emotion'][d],
#         file_path=f"Results/output{d}.wav"
#     )



NameError: name 'df' is not defined