In [1]:
import os
import moviepy.editor as mp
import matplotlib.pyplot as plt
from PIL import Image
import torch
import torchvision.transforms as transforms
import torch.nn as nn
import torchaudio
import torchvision.models as models

objc[50756]: Class AVFFrameReceiver is implemented in both /Users/tt/miniforge3/envs/ds/lib/python3.8/site-packages/av/.dylibs/libavdevice.60.3.100.dylib (0x309358760) and /Users/tt/miniforge3/envs/ds/lib/libavdevice.59.7.100.dylib (0x31c414778). One of the two will be used. Which one is undefined.
objc[50756]: Class AVFAudioReceiver is implemented in both /Users/tt/miniforge3/envs/ds/lib/python3.8/site-packages/av/.dylibs/libavdevice.60.3.100.dylib (0x3093587b0) and /Users/tt/miniforge3/envs/ds/lib/libavdevice.59.7.100.dylib (0x31c4147c8). One of the two will be used. Which one is undefined.


In [2]:
def mel_spectrogram(audio_path):
    """Load an audio file and return its mel spectrogram on a decibel scale.

    Args:
        audio_path: Path handed to ``torchaudio.load``.

    Returns:
        The output of ``AmplitudeToDB`` applied to a 128-band mel spectrogram
        (n_fft=1024, hop_length=256) computed at the file's own sample rate.
        Presumably shaped (channels, 128, frames) -- TODO confirm.
    """
    signal, sr = torchaudio.load(audio_path)

    # Build both stages first, then apply them back to back.
    to_mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=sr,
        n_fft=1024,
        hop_length=256,
        n_mels=128,
    )
    to_db = torchaudio.transforms.AmplitudeToDB()

    return to_db(to_mel(signal))

# Try the function on a single clip from the MELD training folder.
# NOTE(review): the input is an .mp4 file; whether torchaudio.load can decode
# it directly depends on the installed audio backend -- confirm.
tmp = mel_spectrogram('./MELD_Data/train/dia0_utt0.mp4')

: 

In [None]:

# Extract the audio track from a video file and save it
def extract_audio_from_video(video_path, output_audio_path):
    """Write the audio track of a video file to ``output_audio_path``.

    The audio is written with the ``pcm_s16le`` codec. Any failure is reported
    with ``print`` instead of being raised.

    Args:
        video_path: Path of the video file to open with moviepy.
        output_audio_path: Destination path of the extracted audio file.

    Returns:
        None.
    """
    try:
        # The context manager closes the clip (and its underlying readers)
        # even when writing the audio fails part-way.
        with mp.VideoFileClip(video_path) as video:
            if video.audio is None:
                # Without this guard the failure would surface as an opaque
                # AttributeError on None.
                print(f"Failed to process {video_path}: no audio track found")
                return
            video.audio.write_audiofile(output_audio_path, codec='pcm_s16le')
        print(f"Audio extracted and saved to: {output_audio_path}")
    except Exception as e:
        print(f"Failed to process {video_path}: {e}")

# Process the video files of the MELD dataset
video_folder = '/content/train_splits'
output_folder = '/content/video_to_audio'

# Create the output folder if it does not exist yet
os.makedirs(output_folder, exist_ok=True)

# Walk every file below the video folder
for root, dirs, files in os.walk(video_folder):
    for file in files:
        if file.endswith('.mp4'):  # only handle .mp4 files
            video_path = os.path.join(root, file)
            # splitext swaps only the trailing extension; str.replace would
            # also rewrite any '.mp4' that appears earlier in the file name.
            output_audio_path = os.path.join(output_folder, os.path.splitext(file)[0] + '.wav')
            extract_audio_from_video(video_path, output_audio_path)



def mel_spectrogram(audio_path, output_image_path=None):
    """Compute the log-mel spectrogram of an audio file and optionally save it as an image.

    Args:
        audio_path: Path handed to ``torchaudio.load``.
        output_image_path: Optional path for a rendered image of the first
            channel. When ``None`` nothing is plotted or written.

    Returns:
        The decibel-scaled mel spectrogram tensor (128 mel bands, n_fft=1024,
        hop_length=256) computed at the file's own sample rate.
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    # torchaudio.transforms has no Compose; its transforms are nn.Module
    # objects, so they are chained with nn.Sequential instead.
    transform = nn.Sequential(
        torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_mels=128,
            hop_length=256,
            n_fft=1024),
        torchaudio.transforms.AmplitudeToDB(),
    )
    log_mel_spec = transform(waveform)

    if output_image_path is not None:
        # Plot and save the spectrogram of the first channel.
        fig, ax = plt.subplots(figsize=(10, 4))
        try:
            image = ax.imshow(log_mel_spec[0].numpy(), cmap='viridis', origin='lower', aspect='auto')
            ax.set_title('Log-Mel Spectrogram')
            ax.set_ylabel('Mel Frequency')
            ax.set_xlabel('Time')
            fig.colorbar(image, ax=ax, format='%+2.0f dB')
            fig.savefig(output_image_path)
        finally:
            # Always release the figure so a long batch run does not pile up
            # open figures in memory.
            plt.close(fig)
        print(f"Mel spectrogram saved to: {output_image_path}")

    return log_mel_spec


def convert_audio_to_mel_spectrogram(audio_path, output_image_path):
    """Save the log-mel spectrogram image of ``audio_path`` to ``output_image_path``.

    Thin wrapper around ``mel_spectrogram`` under the name used by the batch
    conversion loop. Returns the spectrogram tensor.
    """
    return mel_spectrogram(audio_path, output_image_path)

# Folder holding the extracted audio files, and folder for the spectrogram images
audio_folder = '/content/video_to_audio'
output_folder = '/content/mel_spectrogram_image'  # where the mel spectrogram images are written

# Create the output folder if it does not exist yet
os.makedirs(output_folder, exist_ok=True)

# Walk the audio files and convert each one to a mel spectrogram image
for root, dirs, files in os.walk(audio_folder):
    for file in files:
        if file.endswith('.wav'):
            audio_path = os.path.join(root, file)
            # splitext swaps only the trailing extension; str.replace would
            # also rewrite any '.wav' that appears earlier in the file name.
            output_image_path = os.path.join(output_folder, os.path.splitext(file)[0] + '.png')
            convert_audio_to_mel_spectrogram(audio_path, output_image_path)




# Image feature extraction with ResNet
class ResNetEncoder(nn.Module):
    """Pretrained ResNet-50 with its last two child modules removed.

    The remaining children are wrapped in an ``nn.Sequential`` stored as
    ``self.resnet``. Presumably the two dropped modules are the pooling and
    classification head, leaving a spatial feature map -- TODO confirm.
    """

    def __init__(self):
        super().__init__()
        backbone = models.resnet50(pretrained=True)
        kept_layers = list(backbone.children())[:-2]
        self.resnet = nn.Sequential(*kept_layers)

    def forward(self, x):
        """Pass ``x`` through the truncated backbone and return its output."""
        feature_map = self.resnet(x)
        return feature_map

# Shared Transformer backbone
class CommonTransformer(nn.Module):
    """Encoder-decoder ``nn.Transformer`` applied to a flattened feature map.

    Args:
        input_dim: Embedding size (``d_model``); must be divisible by ``nhead``.
        hidden_dim: Width of the feed-forward sublayers (``dim_feedforward``).
        num_layers: Number of encoder layers and of decoder layers.
        nhead: Number of attention heads. Defaults to 8, the value that was
            previously hard-coded.
        dropout: Dropout probability. Defaults to 0.1, the value that was
            previously hard-coded.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, nhead=8, dropout=0.1):
        super().__init__()
        self.transformer = nn.Transformer(
            d_model=input_dim,
            nhead=nhead,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=hidden_dim,
            dropout=dropout
        )

    def forward(self, x):
        """Run the transformer with ``x`` as both source and target.

        Args:
            x: Tensor whose first two dimensions are (N, E); every dimension
                from the third onward is flattened into one sequence axis.

        Returns:
            Tensor of shape (N, S, E), where S is the flattened sequence length.
        """
        x = x.flatten(2).permute(2, 0, 1)  # Transformer expects (S, N, E)
        # The same sequence is passed as source and as target.
        output = self.transformer(x, x)
        return output.permute(1, 0, 2)  # Back to (N, S, E)

# OmniVec model definition
class OmniVecModel(nn.Module):
    """Chains a ``ResNetEncoder`` with a ``CommonTransformer``.

    The transformer is built with input_dim=2048, hidden_dim=2048 and
    num_layers=6.
    """

    def __init__(self):
        super().__init__()
        transformer_config = dict(input_dim=2048, hidden_dim=2048, num_layers=6)
        self.image_encoder = ResNetEncoder()
        self.common_transformer = CommonTransformer(**transformer_config)

    def forward(self, x):
        """Encode ``x`` with the image encoder, then apply the transformer."""
        return self.common_transformer(self.image_encoder(x))

# Instantiate the model
model = OmniVecModel()

# Image preprocessing helper
def preprocess_image(image_path):
    """Load an image file and turn it into a normalized single-image batch.

    The image is converted to RGB, resized to 224x224, converted to a tensor
    and normalized with the fixed per-channel mean/std listed below.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The transformed tensor with a batch dimension added at position 0.
    """
    rgb_image = Image.open(image_path).convert('RGB')

    steps = [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    pipeline = transforms.Compose(steps)

    # unsqueeze(0) adds the batch dimension
    batch = pipeline(rgb_image).unsqueeze(0)
    return batch

# Folder holding the mel spectrogram images
image_folder = '/content/mel_spectrogram_image'

# Inference only: switch to eval mode once, instead of on every file
model.eval()

# Walk the image files and extract features
for root, dirs, files in os.walk(image_folder):
    for file in files:
        if file.endswith('.png'):  # only handle .png files
            image_path = os.path.join(root, file)
            image_tensor = preprocess_image(image_path)

            # Extract features with the model; no gradients are needed
            with torch.no_grad():
                features = model(image_tensor)
            print(f"Extracted Features Shape for {file}: {features.shape}")
            print(f"Extracted Features for {file}: {features}")
