<a href="https://colab.research.google.com/github/jsjj10002/FackVoiceClassfication/blob/main/%EC%9D%8C%EC%84%B1_%ED%8A%B9%EC%A7%95%EC%B6%94%EC%B6%9C.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 데이터 가져오기

[The Fake-or-Real (FoR) Dataset](https://www.kaggle.com/datasets/mohammedabdeldayem/the-fake-or-real-dataset/data)

The Fake-or-Real (FoR) dataset is a collection of more than 195,000 utterances from real humans and computer generated speech. The dataset can be used to train classifiers to detect synthetic speech.
The dataset is published in four versions: for-original, for-norm, for-2sec and for-rerec.

*The  first version, named for-original, contains the files as collected from the speech sources, without any modification (balanced version).

The second version, called for-norm, contains the same files, but balanced in terms of gender and class and normalized in terms of sample rate, volume and number of channels.

The third one, named for-2sec is based on the second one, but with the files truncated at 2 seconds.

The last version, named for-rerec, is a rerecorded version of the for-2second dataset, to simulate a scenario where an attacker sends an utterance through a voice channel (i.e. a phone call or a voice message).*


In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
!kaggle datasets download -d mohammedabdeldayem/the-fake-or-real-dataset

Dataset URL: https://www.kaggle.com/datasets/mohammedabdeldayem/the-fake-or-real-dataset
License(s): GNU Lesser General Public License 3.0
Downloading the-fake-or-real-dataset.zip to /content
100% 16.0G/16.0G [02:13<00:00, 145MB/s]
100% 16.0G/16.0G [02:13<00:00, 129MB/s]


In [3]:
%cd /content
!mkdir the-fake-or-real-dataset
!unzip -qq "/content/the-fake-or-real-dataset.zip" -d the-fake-or-real-dataset/

/content


### 라이브러리 호출

In [4]:
import numpy as np
import pandas as pd
import os
import librosa
from sklearn.model_selection import train_test_split

## 데이터 라벨링
데이터를 훈련, 시험, 검증 셋으로 나누고, 가짜와 실제 목소리로 라벨링함.

In [5]:
def load_data(root_path):
    data = []

    # 각 버전별로 사용할 하위 폴더 이름 설정
    """
    versions = {
        'for-original': 'for-original',
        'for-norm': 'for-norm',
        'for-2sec': 'for-2seconds',
        'for-rerec': 'for-rerecorded'
    }

    # 각 버전별로 반복
    for version_key, version_folder in versions.items():
        # 각 버전의 하위 폴더로 경로 생성
        version_path = os.path.join(root_path, version_key, version_folder)
"""
        # 데이터셋의 세 가지 범주: testing, training, validation
    categories = ['testing', 'training', 'validation']

    for category in categories:
        category_path = os.path.join(root_path, category)
        types = ['fake', 'real']

            # fake, real 폴더 내의 파일 탐색
        for type_ in types:
            type_path = os.path.join(category_path, type_)
                # 해당 경로가 존재하는지 확인
            if not os.path.exists(type_path):
                print(f"경로를 찾을 수 없습니다: {type_path}")
                continue

            for filename in os.listdir(type_path):
                    # .wav 파일만 처리
                if filename.endswith('.wav'):
                    file_path = os.path.join(type_path, filename)
                        # 데이터 리스트에 파일 경로와 라벨 추가
                    data.append({'path': file_path, 'label': type_})

    # 데이터 프레임 생성
    df = pd.DataFrame(data)
    return df

In [6]:
# 데이터셋의 기본 경로
dataset_path = "/content/the-fake-or-real-dataset/for-original/for-original"

# 데이터 로드
df = load_data(dataset_path)

# 데이터 프레임의 상위 5개 행 출력
print(df.head())

# 데이터 프레임의 기본 정보 출력
print(df.info())

# 각 라벨의 개수 확인
print(df['label'].value_counts())

# 데이터셋 분할: 트레인, 테스트, 검증 세트
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.25, random_state=42)

# 결과 확인
print("Training set size:", len(train_df))
print("Validation set size:", len(val_df))
print("Testing set size:", len(test_df))

                                                path label
0  /content/the-fake-or-real-dataset/for-original...  fake
1  /content/the-fake-or-real-dataset/for-original...  fake
2  /content/the-fake-or-real-dataset/for-original...  fake
3  /content/the-fake-or-real-dataset/for-original...  fake
4  /content/the-fake-or-real-dataset/for-original...  fake
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40913 entries, 0 to 40912
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   path    40913 non-null  object
 1   label   40913 non-null  object
dtypes: object(2)
memory usage: 639.4+ KB
None
label
real    34605
fake     6308
Name: count, dtype: int64
Training set size: 24547
Validation set size: 8183
Testing set size: 8183


## 특징 추출(MFCC, 스펙트럼 대역, 제로 크로싱 레이트)

In [7]:
import multiprocessing
from functools import partial
def extract_features(file_path):
    try:
        # 파일에서 오디오 데이터 로드
        audio, sample_rate = librosa.load(file_path, sr=None)

        # 오디오 데이터가 비어 있지 않은지 확인
        if audio.size == 0:
            print(f"오디오 데이터가 비어 있습니다: {file_path}")
            return None

        # MFCC 추출
        mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13), axis=1)

        # 스펙트럼 대역 평균 추출
        spectral_centroids = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sample_rate), axis=1)

        # 제로 크로싱 레이트 평균 추출
        zero_crossing_rates = np.mean(librosa.feature.zero_crossing_rate(audio), axis=1)

        return mfccs, spectral_centroids, zero_crossing_rates
    except Exception as e:
        print(f"파일 처리 중 에러가 발생했습니다: {file_path}, 에러: {e}")
        return None

#병렬실행으로 실행 시간 단축
def parallel_feature_extraction(file_paths):
    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    features = pool.map(extract_features, file_paths)
    pool.close()
    pool.join()
    return features

In [8]:
# 각 데이터셋에 대해 병렬 특징 추출 실행
for dataset in [train_df, val_df, test_df]:
    # 파일 경로 리스트에서 None 제외
    file_paths = dataset['path'].dropna().tolist()
    # 병렬 특징 추출
    extracted_features = parallel_feature_extraction(file_paths)
    # 결과 처리
    features = pd.Series(extracted_features)
    valid_features = features.dropna()
    # 특징 데이터프레임 생성
    feature_df = pd.DataFrame(valid_features.tolist(), columns=['mfcc', 'spectral_band', 'zero_crossing_rate'])
    # 각 특징을 해당 데이터셋에 추가
    dataset['mfcc'] = feature_df['mfcc']
    dataset['spectral_band'] = feature_df['spectral_band']
    dataset['zero_crossing_rate'] = feature_df['zero_crossing_rate']

# 결과 확인
print(train_df.head())

# 데이터프레임을 CSV 파일로 저장
train_df.to_csv('/content/TrainingDataset.csv', index=False)
val_df.to_csv('/content/ValidationDataset.csv', index=False)
test_df.to_csv('/content/TestingDataset.csv', index=False)

# 저장된 파일 경로 출력
print("파일이 저장되었습니다:")
print("/content/TrainingDataset.csv")
print("/content/ValidationDataset.csv")
print("/content/TestingDataset.csv")

                                                    path label  \
26417  /content/the-fake-or-real-dataset/for-original...  real   
31435  /content/the-fake-or-real-dataset/for-original...  real   
24337  /content/the-fake-or-real-dataset/for-original...  real   
2156   /content/the-fake-or-real-dataset/for-original...  fake   
13290  /content/the-fake-or-real-dataset/for-original...  real   

                                                    mfcc  \
26417                                                NaN   
31435                                                NaN   
24337  [-297.1028, 77.44087, 3.2898045, 8.396718, -20...   
2156   [-309.6398, 91.91697, -7.0920734, 7.699547, -1...   
13290  [-288.67917, 64.08434, -5.1230516, 11.13057, -...   

              spectral_band     zero_crossing_rate  
26417                   NaN                    NaN  
31435                   NaN                    NaN  
24337   [2657.135073871866]  [0.17655243514972144]  
2156    [2276.301715645839]  [

In [1]:
%cd /content

/content
