In [1]:
import librosa
import numpy as np
from scipy.fftpack import dct
import os 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import wave
import math
import scipy.io.wavfile as wav
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
import pandas as pd

Split wave

Load Data

In [2]:
def load_data(data_dir):
    real_files = [os.path.join(data_dir, "real", f) for f in os.listdir(os.path.join(data_dir, "real")) if f.endswith(".wav")]
    fake_files = [os.path.join(data_dir, "fake", f) for f in os.listdir(os.path.join(data_dir, "fake")) if f.endswith(".wav")]
    fake_labels = [0] * len(fake_files)
    real_labels = [1] * len(real_files)

    files = fake_files + real_files
    labels = fake_labels + real_labels

    return files, labels

Preprocessing

In [3]:
def evaluate(frames):
    # Calculate the average energy of the frames
    frame_energies = np.sum(np.square(frames), axis=1)
    avg_energy = np.mean(frame_energies)

    return avg_energy

In [4]:
def optimal_fft_length(window_length: int) -> int:
    return 2 ** int(np.ceil(np.log2(window_length)))

In [3]:
def preprocessing(audio_file,n_features):
    # Load the audio signal
    y, sr = librosa.load(audio_file, sr=None)

    # Step 1: Pre-emphasis
    pre_emphasis_coeff = 0.97
    y_filt = librosa.effects.preemphasis(y, coef=pre_emphasis_coeff)

    # Step 2: Frame blocking
    frame_length = 0.025  # 20 ms
    hop_length = 0.010    # 10 ms
    frame_length_samples = int(frame_length * sr)
    hop_length_samples = int(hop_length * sr)
    frames = librosa.util.frame(y_filt, frame_length= frame_length_samples, hop_length= hop_length_samples)

    # Step 3: Windowing

    frame_length = len(frames)
    output = frames*np.hamming(frame_length)[:, np.newaxis]
    frames = output

    # Step 4: Fast Fourier Transform (FFT)
    fft_size = 512
    spectrogram = np.abs(np.fft.fft(frames, n=fft_size))
    
    # Step 5: Mel frequency wrapping
    n_mels = n_features
    mel_spec = librosa.feature.melspectrogram(sr=sr, S=spectrogram, n_mels=n_mels)
    
    # Step 6: Discrete Cosine Transform (DCT)
    n_mfcc = 216
    mfcc = dct(np.log(mel_spec), type=2, axis=1, norm='ortho')[:, :n_mfcc]
    
    # Scaled MFCC features
    max_mfcc=np.max(mfcc, axis=1)
    mins, maxs=np.min(max_mfcc), np.max(max_mfcc)
    scaled_mfcc=(max_mfcc-mins)/(maxs-mins)

    return scaled_mfcc

Train the model

Preparing Data

In [4]:
url = r"C:\Users\VIET HOANG - VTS\Desktop\tien xu ly"
files, labels = load_data(url)

X_train, X_test, y_train, y_test = train_test_split(files, labels, test_size=0.2, random_state=42)

X_train = [preprocessing(file,68) for file in X_train]
X_test = [preprocessing(file,68) for file in X_test]



In [37]:
svmm = svm.SVC()

# ada = AdaBoostClassifier(estimator=svmm, n_estimators=300, learning_rate=0.05)
svmm.fit(X_train,y_train)
svmm.score(X_test,y_test)

0.845

In [39]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB



url = r"C:\Users\VIET HOANG - VTS\Desktop\tien xu ly"
files, labels = load_data(url)

X_train, X_test, y_train, y_test = train_test_split(files, labels, test_size=0.2, random_state=42)

X_train = [preprocessing(file,68) for file in X_train]
X_test = [preprocessing(file,68) for file in X_test]


# Create a list of classifiers
classifiers = [
    LogisticRegression(),
    DecisionTreeClassifier(),
    SVC(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    KNeighborsClassifier(),
    GaussianNB()
]

# Train and evaluate each classifier
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{classifier.__class__.__name__}: Accuracy = {accuracy}")

LogisticRegression: Accuracy = 0.6976744186046512
DecisionTreeClassifier: Accuracy = 0.5697674418604651
SVC: Accuracy = 0.7325581395348837
RandomForestClassifier: Accuracy = 0.5930232558139535




AdaBoostClassifier: Accuracy = 0.627906976744186
KNeighborsClassifier: Accuracy = 0.5813953488372093
GaussianNB: Accuracy = 0.5581395348837209


In [None]:
model = RandomForestClassifier()

# Define the hyperparameters grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 5, 10],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4]  # Minimum number of samples required at each leaf node
}

# Perform grid search with cross-validation
model= GridSearchCV(estimator=model, param_grid=param_grid, scoring='f1', cv=5)
model.fit(X_train, y_train)

# Modelin doğruluğunu değerlendirme
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: {:.2f}%".format(accuracy * 100))

In [None]:
model.best_params_

{'max_depth': None,
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 200}

In [None]:
print(len(files))

196


In [9]:
for i in range(60,90):
    url = r"C:\Users\VIET HOANG - VTS\Desktop\tien xu ly"
    files, labels = load_data(url)

    X_train, X_test, y_train, y_test = train_test_split(files, labels, test_size=0.2, random_state=42)

    X_train = [preprocessing(file,i) for file in X_train]
    X_test = [preprocessing(file,i) for file in X_test]
    
    svc = svm.SVC()
    svc.fit(X_train,y_train)
    print(f"{i}: {svc.score(X_test,y_test)}")
    

60: 0.627906976744186
61: 0.6046511627906976
62: 0.6162790697674418
63: 0.5813953488372093
64: 0.686046511627907
65: 0.6511627906976745
66: 0.6162790697674418
67: 0.6511627906976745
68: 0.7325581395348837
69: 0.6046511627906976
70: 0.6511627906976745
71: 0.5813953488372093
72: 0.6046511627906976
73: 0.5813953488372093
74: 0.6395348837209303
75: 0.6046511627906976
76: 0.6395348837209303


KeyboardInterrupt: 

In [206]:
svmm = svm.SVC(C=20, gamma=0.01, kernel='rbf', probability=True)
svmm.fit(X_train,y_train)
svmm.score(X_test,y_test)

0.6162790697674418

In [None]:
from sklearn import svm
# Create an SVM classifier
# Define the SVM model
svc = svm.SVC()

# Set up the parameter grid
param_grid = {
    'C': [0.1, 1, 2,5,7,10,20,25,30,35,50,100],                 # Penalty parameter C
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],        # Kernel function: linear or radial basis function (rbf)
    'gamma': ['scale', 'auto', 0.1, 1, 0.01]  # Kernel coefficient for 'rbf'. 'scale' and 'auto' use default values.
}

# Perform grid search with cross-validation
clf= GridSearchCV(estimator=svc, param_grid=param_grid, cv=5)

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy: {:.2f}%".format(accuracy * 100))

clf.best_params_

Test Accuracy: 58.14%


{'C': 5, 'gamma': 0.01, 'kernel': 'rbf'}

In [182]:
print("Test Accuracy: {:.2f}%".format(accuracy * 100))

Test Accuracy: 40.70%


In [51]:
rf = LogisticRegression()
rf.fit(X_train,y_train)
rf.score(X_test,y_test)

0.6976744186046512

In [88]:

gb_classifier = GradientBoostingClassifier()

# Define the parameter grid for grid search
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of boosting stages
    'learning_rate': [0.1, 0.05, 0.01],  # Learning rate
    'max_depth': [3, 4, 5],  # Maximum depth of individual estimators
    # Add more hyperparameters for tuning if needed
}

# Perform grid search
grid_search = GridSearchCV(gb_classifier, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Get the best gradient boosting model
best_gb_model = grid_search.best_estimator_

# Predict on the testing data
y_pred = best_gb_model.predict(X_test)

# Evaluate the classification accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

KeyboardInterrupt: 

In [None]:
grid_search.best_params_

{'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 300}

In [73]:
from sklearn.impute import KNNImputer
import numpy as np

# Assuming you have the dataset with missing values stored in X
# X should be a 2D array or DataFrame

# Create an instance of KNNImputer
imputer = KNNImputer(n_neighbors=500)  # You can adjust the number of neighbors (k) as needed

# Perform imputation
X_train = imputer.fit_transform(X_train)
X_test = imputer.fit_transform(X_test)

In [49]:
rf = RandomForestClassifier(max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200)
svmm = svm.SVC(probability=True)

ls = LogisticRegression()
# gb = GradientBoostingClassifier(learning_rate = 0.05, max_depth = 5, n_estimators = 300)
# Define the voting classifier

voting_clf = VotingClassifier(
    estimators=[('svm', svmm),('ls',ls)],
    voting='hard'
)
voting_clf.fit(X_train,y_train)
# param_grid = {
#     'n_estimators': [50],
#     'learning_rate': [0.1]
# }

# # Define the AdaBoost classifier with the voting classifier as the base estimator
# adaboost_clf = AdaBoostClassifier(estimator=svmm)

# grid_search = GridSearchCV(adaboost_clf, param_grid=param_grid,cv = 2)
# # Fit the AdaBoost classifier
# grid_search.fit(X_train, y_train)

# Calculate the accuracy score
accuracy = voting_clf.score(X_test, y_test)
print("Accuracy: ", accuracy)

Accuracy:  0.7209302325581395


In [16]:
rf = RandomForestClassifier(max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200)
rf.fit(X_train,y_train)
rf.score(X_test,y_test)

0.6627906976744186