In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.metrics import accuracy_score, classification_report
from sklearn.inspection import permutation_importance
from scikeras.wrappers import KerasClassifier
from sklearn.utils.validation import check_X_y
from sklearn.impute import SimpleImputer


In [3]:
# Load and preprocess data
file_path = '/Users/pvuda/Documents/features/combined_file.csv'
data = pd.read_csv(file_path)

In [12]:
# Create a new target variable for binary classification
data['real_or_fake_general'] = data['real_or_fake'].apply(lambda x: 'R' if x == 'R' else 'F')

# Encode the binary labels
label_encoder = LabelEncoder()
data['real_or_fake_general'] = label_encoder.fit_transform(data['real_or_fake_general'])

# Prepare feature matrix and target vector
X = data.drop(columns=['audio_id', 'real_or_fake', 'real_or_fake_general'])
y = data['real_or_fake_general']

In [13]:
X.isna().sum()

spectral_centroid_mean     0
spectral_centroid_std      0
spectral_centroid_var      0
spectral_centroid_min      0
spectral_centroid_max      0
                          ..
intensity_duration        26
speakingrate              26
articulationrate          26
asd                       50
totalpauseduration        26
Length: 408, dtype: int64

In [14]:
y.isna().sum()

0

In [16]:
# Handle missing values in X
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

In [17]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

In [18]:
# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [19]:
# Reshape input data for LSTM
X_train_reshaped = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_reshaped = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))

In [20]:
# Define the LSTM model
def create_lstm_model():
    model = Sequential()
    model.add(LSTM(50, input_shape=(X_train_reshaped.shape[1], 1), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [22]:
# Wrap the Keras model with KerasClassifier
lstm_model = KerasClassifier(model=create_lstm_model, epochs=2, batch_size=32, verbose=1)

# Fit the model
history = lstm_model.fit(X_train_reshaped, y_train, validation_data=(X_test_reshaped, y_test))

Epoch 1/2


  super().__init__(**kwargs)


[1m6787/6787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m406s[0m 60ms/step - accuracy: 0.8579 - loss: nan - val_accuracy: 0.8596 - val_loss: nan
Epoch 2/2
[1m6787/6787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m419s[0m 62ms/step - accuracy: 0.8586 - loss: nan - val_accuracy: 0.8596 - val_loss: nan


In [26]:
# Make predictions
nn_predictions_prob = lstm_model.predict(X_test_reshaped)
nn_predictions = (nn_predictions_prob > 0.5).astype(int)

[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 14ms/step


In [27]:
# Evaluate the model
accuracy = accuracy_score(y_test, nn_predictions)
print("Neural Network Accuracy: ", accuracy)
print(classification_report(y_test, nn_predictions, target_names=label_encoder.classes_))

Neural Network Accuracy:  0.8595634957178377


              precision    recall  f1-score   support

           F       0.86      1.00      0.92     46670
           R       0.00      0.00      0.00      7625

    accuracy                           0.86     54295
   macro avg       0.43      0.50      0.46     54295
weighted avg       0.74      0.86      0.79     54295



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [29]:
# Custom scorer to handle reshaping
def custom_scorer(estimator, X, y):
    X_reshaped = X.reshape((X.shape[0], X.shape[1], 1))
    predictions = estimator.predict(X_reshaped)
    predictions = (predictions > 0.5).astype(int)
    return accuracy_score(y, predictions)

# Calculate permutation importance with custom scorer
result = permutation_importance(lstm_model, X_test, y_test, n_repeats=1, random_state=42, scoring=custom_scorer)

# Display feature importance
importance_df = pd.DataFrame(result.importances_mean, index=X.columns, columns=['Importance'])
print(importance_df)


[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 14ms/step
[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 14ms/step
[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 14ms/step
[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 14ms/step
[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 14ms/step
[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 14ms/step
[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 14ms/step
[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 14ms/step
[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 14ms/step
[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 14ms/step
[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 14ms/step
[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 14ms/step
[1m1697/1697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [31]:
importance_df.sort_values

<bound method DataFrame.sort_values of                         Importance
spectral_centroid_mean         0.0
spectral_centroid_std          0.0
spectral_centroid_var          0.0
spectral_centroid_min          0.0
spectral_centroid_max          0.0
...                            ...
intensity_duration             0.0
speakingrate                   0.0
articulationrate               0.0
asd                            0.0
totalpauseduration             0.0

[408 rows x 1 columns]>