# Part 1: Urdu Deepfake Audio Detection (Binary Classification)

# In [None]:
import pandas as pd
import librosa
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import joblib

# ==========================================
# Step 1: Preprocessing & Feature Extraction
# ==========================================

# Load the dataset index.
# labels.csv columns: audio_filename, label (0 = Deepfake, 1 = Bonafide)
labels_df = pd.read_csv("labels.csv")
audio_paths, y = (
    labels_df[column].values for column in ("audio_filename", "label")
)

# Extract MFCC Features
def extract_mfcc(audio_path, n_mfcc=13, fixed_length=2, sr=22050, hop_length=512):
    """Return a fixed-size, flattened MFCC feature vector for one audio file.

    Args:
        audio_path: Path (or file-like object) accepted by ``librosa.load``.
        n_mfcc: Number of MFCC coefficients per frame.
        fixed_length: Clip duration in seconds; longer files are truncated on
            load and shorter ones are zero-padded in the MFCC domain.
        sr: Sample rate the audio is resampled to on load.
        hop_length: Hop size (in samples) between successive MFCC frames.

    Returns:
        1-D array of length ``n_mfcc * n_frames``. With the defaults this is
        13 * 87 = 1131 features.
    """
    # Local is named `signal` so it is not confused with the label vector `y`.
    signal, sr = librosa.load(audio_path, sr=sr, duration=fixed_length)
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length)

    # Derive the target frame count from the requested duration instead of
    # hard-coding it, so changing `fixed_length` keeps padding/cropping in sync.
    # Defaults: 1 + (2 * 22050) // 512 = 87 frames.
    n_frames = 1 + int(fixed_length * sr) // hop_length

    # Pad with zeros on the time axis, or crop, to exactly n_frames columns.
    if mfcc.shape[1] < n_frames:
        mfcc = np.pad(mfcc, ((0, 0), (0, n_frames - mfcc.shape[1])))
    else:
        mfcc = mfcc[:, :n_frames]

    return mfcc.flatten()  # (n_mfcc, n_frames) -> 1-D vector

# Extract features for all audio files (raw, unscaled MFCC vectors)
X = np.array([extract_mfcc(f"audio_files/{path}") for path in audio_paths])

# =============================
# Step 2: Model Building
# =============================

# Split BEFORE scaling: fitting the scaler on the full matrix would let
# test-set statistics leak into training. A fixed seed makes the split
# (and therefore the reported metrics) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize Features: statistics are learned from the training rows only and
# then applied unchanged to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Scikit-Learn Models (SVM, Logistic Regression, Perceptron)
# probability=True is required so the SVM exposes predict_proba.
svm = SVC(kernel="rbf", probability=True).fit(X_train, y_train)
lr = LogisticRegression().fit(X_train, y_train)
perceptron = Perceptron().fit(X_train, y_train)

# PyTorch DNN
class AudioDNN(nn.Module):
    """Feed-forward binary classifier: input_dim -> 128 -> 64 -> 1 with sigmoid output."""

    def __init__(self, input_dim):
        super().__init__()
        # Build the stack incrementally; the resulting Sequential has the same
        # module order (and hence the same state_dict keys) as a literal list.
        stack = []
        width = input_dim
        for hidden in (128, 64):
            stack.append(nn.Linear(width, hidden))
            stack.append(nn.ReLU())
            width = hidden
        stack.append(nn.Linear(width, 1))
        stack.append(nn.Sigmoid())  # output is a value in (0, 1)
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the sigmoid output."""
        return self.layers(x)

# Set up the network, the binary cross-entropy loss and the optimizer
model = AudioDNN(X_train.shape[1])
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters())

# Convert data to float32 tensors; targets become an (n, 1) column so they
# line up with the network's single-unit output.
X_train_tensor = torch.as_tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.as_tensor(y_train, dtype=torch.float32).unsqueeze(1)

# Full-batch training: one gradient step per epoch over the whole training set
for epoch in range(20):
    optimizer.zero_grad()
    loss = criterion(model(X_train_tensor), y_train_tensor)
    loss.backward()
    optimizer.step()

# =====================
# Step 3: Evaluation
# =====================

# Scikit-Learn Models
def evaluate_model(model, X_test, y_test):
    """Compute binary classification metrics for a fitted scikit-learn model.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba`` or ``decision_function``.
        X_test: Held-out feature matrix.
        y_test: True binary labels for ``X_test``.

    Returns:
        dict with keys "Accuracy", "Precision", "Recall", "F1-Score" and
        "AUC-ROC". "AUC-ROC" is the string "N/A" only when the model offers
        neither probabilities nor decision scores.
    """
    y_pred = model.predict(X_test)

    # AUC only needs a ranking score. Prefer the positive-class probability,
    # but fall back to the raw decision score so margin-based models without
    # predict_proba (e.g. Perceptron) still get an AUC instead of "N/A".
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = None

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "AUC-ROC": roc_auc_score(y_test, y_score) if y_score is not None else "N/A",
    }
    return metrics

# Report held-out metrics for each scikit-learn model
for heading, estimator in (
    ("SVM:", svm),
    ("Logistic Regression:", lr),
    ("Perceptron:", perceptron),
):
    print(heading, evaluate_model(estimator, X_test, y_test))

# PyTorch DNN: switch to inference mode and score the held-out set
model.eval()
with torch.no_grad():
    test_tensor = torch.FloatTensor(X_test)
    y_proba = model(test_tensor).numpy().flatten()
# Threshold the sigmoid output at 0.5 to obtain hard 0/1 predictions
y_pred = (y_proba > 0.5).astype(int)

print("DNN Metrics:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("AUC-ROC:", roc_auc_score(y_test, y_proba))

# =====================
# Step 4: Save Models
# =====================

# Persist the fitted scikit-learn estimators together with the scaler.
# NOTE(review): the defect-prediction section later in this file also writes
# "svm_model.pkl", "perceptron_model.pkl" and "dnn_model.pth"; when both
# sections run in the same directory the later write replaces these files.
for artifact, filename in (
    (svm, "svm_model.pkl"),
    (lr, "logistic_model.pkl"),
    (perceptron, "perceptron_model.pkl"),
    (scaler, "scaler.pkl"),
):
    joblib.dump(artifact, filename)

# Persist the PyTorch model as a state dict (weights only, not the class)
torch.save(model.state_dict(), "dnn_model.pth")

# Part 2: Multi-Label Defect Prediction (Multi-Label Classification)

# In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss, f1_score
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import LinearSVC
from torch.utils.data import Dataset, DataLoader
import joblib

# =============================
# Step 1: Data Preprocessing
# =============================

# Load dataset: one free-text "report" column, every other column is a label
df = pd.read_csv("dataset.csv")
texts = df["report"].values
labels = df.drop("report", axis=1).values  # Shape: (n_samples, n_labels)

# Check label distribution
print("\nLabel Distribution:")
print(df.drop("report", axis=1).sum(axis=0))

# Train-Test Split on the raw text, before anything is fitted, so the
# held-out reports cannot influence the vocabulary or the IDF weights.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# TF-IDF Vectorization: fit on the training reports only, then apply the
# same fitted transform to the test reports.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english")
X_train = tfidf.fit_transform(texts_train)
X_test = tfidf.transform(texts_test)

# =============================================
# Step 2: Model Building & Training
# =============================================

class TextDataset(Dataset):
    """Dataset that pairs rows of a sparse feature matrix with their label rows."""

    def __init__(self, X, y):
        # X must support row indexing plus .toarray() (e.g. a SciPy sparse matrix)
        self.X, self.y = X, y

    def __len__(self):
        n_rows = self.X.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # Densify only the requested row, then drop the leading length-1 axis
        dense_row = self.X[idx].toarray()
        features = torch.FloatTensor(dense_row).squeeze()
        target = torch.FloatTensor(self.y[idx])
        return (features, target)

class DNN(nn.Module):
    """Multi-label classifier: input_dim -> 256 -> 128 -> output_dim, sigmoid per label."""

    def __init__(self, input_dim, output_dim):
        super().__init__()
        # Assembled as a list first; module order (and therefore the
        # "net.<index>" state_dict keys) is what the saved weights depend on.
        blocks = [
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim),
            nn.Sigmoid(),
        ]
        self.net = nn.Sequential(*blocks)

    def forward(self, x):
        """Return one sigmoid activation per output label for each row of ``x``."""
        return self.net(x)

def train_pytorch_model():
    """Train a ``DNN`` on the module-level ``X_train`` / ``y_train`` and return it.

    Uses binary cross-entropy, Adam with default settings, shuffled
    mini-batches of 32 and 15 passes over the training data.
    """
    # Network sized from the training matrices
    n_features, n_labels = X_train.shape[1], y_train.shape[1]
    net = DNN(n_features, n_labels)
    loss_fn = nn.BCELoss()
    opt = torch.optim.Adam(net.parameters())

    # Mini-batch iterator over the sparse training matrix
    batches = DataLoader(TextDataset(X_train, y_train), batch_size=32, shuffle=True)

    n_epochs = 15
    for _ in range(n_epochs):
        for features, targets in batches:
            opt.zero_grad()
            batch_loss = loss_fn(net(features), targets)
            batch_loss.backward()
            opt.step()

    return net

# Train all models
# OneVsRestClassifier wraps every scikit-learn model below but is not among
# this cell's imports, so it is imported here to keep the cell runnable.
from sklearn.multiclass import OneVsRestClassifier

print("\nTraining Models...")

# 1. Logistic Regression (One-vs-Rest): one balanced binary classifier per label
lr_model = OneVsRestClassifier(
    LogisticRegression(max_iter=1000, class_weight="balanced")
).fit(X_train, y_train)

# 2. SVM (One-vs-Rest)
svm_model = OneVsRestClassifier(
    LinearSVC(class_weight="balanced", dual=False)
).fit(X_train, y_train)

# 3. Perceptron (One-vs-Rest)
# The earlier per-sample partial_fit loop passed a single 1-D label row as the
# target for a single feature row, which OneVsRestClassifier.partial_fit reads
# as one class id per sample rather than a multi-label indicator row. A regular
# fit on the full indicator matrix trains one Perceptron per label.
perceptron = OneVsRestClassifier(Perceptron()).fit(X_train, y_train)

# 4. PyTorch DNN
dnn_model = train_pytorch_model()

# =============================================
# Step 3: Evaluation
# =============================================

def evaluate(y_true, y_pred, model_name):
    """Summarise multi-label predictions as a dict of scores.

    Returns a dict with the keys "Model", "Hamming Loss", "Micro-F1" and
    "Macro-F1", in that order.
    """
    report = {
        "Model": model_name,
        "Hamming Loss": hamming_loss(y_true, y_pred),
    }
    # Same F1 call twice, differing only in the averaging mode
    for column, averaging in (("Micro-F1", "micro"), ("Macro-F1", "macro")):
        report[column] = f1_score(y_true, y_pred, average=averaging)
    return report

# Score every scikit-learn model on the held-out split
results = [
    evaluate(y_test, estimator.predict(X_test), title)
    for title, estimator in (
        ("Logistic Regression", lr_model),
        ("SVM", svm_model),
        ("Perceptron", perceptron),
    )
]

# PyTorch DNN: densify the sparse test matrix, run inference, threshold at 0.5
dnn_model.eval()
with torch.no_grad():
    dense_test = torch.FloatTensor(X_test.toarray())
    y_proba = dnn_model(dense_test).numpy()
y_pred_dnn = (y_proba > 0.5).astype(int)
results.append(evaluate(y_test, y_pred_dnn, "PyTorch DNN"))

# Show one row per model
print("\nEvaluation Results:")
print(pd.DataFrame(results))

# =============================================
# Step 4: Save Artifacts
# =============================================

# Save models and vectorizer
# Each fitted scikit-learn object is pickled with joblib; the TF-IDF vectorizer
# is saved too because new text must go through the same fitted transform.
# NOTE(review): "svm_model.pkl", "perceptron_model.pkl" and "dnn_model.pth" are
# also written by the audio section earlier in this file; running both sections
# in the same directory leaves only the most recently written version of each.
joblib.dump(lr_model, "logistic_regression_model.pkl")
joblib.dump(svm_model, "svm_model.pkl")
joblib.dump(perceptron, "perceptron_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")
# Only the DNN weights (state dict) are stored; loading requires the DNN class.
torch.save(dnn_model.state_dict(), "dnn_model.pth")

# =============================================
# Step 5: Streamlit App Integration
# =============================================

def streamlit_app():
    """Render a Streamlit page that predicts defect labels for a typed report.

    All model artifacts are loaded from the files written by the save step.
    The DNN dimensions and the label names still come from the module-level
    ``X_train``, ``y_train`` and ``df``.
    """
    import streamlit as st

    st.title("Software Defect Prediction")

    # Load artifacts. The Perceptron is read from disk like the other models
    # rather than taken from the training-time global.
    tfidf = joblib.load("tfidf_vectorizer.pkl")
    sklearn_models = {
        "Logistic Regression": joblib.load("logistic_regression_model.pkl"),
        "Perceptron": joblib.load("perceptron_model.pkl"),
    }
    dnn_model = DNN(X_train.shape[1], y_train.shape[1])
    dnn_model.load_state_dict(torch.load("dnn_model.pth"))
    dnn_model.eval()  # disables dropout for inference

    # Label names in the same column order as the training label matrix
    # (every column except "report"), independent of where "report" sits.
    label_names = df.drop("report", axis=1).columns

    # UI Components
    model_choice = st.selectbox("Select Model", [*sklearn_models, "DNN"])
    user_input = st.text_area("Enter software defect report:")

    if st.button("Predict"):
        if not user_input.strip():
            # An empty report would vectorize to all zeros; ask for input instead.
            st.warning("Please enter a defect report first.")
        else:
            # Preprocess input with the fitted vectorizer
            X_input = tfidf.transform([user_input])

            # Get a 0/1 indicator per label
            if model_choice in sklearn_models:
                preds = sklearn_models[model_choice].predict(X_input)[0]
            else:
                with torch.no_grad():
                    scores = dnn_model(torch.FloatTensor(X_input.toarray()))
                preds = (scores > 0.5).float().numpy()[0]

            # Display results
            predicted = [label_names[i] for i, val in enumerate(preds) if val == 1]
            st.write(
                "Predicted Defects:",
                ", ".join(predicted) if predicted else "No defects predicted",
            )

if __name__ == "__main__":
    streamlit_app()

# Part 3: Interactive Streamlit App

# In [None]:
import streamlit as st
import librosa
import numpy as np
import joblib
import torch
import torch.nn as nn

# ======================
# Model Loading
# ======================
# File names below match what the two training sections of this file write.
# NOTE(review): joblib files are pickles; only load artifacts you produced.
audio_scaler = joblib.load("scaler.pkl")
# NOTE(review): "svm_model.pkl" and "perceptron_model.pkl" are written by BOTH
# training sections, so each holds whichever section ran last. Confirm they
# contain the intended (audio SVM / text Perceptron) models.
audio_svm = joblib.load("svm_model.pkl")
# Audio logistic regression is saved as "logistic_model.pkl"; nothing in this
# file writes "lr_model.pkl".
audio_lr = joblib.load("logistic_model.pkl")
# The TF-IDF vectorizer is saved as "tfidf_vectorizer.pkl".
text_tfidf = joblib.load("tfidf_vectorizer.pkl")
# Text logistic regression has its own file, distinct from the audio one.
text_lr = joblib.load("logistic_regression_model.pkl")
text_perceptron = joblib.load("perceptron_model.pkl")

# ======================
# Audio Components
# ======================
class AudioClassifier(nn.Module):
    """Binary audio classifier: input_dim -> 128 -> 64 -> 1 with sigmoid output.

    The attribute name ``layers`` and the module order determine the
    state_dict keys ("layers.0.weight", ...), which must match the saved
    weights for ``load_state_dict`` to succeed.
    """

    def __init__(self, input_dim):
        # A stray trailing colon after this call made the whole cell a SyntaxError.
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return the sigmoid output for ``x``."""
        return self.layers(x)

def process_audio(audio_file):
    """Turn an uploaded audio file into the scaled feature vector the models expect.

    The features must have the same layout as those the scaler and audio
    models were fitted on in the training section of this file: the first
    2 seconds at 22050 Hz, 13 MFCCs, padded or cropped to 87 frames and
    flattened to 13 * 87 = 1131 values. A mean/std summary (26 values) would
    be rejected by the scaler because its feature count differs.

    Args:
        audio_file: Path or file-like object accepted by ``librosa.load``.

    Returns:
        1-D array of scaled features.
    """
    n_frames = 87  # frame count used when the training features were built
    signal, sr = librosa.load(audio_file, sr=22050, duration=2)
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)

    # Zero-pad short clips / crop long ones on the time axis
    if mfcc.shape[1] < n_frames:
        mfcc = np.pad(mfcc, ((0, 0), (0, n_frames - mfcc.shape[1])))
    else:
        mfcc = mfcc[:, :n_frames]

    features = mfcc.flatten()
    return audio_scaler.transform([features])[0]

# ======================
# Text Components
# ======================
class TextClassifier(nn.Module):
    """Multi-label text classifier: input_dim -> 256 -> 128 -> output_dim, sigmoid per label.

    The layer stack is stored under ``net`` because the training section of
    this file saves its text DNN with state_dict keys "net.0.weight", ...;
    any other attribute name makes ``load_state_dict`` fail on key mismatch.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, output_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return one sigmoid activation per label for each row of ``x``."""
        return self.net(x)

# ======================
# Streamlit UI
# ======================
st.set_page_config(layout="wide")

# NOTE(review): both training sections of this file save their DNN weights to
# "dnn_model.pth", so that file holds whichever network was trained last.
# Confirm the audio and text DNNs are stored under distinct names before
# relying on both DNN options below.

# Audio Section
with st.expander("🔊 Deepfake Audio Detection", expanded=True):
    audio_file = st.file_uploader("Upload audio", type=["wav", "mp3"])
    audio_model = st.selectbox("Audio Model", ["SVM", "Logistic Regression", "DNN"])

    if audio_file:
        features = process_audio(audio_file)
        if audio_model == "DNN":
            model = AudioClassifier(len(features))
            # Weights are saved with the ".pth" extension by the training code.
            model.load_state_dict(torch.load("dnn_model.pth"))
            model.eval()
            with torch.no_grad():
                proba = model(torch.FloatTensor(features)).item()
        else:
            model = audio_svm if audio_model == "SVM" else audio_lr
            # Column 1 is the probability of label 1 (bonafide / real).
            proba = model.predict_proba([features])[0][1]
        st.write(f"Prediction: {'Real' if proba > 0.5 else 'Fake'} ({proba:.2%})")

# Text Section
with st.expander("📝 Defect Prediction", expanded=True):
    text_input = st.text_area("Input defect report")
    text_model = st.selectbox("Text Model", ["Logistic Regression", "Perceptron", "DNN"])

    if text_input:
        X = text_tfidf.transform([text_input])
        if text_model == "DNN":
            model = TextClassifier(X.shape[1], text_lr.classes_.shape[0])
            model.load_state_dict(torch.load("dnn_model.pth"))
            model.eval()  # without this, dropout stays active at inference
            with torch.no_grad():
                preds = model(torch.FloatTensor(X.toarray())).numpy()[0]
        else:
            model = text_lr if text_model == "Logistic Regression" else text_perceptron
            if hasattr(model, "predict_proba"):
                preds = model.predict_proba(X)[0]
            else:
                # Perceptron-based models expose no probabilities; use the
                # hard 0/1 predictions, which the 0.5 threshold below accepts.
                preds = model.predict(X)[0]
        preds = np.asarray(preds)
        # classes_ entries may be integers, so convert before joining.
        labels = [str(text_lr.classes_[i]) for i in np.where(preds > 0.5)[0]]
        st.write("Predicted defects:", ", ".join(labels) if labels else "None")