In [7]:
import numpy as np
import librosa
import time
import joblib
import os

from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# --- 1. Load Dataset (Limited Subset for Safety) ---
try:
    dataset = load_dataset("CSALT/deepfake_detection_dataset_urdu", split="train[:100]")
    print("Dataset Loaded:", dataset[0].keys())
except Exception as e:
    print("Failed to load dataset:", e)
    exit()

# --- 2. Feature Extraction ---
def extract_mfcc(audio_array, sr):
    mfcc = librosa.feature.mfcc(y=audio_array, sr=sr, n_mfcc=13)
    return np.mean(mfcc.T, axis=0)

X = []

for i, item in enumerate(dataset):
    try:
        audio_array = item["audio"]["array"]
        sr = item["audio"]["sampling_rate"]
        mfcc = extract_mfcc(audio_array, sr)
        X.append(mfcc)
    except Exception as e:
        print(f"Error processing sample {i}: {e}")
    time.sleep(1)  # Avoid rate limits

X = np.array(X)
y = np.array([0 if i < len(X) // 2 else 1 for i in range(len(X))])  # Dummy binary labels

# --- 3. Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 4. Train and Save Classical Models ---
models = {
    "svm_audio_model.pkl": SVC(probability=True),
    "lr_audio_model.pkl": LogisticRegression(max_iter=1000),
    "perceptron_audio_model.pkl": Perceptron()
}

for filename, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred

    print(f"\n🧠 {filename} Results:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1-Score:", f1_score(y_test, y_pred))
    print("AUC-ROC:", roc_auc_score(y_test, y_prob))

    # ✅ Save model using joblib
    joblib.dump(model, filename)
    print(f"✅ Saved: {filename}")

# --- 5. Deep Neural Network (DNN) ---
dnn = Sequential([
    Dense(64, input_shape=(X.shape[1],), activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

dnn.compile(optimizer=Adam(learning_rate=0.001),
            loss='binary_crossentropy',
            metrics=['accuracy'])

dnn.fit(X_train, y_train, epochs=20, batch_size=16, verbose=0)

y_pred_dnn = (dnn.predict(X_test) > 0.5).astype(int).flatten()
y_prob_dnn = dnn.predict(X_test).flatten()

print("\n🧠 Deep Neural Network Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_dnn))
print("Precision:", precision_score(y_test, y_pred_dnn))
print("Recall:", recall_score(y_test, y_pred_dnn))
print("F1-Score:", f1_score(y_test, y_pred_dnn))
print("AUC-ROC:", roc_auc_score(y_test, y_prob_dnn))

# ✅ Save DNN model
dnn.save("dnn_audio_model.h5")
print("✅ Saved: dnn_audio_model.h5")


Repo card metadata block was not found. Setting CardData to empty.


Resolving data files:   0%|          | 0/6794 [00:00<?, ?it/s]

Dataset Loaded: dict_keys(['audio'])

🧠 svm_audio_model.pkl Results:
Accuracy: 0.65
Precision: 0.5384615384615384
Recall: 0.875
F1-Score: 0.6666666666666666
AUC-ROC: 0.7604166666666667
✅ Saved: svm_audio_model.pkl

🧠 lr_audio_model.pkl Results:
Accuracy: 0.8
Precision: 0.75
Recall: 0.75
F1-Score: 0.75
AUC-ROC: 0.9583333333333334
✅ Saved: lr_audio_model.pkl

🧠 perceptron_audio_model.pkl Results:
Accuracy: 0.75
Precision: 0.7142857142857143
Recall: 0.625
F1-Score: 0.6666666666666666
AUC-ROC: 0.7291666666666667
✅ Saved: perceptron_audio_model.pkl
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step





🧠 Deep Neural Network Results:
Accuracy: 0.6
Precision: 0.5
Recall: 0.875
F1-Score: 0.6363636363636364
AUC-ROC: 0.6145833333333334
✅ Saved: dnn_audio_model.h5


In [None]:
import pandas as pd

file_path = "dataset.csv"
df = pd.read_csv(file_path)

# Display basic information
print(df.info())
print(df.head())
print("Shape:", df.shape)
print("Missing values:\n", df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1386 entries, 0 to 1385
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   report                   1386 non-null   object
 1   type_blocker             1386 non-null   int64 
 2   type_regression          1386 non-null   int64 
 3   type_bug                 1386 non-null   int64 
 4   type_documentation       1386 non-null   int64 
 5   type_enhancement         1386 non-null   int64 
 6   type_task                1386 non-null   int64 
 7   type_dependency_upgrade  1386 non-null   int64 
dtypes: int64(7), object(1)
memory usage: 86.8+ KB
None
                                              report  type_blocker  \
0  The mention of Fix Super Stream Example in Doc...             0   
1  It seems like you need a concise summary relat...             0   
2  The issue AMQP 838 opened by Gary Russell invo...             0   
3  I m unable to access exte

In [3]:
from google.colab import files
uploaded = files.upload()

Saving dataset.csv to dataset.csv


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss, f1_score
import os
import warnings
from joblib import dump

warnings.filterwarnings("ignore")

# --- 1. Load Dataset ---
df = pd.read_csv("/content/dataset.csv")
print("✅ Dataset loaded successfully!")

# --- 2. Define Features and Labels ---
X_text = df['report']
y = df.iloc[:, 1:]  # Assuming binary labels start from column index 1

# --- 3. Feature Extraction ---
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(X_text)

# Save the TF-IDF vectorizer
os.makedirs("saved_models", exist_ok=True)
dump(tfidf, "saved_models/tfidf_vectorizer.pkl")

# --- 4. Train-Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 5. Check for Labels with No Positive Examples in Training Set ---
for col in y.columns:
    if y_train[col].sum() == 0:
        print(f"⚠️ Warning: Label '{col}' has no positive samples in training set.")

# --- 6. Filter Labels with No Positives in Training Set ---
y_train_filtered = y_train.loc[:, y_train.sum(axis=0) > 0]
y_test_filtered = y_test[y_train_filtered.columns]

# --- 7. Evaluation Function ---
def evaluate_model(name, model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(f"\n🧠 {name} Evaluation")
    print("Hamming Loss:", hamming_loss(y_test, y_pred))
    print("Micro-F1 Score:", f1_score(y_test, y_pred, average="micro", zero_division=0))
    print("Macro-F1 Score:", f1_score(y_test, y_pred, average="macro", zero_division=0))

# --- 8. Train and Save Models ---

# Logistic Regression
lr = OneVsRestClassifier(LogisticRegression(max_iter=1000))
lr.fit(X_train, y_train_filtered)
evaluate_model("Logistic Regression", lr, X_test, y_test_filtered)
dump(lr, "saved_models/lr_text_model.pkl")

# SVM
svm = OneVsRestClassifier(SVC(probability=True))
svm.fit(X_train, y_train_filtered)
evaluate_model("SVM", svm, X_test, y_test_filtered)
dump(svm, "saved_models/svm_text_model.pkl")

# Perceptron
perceptron = OneVsRestClassifier(Perceptron())
perceptron.fit(X_train, y_train_filtered)
evaluate_model("Perceptron", perceptron, X_test, y_test_filtered)
dump(perceptron, "saved_models/perceptron_text_model.pkl")


✅ Dataset loaded successfully!

🧠 Logistic Regression Evaluation
Hamming Loss: 0.12170263788968826
Micro-F1 Score: 0.8122109158186864
Macro-F1 Score: 0.4456896227693783

🧠 SVM Evaluation
Hamming Loss: 0.1157074340527578
Micro-F1 Score: 0.8224471021159153
Macro-F1 Score: 0.5142038183470409

🧠 Perceptron Evaluation
Hamming Loss: 0.13729016786570744
Micro-F1 Score: 0.7893284268629255
Macro-F1 Score: 0.6186586356023609


['saved_models/perceptron_text_model.pkl']