In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,ConfusionMatrixDisplay,classification_report
import matplotlib.pyplot as plt

In [None]:
import pandas as pd
# Load the label table from the mounted Google Drive folder.
# The displayed frame below shows two columns: `file_id` and `gender`.
data = pd.read_csv('/content/drive/MyDrive/dataset/train.csv')

In [None]:
data

Unnamed: 0,file_id,gender
0,common_voice_hi_26204093,male
1,common_voice_hi_26043935,male
2,common_voice_hi_24632150,male
3,common_voice_hi_24359471,female
4,common_voice_hi_24639127,male
...,...,...
1993,common_voice_hi_24225356,male
1994,common_voice_hi_26114822,male
1995,common_voice_hi_25998821,male
1996,common_voice_hi_23849314,male


# The data is highly imbalanced

In [None]:
# Number of rows per gender class, as a two-column frame (gender, count).
gender_counts = (
    data
    .groupby('gender')
    .size()
    .reset_index(name='count')
)
display(gender_counts)

Unnamed: 0,gender,count
0,female,334
1,male,1664


# Picking a mini dataset

In [None]:
# Build a smaller, less skewed subset: up to 334 female and up to 650 male rows.
# The requested sizes are capped at the number of rows actually available, because
# DataFrame.sample(n=...) raises ValueError when n exceeds the population
# (sampling is without replacement by default).
# NOTE(review): none of the later cells visible in this notebook reference
# `new_data`; the feature-extraction cell re-reads the full CSV instead.
female_rows = data[data['gender'] == 'female']
male_rows = data[data['gender'] == 'male']

female_data = female_rows.sample(n=min(334, len(female_rows)), random_state=42)
male_data = male_rows.sample(n=min(650, len(male_rows)), random_state=42)

# Female rows first, then male rows; original row indices are kept.
new_data = pd.concat([female_data, male_data])

display(new_data)

Unnamed: 0,file_id,gender
186,common_voice_hi_25154337,female
1800,common_voice_hi_24360599,female
412,common_voice_hi_25154389,female
1116,common_voice_hi_26239091,female
323,common_voice_hi_24969916,female
...,...,...
103,common_voice_hi_25982521,male
48,common_voice_hi_26354239,male
1933,common_voice_hi_26204010,male
152,common_voice_hi_23852747,male


In [None]:
new_data

Unnamed: 0,file_id,gender
186,common_voice_hi_25154337,female
1800,common_voice_hi_24360599,female
412,common_voice_hi_25154389,female
1116,common_voice_hi_26239091,female
323,common_voice_hi_24969916,female
...,...,...
103,common_voice_hi_25982521,male
48,common_voice_hi_26354239,male
1933,common_voice_hi_26204010,male
152,common_voice_hi_23852747,male


# Extract features from the audio clips

In [None]:
import os
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

# === Step 1: Paths and extraction settings ===
AUDIO_PATH = "drive/MyDrive/dataset/train"   # directory holding the <file_id>.mp3 clips
CSV_PATH = "drive/MyDrive/dataset/train.csv"  # label table with file_id and gender columns
SAMPLE_RATE = 22050   # sampling rate handed to librosa.load
N_MFCC = 13           # number of MFCC coefficients requested per frame

# === Step 2: Load the label table ===
df_labels = pd.read_csv(CSV_PATH)
print("‚úÖ CSV loaded:", df_labels.shape)
print(df_labels.head())

# === Step 3: Turn the gender strings into integer codes ===
# Fit first, then transform, so the fitted encoder `le` stays available afterwards.
le = LabelEncoder()
le.fit(df_labels["gender"])
df_labels["gender_encoded"] = le.transform(df_labels["gender"])

# Show which integer each class name was assigned.
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("\nüîπ Label encoding mapping:", label_mapping)

# === Step 4: Extract MFCC function ===
def extract_mfcc(file_path):
    """Load one audio file and return its MFCC matrix.

    The clip is loaded with librosa at SAMPLE_RATE and N_MFCC coefficients
    are computed. Callers in this notebook treat axis 1 of the returned array
    as the frame axis (they read ``mfcc.shape[1]``).
    """
    signal, rate = librosa.load(file_path, sr=SAMPLE_RATE)
    return librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=N_MFCC)

# === Step 5: Extract MFCCs once per file and track the max frame length ===
# Each clip is decoded and transformed a single time. The MFCC matrices are kept
# in memory so that the padding step below does not have to reload the audio
# (previously every file was loaded twice: once to measure, once to process).
mfcc_cache = []      # (mfcc matrix, encoded label) for every file found on disk
missing_files = []   # file_ids listed in the CSV with no matching .mp3
max_len = 0

for file_id, gender_label in tqdm(
    zip(df_labels["file_id"], df_labels["gender_encoded"]),
    total=len(df_labels),
    desc="Processing audio",
):
    file_path = os.path.join(AUDIO_PATH, f"{file_id}.mp3")

    if not os.path.exists(file_path):
        missing_files.append(file_id)
        continue

    mfcc = extract_mfcc(file_path)
    max_len = max(max_len, mfcc.shape[1])
    mfcc_cache.append((mfcc, gender_label))

print(f"üîπ Maximum frame length found: {max_len}")

# === Step 6: Pad/Trim function ===
def pad_mfcc(mfcc, max_len):
    """Return `mfcc` with exactly `max_len` columns (axis 1).

    Shorter inputs are right-padded with zeros along axis 1; inputs that are
    already `max_len` or longer are truncated to the first `max_len` columns.
    """
    if mfcc.shape[1] < max_len:
        pad_width = max_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, ((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_len]
    return mfcc

# === Step 7: Build one flat feature row per clip ===
# Every matrix is padded to the common length, flattened row-major, and the
# encoded label is appended as the last element of the row.
data_rows = [
    np.append(pad_mfcc(mfcc, max_len).flatten(), gender_label)
    for mfcc, gender_label in mfcc_cache
]

# === Step 8: Convert to DataFrame ===
# Fail early with a readable message: with no rows the column assignment below
# would otherwise raise an opaque length-mismatch error.
if not data_rows:
    raise RuntimeError(
        f"No audio files were processed; check that AUDIO_PATH ({AUDIO_PATH!r}) "
        "contains the .mp3 files listed in the CSV."
    )

df_features = pd.DataFrame(data_rows)
df_features.columns = [f"f{i}" for i in range(df_features.shape[1]-1)] + ["label"]

# The label was appended to a float feature vector, which upcasts it to float
# (1.0 / 0.0). Store it as an integer class id instead.
df_features["label"] = df_features["label"].astype(int)

print("\n‚úÖ Final dataset shape:", df_features.shape)

# === Step 9: Save to CSV ===
OUTPUT_CSV = "audio_features_gender_dataset_test.csv"
df_features.to_csv(OUTPUT_CSV, index=False)
print(f"üíæ Dataset saved successfully as: {OUTPUT_CSV}")

# Report (at most the first ten) CSV entries that had no audio file on disk.
if missing_files:
    print("\n‚ö†Ô∏è Missing files (not found in folder):", missing_files[:10], "...")


‚úÖ CSV loaded: (1998, 2)
                    file_id  gender
0  common_voice_hi_26204093    male
1  common_voice_hi_26043935    male
2  common_voice_hi_24632150    male
3  common_voice_hi_24359471  female
4  common_voice_hi_24639127    male

üîπ Label encoding mapping: {'female': np.int64(0), 'male': np.int64(1)}


Finding max frame length: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1998/1998 [01:31<00:00, 21.89it/s]


üîπ Maximum frame length found: 439


Processing audio: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1998/1998 [01:15<00:00, 26.51it/s]



‚úÖ Final dataset shape: (1998, 5708)
üíæ Dataset saved successfully as: audio_features_gender_dataset_test.csv


# Correcting the target label data type

In [None]:
# Reload the feature table written by the extraction cell above.
# NOTE(review): the extraction cell writes to a relative path while this reads
# from /content/ — assumes the notebook's working directory is /content; confirm.
df = pd.read_csv('/content/audio_features_gender_dataset_test.csv')
df

Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f5698,f5699,f5700,f5701,f5702,f5703,f5704,f5705,f5706,label
0,-574.782532,-574.782532,-574.782532,-574.782532,-574.782532,-574.782532,-574.782532,-574.782532,-455.466614,-314.775024,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-586.120544,-586.120544,-586.120544,-586.120544,-586.120544,-586.120544,-586.120544,-586.120544,-518.867676,-340.706879,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-574.140564,-574.140564,-574.140564,-574.140564,-574.140564,-574.140564,-574.140564,-574.140564,-574.140564,-574.140564,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-584.527344,-530.542542,-476.353546,-465.709595,-460.492523,-456.223358,-450.025024,-445.821930,-443.131165,-451.579071,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-529.723267,-469.574554,-395.072418,-384.778259,-408.282806,-415.518890,-419.215057,-429.356262,-452.158539,-457.296387,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1993,-674.801453,-674.801453,-653.942444,-642.859985,-658.755249,-661.210266,-664.422119,-666.317017,-664.966248,-663.801514,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1994,-568.691711,-568.691711,-568.691711,-568.691711,-568.691711,-568.691711,-568.691711,-568.691711,-497.911133,-346.384094,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1995,-765.633057,-709.184814,-653.840088,-643.214966,-654.505493,-660.623291,-666.057007,-680.841980,-693.213806,-710.703186,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1996,-565.381165,-535.522644,-511.489624,-512.039978,-511.222229,-507.454773,-504.868011,-504.518555,-511.605469,-513.085999,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Make sure the label column holds integer class ids, reporting the dtype
# before and after the cast.
label_column = df['label']
print("Original data type of 'label' column:", label_column.dtype)

df['label'] = label_column.astype(int)

print("New data type of 'label' column:", df['label'].dtype)

# Quick look at the first rows after the cast.
display(df.head())

Original data type of 'label' column: float64
New data type of 'label' column: int64


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f5698,f5699,f5700,f5701,f5702,f5703,f5704,f5705,f5706,label
0,-574.782532,-574.782532,-574.782532,-574.782532,-574.782532,-574.782532,-574.782532,-574.782532,-455.466614,-314.775024,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,-586.120544,-586.120544,-586.120544,-586.120544,-586.120544,-586.120544,-586.120544,-586.120544,-518.867676,-340.706879,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,-574.140564,-574.140564,-574.140564,-574.140564,-574.140564,-574.140564,-574.140564,-574.140564,-574.140564,-574.140564,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,-584.527344,-530.542542,-476.353546,-465.709595,-460.492523,-456.223358,-450.025024,-445.82193,-443.131165,-451.579071,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,-529.723267,-469.574554,-395.072418,-384.778259,-408.282806,-415.51889,-419.215057,-429.356262,-452.158539,-457.296387,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


# Save the corrected dataset

In [None]:
# Write the corrected table without the index column.
# Requires `df` from the cells above: running this cell on a fresh kernel
# raises NameError because `df` has not been defined yet.
df.to_csv("audio_dataset.csv", index=False)

NameError: name 'df' is not defined

Machine Learning Part

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegressionCV,RidgeClassifierCV,LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,f1_score,ConfusionMatrixDisplay,classification_report
import matplotlib.pyplot as plt

In [None]:
# Load the saved dataset and separate it into a feature matrix and a label vector.
df = pd.read_csv("/content/audio_dataset.csv")
feature_frame = df.drop(columns="label")
x = feature_frame.to_numpy()
y = df["label"].to_numpy()


In [None]:
# `y` already contains the integer codes produced during feature extraction
# (female -> 0, male -> 1). Re-fitting a LabelEncoder on those integers would make
# le.inverse_transform() return 0/1 instead of the gender names, so the encoder
# is fitted on the class names themselves; LabelEncoder sorts classes, which
# reproduces the same female=0 / male=1 mapping.
le = LabelEncoder()
le.fit(["female", "male"])
y = y.astype(int)
assert set(y) <= {0, 1}, "unexpected label values; expected only 0 (female) / 1 (male)"

# Fit the scaler on the training rows only, so that test-set statistics do not
# leak into the features the models are trained on. The helper split below must
# use the same test_size / random_state as the real split in the next cell:
# without stratification the shuffle depends only on the number of rows and the
# seed, so both calls select the same training rows.
x_train_rows, _ = train_test_split(x, test_size=0.2, random_state=42)
st = StandardScaler()
st.fit(x_train_rows)
x = st.transform(x)

In [None]:
# Hold out 20% of the rows for evaluation; random_state=42 fixes the shuffle.
# NOTE(review): the split is not stratified although the classes are imbalanced
# (334 female vs 1664 male in the label table) — consider stratify=y.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
# Candidate classifiers with scikit-learn defaults. The two estimators whose
# fitting involves randomness get a fixed seed (the same 42 used for the
# train/test split) so that repeated runs report the same scores.
m1 = LogisticRegressionCV()
m2 = RidgeClassifierCV()
m3 = DecisionTreeClassifier(random_state=42)
m4 = SVC()
m5 = LinearSVC(random_state=42)
m6 = KNeighborsClassifier()

In [None]:
# Train every candidate on the same training split. fit() trains the estimator
# in place and returns it, so m1..m6 keep referring to the fitted models.
for estimator in (m1, m2, m3, m4, m5, m6):
    estimator.fit(x_train, y_train)

In [None]:
# Predictions of each fitted model on the held-out split, in model order m1..m6.
y1, y2, y3, y4, y5, y6 = (
    estimator.predict(x_test) for estimator in (m1, m2, m3, m4, m5, m6)
)

In [None]:
# Accuracy of each model on the held-out split.
accuracy_score1 = accuracy_score(y_test,y1) # LogisticRegressionCV
accuracy_score2 = accuracy_score(y_test,y2) # RidgeClassifierCV
accuracy_score3 = accuracy_score(y_test,y3) # DecisionTreeClassifier
accuracy_score4 = accuracy_score(y_test,y4) # SVC
accuracy_score5 = accuracy_score(y_test,y5) # LinearSVC
accuracy_score6 = accuracy_score(y_test,y6) # KNeighborsClassifier


print(f"accuracy_score1 LogisticRegressionCV: {accuracy_score1}")
print(f"accuracy_score2 RidgeClassifierCV: {accuracy_score2}")
print(f"accuracy_score3 DecisionTreeClassifier: {accuracy_score3}")
print(f"accuracy_score4 SVC: {accuracy_score4}")
print(f"accuracy_score5 LinearSVC: {accuracy_score5}")
print(f"accuracy_score6 KNeighborsClassifier: {accuracy_score6}")

# Accuracy alone is misleading here: the label table is roughly 5:1 male to
# female, so always predicting the majority class already scores highly.
# Macro-averaged F1 weights both classes equally and exposes that.
model_predictions = (
    ("LogisticRegressionCV", y1),
    ("RidgeClassifierCV", y2),
    ("DecisionTreeClassifier", y3),
    ("SVC", y4),
    ("LinearSVC", y5),
    ("KNeighborsClassifier", y6),
)
for model_name, predicted in model_predictions:
    macro_f1 = f1_score(y_test, predicted, average="macro")
    print(f"f1_macro {model_name}: {macro_f1}")


accuracy_score1 LogisticRegressionCV: 0.94
accuracy_score2 RidgeClassifierCV: 0.86
accuracy_score3 DecisionTreeClassifier: 0.8625
accuracy_score4 SVC: 0.9425
accuracy_score5 LinearSVC: 0.83
accuracy_score6 KNeighborsClassifier: 0.9125


In [None]:
# Classify the first held-out clip and map the integer prediction back through
# the label encoder. The result is stored under its own name: assigning it to
# `x` would overwrite the feature matrix and break any re-run of the
# scaling / splitting cells above.
sample_features = x_test[0].reshape(1, -1)   # a single row, still 2-D as predict() expects
sample_prediction = m1.predict(sample_features)
gender = le.inverse_transform(sample_prediction)
gender[0]

'male'