# Voice Data Detects Disease

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
from io import StringIO

# wfdb is the package that reads WFDB-format files
import wfdb

First, we need to read the information text files associated with all 208 subjects.

In [21]:
def read_info_file(file_path):
    """Parse a ``key: value`` text file into a dictionary.

    Each non-blank line that contains a colon is split on its first colon
    into a key and a value, both stripped of surrounding whitespace. Lines
    without a colon are ignored. If a key occurs more than once, the last
    occurrence wins.

    Value conversion:
      * an empty value or ``'NU'`` (any letter case) becomes ``np.nan``;
      * otherwise the substrings ``'gr'`` and ``'litres'`` are removed and
        commas are turned into dots; if the result parses as a float, the
        float is stored;
      * if it does not parse, the stripped original text is stored.

    Parameters
    ----------
    file_path : str
        Path of the text file to read.

    Returns
    -------
    dict
        Mapping of key to float, ``np.nan`` or str.
    """
    with open(file_path, 'r') as handle:
        raw_lines = handle.readlines()

    parsed = {}
    for raw in raw_lines:
        entry = raw.strip()
        # Blank lines and lines without a colon hold no key/value pair.
        if not entry or ':' not in entry:
            continue

        # Only the first colon separates key from value, so values such as
        # '12:30' stay intact.
        field, _, text = entry.partition(':')
        field = field.strip()
        text = text.strip()

        # Missing answers: empty value or the 'NU' marker.
        if not text or text.lower() == 'nu':
            parsed[field] = np.nan
            continue

        # Drop unit suffixes and use a dot as the decimal separator.
        candidate = text.replace('gr', '').replace('litres', '').replace(',', '.')
        try:
            parsed[field] = float(candidate)
        except ValueError:
            # Not numeric: keep the original (stripped) text.
            parsed[field] = text
    return parsed


Using wfdb, we will read the signal records (the .hea header and .dat data files) associated with all 208 subjects.

In [22]:
def read_signal_files(dat_file_path, hea_file_path):
    """Load one WFDB record and summarise its signal with basic statistics.

    Parameters
    ----------
    dat_file_path : str
        Path to the record's ``.dat`` file. A trailing ``.dat`` extension is
        removed to form the record name passed to ``wfdb.rdrecord``; a path
        without that extension is passed through unchanged.
    hea_file_path : str
        Path to the matching ``.hea`` file. Not used by this function; the
        parameter is kept so existing callers keep working.

    Returns
    -------
    dict
        ``'Mean_Signal'``, ``'Std_Signal'``, ``'Max_Signal'`` and
        ``'Min_Signal'``, computed over every sample of the flattened signal.
    """
    # Strip only the trailing extension. A blanket str.replace('.dat', '')
    # would also mangle any other '.dat' occurrence in the path
    # (e.g. a directory named 'my.data').
    extension = '.dat'
    if dat_file_path.endswith(extension):
        record_name = dat_file_path[:-len(extension)]
    else:
        record_name = dat_file_path

    # Load the record using wfdb
    record = wfdb.rdrecord(record_name)

    # Flatten so the statistics cover all samples.
    # Assumes a single-channel recording; with several channels the samples
    # of all channels would be pooled together.
    signal = record.p_signal.flatten()

    return {
        'Mean_Signal': np.mean(signal),
        'Std_Signal': np.std(signal),
        'Max_Signal': np.max(signal),
        'Min_Signal': np.min(signal),
    }

Implement a loop to read all the files for the 208 subjects within the database.

In [23]:
# Directory that holds the voice-database files
base_path = 'voice-database'

# One merged dictionary (info fields + signal statistics) per subject
all_subject_data = []

# Subject files are numbered voice001 ... voice208
for subject_id in range(1, 209):
    # All files of one subject share the same zero-padded stem
    stem = f'voice{subject_id:03}'
    info_path = os.path.join(base_path, stem + '-info.txt')
    dat_path = os.path.join(base_path, stem + '.dat')
    hea_path = os.path.join(base_path, stem + '.hea')
    txt_path = os.path.join(base_path, stem + '.txt')  # built here but not read in this cell

    # Parse the subject's info file and compute its signal statistics
    info_data = read_info_file(info_path)
    signal_data = read_signal_files(dat_path, hea_path)

    # Merge both dictionaries; on a key clash the signal statistics win
    combined_data = {**info_data, **signal_data}
    all_subject_data.append(combined_data)

# Build the DataFrame once from the collected records
all_subjects_df = pd.DataFrame(all_subject_data)


**Exploring the DataFrame we have created.**

In [24]:
# Preview the first rows of the combined DataFrame (info fields plus signal statistics)
all_subjects_df.head()

Unnamed: 0,Age,Gender,Diagnosis,Occupation status,Voice Handicap Index (VHI) Score,Reflux Symptom Index (RSI) Score,Smoker,Number of cigarettes smoked per day,Alcohol consumption,Amount of water's litres drink every day,...,Carbonated beverages,Tomatoes,Coffee,Chocolate,Soft cheese,Citrus fruits,Mean_Signal,Std_Signal,Max_Signal,Min_Signal
0,32.0,m,hyperkinetic dysphonia,Researcher,15.0,5.0,no,,casual drinker,1.5,...,almost never,sometimes,almost always,almost never,sometimes,sometimes,-0.000478,0.130235,0.590118,-0.462646
1,55.0,m,healthy,Employee,17.0,12.0,casual smoker,2.0,habitual drinker,0.5,...,almost always,sometimes,sometimes,sometimes,almost always,almost always,-0.000198,0.290999,0.90686,-0.87204
2,34.0,m,hyperkinetic dysphonia (nodule),Researcher,42.0,26.0,no,,casual drinker,1.5,...,sometimes,sometimes,almost always,sometimes,almost always,almost never,-0.000311,0.259768,0.925568,-0.756714
3,28.0,f,hypokinetic dysphonia,Researcher,20.0,9.0,casual smoker,,casual drinker,1.0,...,almost never,sometimes,always,sometimes,almost always,sometimes,-0.000523,0.051166,0.185791,-0.151184
4,54.0,f,hypokinetic dysphonia,Researcher,39.0,23.0,no,,casual drinker,1.5,...,never,sometimes,never,sometimes,sometimes,almost always,-0.000494,0.185828,0.586456,-0.534973
