# Voice Data Detects Disease

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os

#wfdb is the package that can read WFDB format file
import wfdb

We need to read the information text files first associated with all 208 subjects

In [2]:
# Function to read and process the information text files
def read_info_file(file_path):
  data = {}
  with open(file_path, 'r') as file:
    for line in file:
      # Split the line into key and value parts
      if ':' in line:
        parts = line.split(':')
        if len(parts) >= 2:  # Check if there are at least two parts
          key = parts[0].strip()
          value = parts[1].strip()
          if value == 'NU':  # Handle the 'Not Used' values
            value = np.nan
          elif value.replace(',', '.').replace(' gr', '').replace(' litres', '').isdigit():
          # Convert numbers with commas as decimal points and remove units
            value = float(value.replace(',', '.').replace(' gr', '').replace(' litres', ''))
          elif value.isdigit():  # Convert digit strings to integers
            value = int(value)
            data[key] = value
          elif '\t' in line:
            parts = line.strip().split('\t')
            if len(parts) >= 2:  # Check if there are at least two parts
              key = parts[0].strip()
              value = parts[1].strip()
              data[key] = value
                
  # Convert the dictionary into a DataFrame with a single row
  info_df = pd.DataFrame([data])
    
  return info_df


Using wfdb we will read the .hea files associated with all 208 subjects.

In [3]:
def read_signal_files(dat_file_path, hea_file_path):
    # Load the record using wfdb
    record = wfdb.rdrecord(dat_file_path.replace('.dat', ''))
    
    # Extract the signal and annotations from the record
    signal = record.p_signal.flatten()  # Assuming it's a single-channel recording
    
    # Calculate basic statistics on the signal
    mean_signal = np.mean(signal)
    std_signal = np.std(signal)
    max_signal = np.max(signal)
    min_signal = np.min(signal)
    
    # Create a dictionary with the extracted data
    signal_data = {
        'Mean_Signal': mean_signal,
        'Std_Signal': std_signal,
        'Max_Signal': max_signal,
        'Min_Signal': min_signal
    }
    
    return signal_data

Implement a loop to start reading all the files for the 208 subjects within the databse

In [5]:
# Base path to the voice-database folder
base_path = 'voice-database'

# Initialize a list to hold all combined subject data
all_subject_data = []

# Loop over all subject IDs
for subject_id in range(1, 209):  # Assuming IDs from 001 to 208
    # Construct file paths
    info_path = os.path.join(base_path, f'voice{subject_id:03}-info.txt')
    dat_path = os.path.join(base_path, f'voice{subject_id:03}.dat')
    hea_path = os.path.join(base_path, f'voice{subject_id:03}.hea')
    txt_path = os.path.join(base_path, f'voice{subject_id:03}.txt')
    
    # Read and process info file
    info_data = read_info_file(info_path)
    
    # Read and process signal files (.dat and .hea)
    signal_data = read_signal_files(dat_path, hea_path)
    
    # Combine the info and signal data into one record
    combined_data = {**info_data, **signal_data}
    
    # Append the combined data to the list
    all_subject_data.append(combined_data)

# Optionally, convert the list to a DataFrame
all_subjects_df = pd.DataFrame(all_subject_data)


**Exploring the Dataframe we have crated.**

In [6]:
all_subjects_df.head()

Unnamed: 0,Gender:,Diagnosis:,Occupation status:,Smoker:,Alcohol consumption:,Amount of water's litres drink every day:,Carbonated beverages:,Tomatoes:,Coffee:,Chocolate:,Soft cheese:,Citrus fruits:,Mean_Signal,Std_Signal,Max_Signal,Min_Signal
0,"0 m Name: Gender:, dtype: object","0 hyperkinetic dysphonia Name: Diagnosis:, ...","0 Researcher Name: Occupation status:, dtyp...","0 no Name: Smoker:, dtype: object",0 casual drinker Name: Alcohol consumption:...,"0 1,5 Name: Amount of water's litres drink ...","0 almost never Name: Carbonated beverages:,...","0 sometimes Name: Tomatoes:, dtype: object","0 almost always Name: Coffee:, dtype: object","0 almost never Name: Chocolate:, dtype: object","0 sometimes Name: Soft cheese:, dtype: object","0 sometimes Name: Citrus fruits:, dtype: ob...",-0.000478,0.130235,0.590118,-0.462646
1,"0 m Name: Gender:, dtype: object","0 healthy Name: Diagnosis:, dtype: object","0 Employee Name: Occupation status:, dtype:...","0 casual smoker Name: Smoker:, dtype: object",0 habitual drinker Name: Alcohol consumptio...,"0 0,5 Name: Amount of water's litres drink ...",0 almost always Name: Carbonated beverages:...,"0 sometimes Name: Tomatoes:, dtype: object","0 sometimes Name: Coffee:, dtype: object","0 sometimes Name: Chocolate:, dtype: object","0 almost always Name: Soft cheese:, dtype: ...","0 almost always Name: Citrus fruits:, dtype...",-0.000198,0.290999,0.90686,-0.87204
2,"0 m Name: Gender:, dtype: object",0 hyperkinetic dysphonia (nodule) Name: Dia...,"0 Researcher Name: Occupation status:, dtyp...","0 no Name: Smoker:, dtype: object",0 casual drinker Name: Alcohol consumption:...,"0 1,5 Name: Amount of water's litres drink ...","0 sometimes Name: Carbonated beverages:, dt...","0 sometimes Name: Tomatoes:, dtype: object","0 almost always Name: Coffee:, dtype: object","0 sometimes Name: Chocolate:, dtype: object","0 almost always Name: Soft cheese:, dtype: ...","0 almost never Name: Citrus fruits:, dtype:...",-0.000311,0.259768,0.925568,-0.756714
3,"0 f Name: Gender:, dtype: object","0 hypokinetic dysphonia Name: Diagnosis:, d...","0 Researcher Name: Occupation status:, dtyp...","0 casual smoker Name: Smoker:, dtype: object",0 casual drinker Name: Alcohol consumption:...,,"0 almost never Name: Carbonated beverages:,...","0 sometimes Name: Tomatoes:, dtype: object","0 always Name: Coffee:, dtype: object","0 sometimes Name: Chocolate:, dtype: object","0 almost always Name: Soft cheese:, dtype: ...","0 sometimes Name: Citrus fruits:, dtype: ob...",-0.000523,0.051166,0.185791,-0.151184
4,"0 f Name: Gender:, dtype: object","0 hypokinetic dysphonia Name: Diagnosis:, d...","0 Researcher Name: Occupation status:, dtyp...","0 no Name: Smoker:, dtype: object",0 casual drinker Name: Alcohol consumption:...,"0 1,5 Name: Amount of water's litres drink ...","0 never Name: Carbonated beverages:, dtype:...","0 sometimes Name: Tomatoes:, dtype: object","0 never Name: Coffee:, dtype: object","0 sometimes Name: Chocolate:, dtype: object","0 sometimes Name: Soft cheese:, dtype: object","0 almost always Name: Citrus fruits:, dtype...",-0.000494,0.185828,0.586456,-0.534973
