In [None]:
import torch

In [2]:
filename = "dataset/geometry_data.txt"
# Each line of the file holds four space-separated numeric columns, in this order:
#   totalLength  segmentNum  length_cm  width_cm
totalLength = []
segmentNum = []
length_cm = []
width_cm = []

# `with` guarantees the file is closed even if a line fails to parse.
with open(filename) as f:
    for line in f:
        # Split once per line; split() with no argument also discards the
        # trailing newline / carriage return and tolerates repeated spaces.
        fields = line.split()
        if not fields:
            # Skip blank lines (e.g. a trailing empty line at the end of the file)
            # instead of failing on float('').
            continue
        totalLength.append(float(fields[0]))
        segmentNum.append(float(fields[1]))
        length_cm.append(float(fields[2]))
        width_cm.append(float(fields[3]))

In [2]:
# Quick sanity check: show the values collected in the totalLength list.
print(totalLength)

[16.12296, 16.89652, 16.21494, 16.47253, 16.12191, 16.32418, 14.60769, 13.88853, 16.27931, 17.30089, 15.25498, 16.56207, 14.24749, 15.28234, 16.33746, 14.6041, 16.12039, 13.97599, 17.33898, 15.84107, 16.50507, 15.52383, 17.06361, 15.68886, 14.09718, 16.86287, 16.75714, 17.21705, 14.28638, 15.96418, 14.90664, 15.84106, 17.16877, 16.5288, 15.02178, 13.80342, 15.62319, 17.23604, 15.77529, 13.54761, 14.14873, 14.74486, 14.16259, 14.55189, 16.25686, 15.30217, 14.41591, 14.10951, 15.65337, 13.8127, 13.92661, 13.51854, 16.76921, 13.83774, 14.53948, 15.22566, 14.22739, 14.08216, 16.97717, 15.69944, 16.91212, 14.90381, 15.10723, 14.45966, 14.23563, 15.16907, 17.11086, 15.46346, 14.85088, 14.97699, 16.62101, 14.46677, 13.88582, 17.2682, 15.80083, 14.43912, 16.78478, 13.6721, 16.09646, 16.09098, 15.68804, 16.47877, 16.2471, 14.97394, 16.62091, 17.21754, 15.44717, 15.28713, 15.53403, 16.77051, 16.07727, 16.74632, 14.90291, 17.00377, 15.9899, 14.33097, 15.38369, 16.87724, 14.40369, 14.41066, 14.744

# Formants


In [None]:
from scipy.fft import fft
from scipy.io import wavfile
from scipy.interpolate import interp1d
from db_tools import convert_dir_to_wav
import process_phn, numpy as np
from smooth import smooth
from typing import List, Tuple
import matplotlib.pyplot as plt

FFT_SLICE_RADIUS = 50 # number of samples taken on either side of a phone's center sample to form the FFT window
MAX_FREQ = 8000 # assumed max frequency of voice data: TIMIT data is sampled at 16 kHz, so the Nyquist frequency is 8 kHz
FORMANT_OVERLAP = 250 # two frequencies must be more than this many Hz apart for both to be chosen as formants

def extract_formants(wav_path: str, phn_path: str, plot = False) -> List[Tuple[str, Tuple[int, int, int, int, int]]]:
    """
    Estimate formants for every phone listed for an audio recording.

    For each (phone, start, end) entry returned by
    process_phn.extract_monophthong_times(phn_path), a window of up to
    2 * FFT_SLICE_RADIUS samples centred on the phone's midpoint is taken from the
    audio, transformed with an FFT, resampled to MAX_FREQ points and smoothed.
    The most intense points of that spectrum that are more than FORMANT_OVERLAP
    apart from each other are then chosen as formants.

    Parameters:
        wav_path: path to the audio file (read with scipy.io.wavfile.read).
        phn_path: path to the file describing the phones in that audio file.
        plot: if True, print the transcript (read from phn_path with "PHN"
            replaced by "TXT") and plot the spectral slice of every phone.

    Returns:
        A list with one (phone, formants) tuple per phone. `formants` is a tuple of
        positions in the resampled spectrum (labelled as Hz in the plots), sorted in
        increasing order. It normally has 5 entries; it can only be shorter if the
        spectrum runs out of sufficiently separated candidates.
    """
    out = []
    phn_data = process_phn.extract_monophthong_times(phn_path)
    # The sample rate is discarded; MAX_FREQ encodes the assumed 16 kHz sampling.
    _, wav_data = wavfile.read(wav_path)

    # If plot argument is True, print the transcript of the recording to provide context for the plots
    if plot:
        txt_data_path = phn_path.replace("PHN", "TXT")
        with open(txt_data_path) as transcript:
            print(transcript.read())

    for vowel_phone, vowel_start, vowel_end in phn_data:
        # Middle sample index of the phone
        fft_slice_middle = vowel_start + (vowel_end - vowel_start) // 2

        # Window of wav_data at the middle sample index ± FFT_SLICE_RADIUS.
        # The start is clamped at 0: a negative start index would be interpreted by
        # numpy as counting from the END of the array and select the wrong (or an
        # empty) window for phones located at the very beginning of the recording.
        slice_start = max(0, fft_slice_middle - FFT_SLICE_RADIUS)
        vowel_data = wav_data[slice_start : fft_slice_middle + FFT_SLICE_RADIUS]

        # Take the absolute value of the real part of the FFT of vowel_data, then stretch it out to MAX_FREQ length and smooth it out a bit
        # NOTE(review): this is |Re(FFT)|, not the complex magnitude |FFT| — confirm this is intended.
        vowel_data_fft = fft(vowel_data).real
        vowel_data_fft = abs(vowel_data_fft)[:len(vowel_data_fft) // 2] # Get rid of needless second half of FFT, it's symmetrical
        vowel_data_fft_interp = interp1d(np.arange(vowel_data_fft.size), vowel_data_fft)
        vowel_data_fft = vowel_data_fft_interp(np.linspace(0, vowel_data_fft.size - 1, MAX_FREQ))
        vowel_data_fft = smooth(vowel_data_fft, 20)

        # If plot argument is True, plot the spectral slice for each vowel
        if plot:
            print(vowel_phone)
            plt.semilogy(vowel_data_fft,'r')
            plt.suptitle(f"[{vowel_phone}]", fontsize=20)
            plt.xlabel("Frequency (Hz)")
            plt.ylabel("Log-amplitude")
            plt.show()

        # Associate the frequencies with their amplitudes and sort them in increasing amplitude
        formant_candidates = sorted(enumerate(vowel_data_fft), key=lambda x: x[1])

        # Walk the candidates from most to least intense, keeping each one that is more
        # than FORMANT_OVERLAP away from every formant chosen so far, until 5 are found.
        # Iterating (instead of indexing [-1] and popping) also ends cleanly if the
        # candidates are exhausted before 5 formants have been chosen.
        formants = []
        for candidate in reversed(formant_candidates):
            if len(formants) == 5:
                break
            if all(abs(chosen[0] - candidate[0]) > FORMANT_OVERLAP for chosen in formants):
                formants.append(candidate)

        # Report the chosen formants in order of increasing frequency
        formants = sorted(formants, key=lambda x: x[0])
        out.append((vowel_phone, tuple(formant[0] for formant in formants)))
    return out

# FNN

In [None]:
import torch 
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as dsets
from torch.autograd import Variable

In [None]:
# Parameters: input_dim, hidden_dim, output_dim

class FNN(nn.Module):
    """
    Feed-forward network made of three linear layers with a ReLU after each of
    the first two: Linear -> ReLU -> Linear -> ReLU -> Linear.

    Parameters:
        input_dim: number of input features of the first linear layer.
        hidden_dim: number of output features of the first linear layer and the
            number of input and output features of the second.
        output_dim: number of output features of the last linear layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()

        # First stage: linear map followed by its activation
        self.fc1, self.aFun1 = nn.Linear(input_dim, hidden_dim), nn.ReLU()

        # Second stage: linear map followed by its activation
        self.fc2, self.aFun2 = nn.Linear(hidden_dim, hidden_dim), nn.ReLU()

        # Output stage: linear map only, no activation is applied afterwards
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Apply the three stages to x in order and return the output of the last linear layer."""
        hidden = self.aFun1(self.fc1(x))
        hidden = self.aFun2(self.fc2(hidden))
        return self.fc3(hidden)

In [None]:
# Instantiate the model class

# TODO: the original cell left both of these assignments empty (a SyntaxError).
# Fill in the number of input features and the number of outputs for your data;
# they are passed to FNN as the sizes of the first and last linear layers.
input_dim = None
output_dim = None

# Width of the hidden layers (passed to FNN as hidden_dim)
hidden_dim = 100

# Fail with an actionable message rather than building a model with unset sizes.
if input_dim is None or output_dim is None:
    raise ValueError("Set input_dim and output_dim before instantiating FNN")

model= FNN(input_dim, hidden_dim, output_dim)

In [None]:
# Instantiate the loss function (cross-entropy)

cert = nn.CrossEntropyLoss()

# Instantiate the optimizer: SGD over the model's parameters with a learning rate of 0.01

Optimaztion = torch.optim.SGD(model.parameters(), lr=0.01)