In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import numpy as np
import torch
import torch.nn as nn
import pandas as pd


In [57]:
def read_txt_file(filename):

    inputs = []
    outputs = []
    
    with open(filename, 'r') as file:
        lines = file.readlines()
        current_input = []
        current_output = []
        # Line by line we strip and split all values 
        for line in lines:
            values = line.strip().split()
            # If not the end of a record 
            if values and values[0] != '1.0':
                # We add the whole line (if invalid its going to be set to NaN) 
                # First 12 set of values are input, the next 12 are output
                # I only got this by looking through the original matlab file and inferencing this fact
                # I could be wrong so maybe I'll ask the professor 
                input_values = [float(val) if val else np.nan for val in values[:12]]
                output_values = [float(val) if val else np.nan for val in values[12:]]
                current_input.append(input_values)
                current_output.append(output_values)
            # We're at the end
            elif values and values[0] == '1.0':
                inputs.append(current_input)
                outputs.append(current_output)
                current_input = []
                current_output = []
    return inputs, outputs


# Read the files
train_inputs, train_outputs = read_txt_file('ae.train')
test_inputs, test_outputs = read_txt_file('ae.test')

train_outputs = []
for i in range(269):
    speaker_index = (i // 30) + 1  # Assuming 9 speakers, each with 30 time series
    l = len(train_inputs[i])
    teacher = np.zeros((l, 9))
    teacher[:, speaker_index - 1] = 1  # One-hot encoding for speaker index
    train_outputs.append(teacher)

# Create teacher signals for test data
test_outputs = []
speaker_index = 1
block_counter = 0
block_lengths = [31, 35, 88, 44, 29, 24, 40, 50, 29]  # Assuming the same block lengths as in MATLAB code
for i in range(370):
    block_counter += 1
    if block_counter > block_lengths[speaker_index - 1]:
        speaker_index += 1
        block_counter = 1
    l = len(test_inputs[i])
    teacher = np.zeros((l, 9))
    teacher[:, speaker_index - 1] = 1  # One-hot encoding for speaker index
    test_outputs.append(teacher)

# READ: Different recording have different lengths 
# Do we a) shorten the recordings to the shortest one, 
# b) pad the recordings to the longest one, or
# c) something else?
# For now I'm doing b) but keep that in mind
max_len_train_inputs = max(len(ts) for ts in train_inputs)
max_len_train_outputs = max(len(ts) for ts in train_outputs)
max_len_test_inputs = max(len(ts) for ts in test_inputs)
max_len_test_outputs = max(len(ts) for ts in test_outputs)

train_inputs = [np.pad(ts, ((0, max_len_train_inputs - len(ts)), (0, 0)), mode='constant', constant_values=np.nan) for ts in train_inputs]
train_outputs = [np.pad(ts, ((0, max_len_train_outputs - len(ts)), (0, 0)), mode='constant', constant_values=np.nan) for ts in train_outputs]
test_inputs = [np.pad(ts, ((0, max_len_test_inputs - len(ts)), (0, 0)), mode='constant', constant_values=np.nan) for ts in test_inputs]
test_outputs = [np.pad(ts, ((0, max_len_test_outputs - len(ts)), (0, 0)), mode='constant', constant_values=np.nan) for ts in test_outputs]

train_inputs = np.array(train_inputs)
test_inputs = np.array(test_inputs)
train_outputs = np.array(train_outputs)
test_outputs = np.array(test_outputs)



In [63]:
# BOOM 
print(train_inputs)
print(test_inputs)
print(train_outputs.shape)
print(test_outputs.shape)

[[[ 1.860936 -0.207383  0.261557 ... -0.306756 -0.213076  0.088728]
  [ 1.891651 -0.193249  0.235363 ... -0.289431 -0.247722  0.093011]
  [ 1.939205 -0.239664  0.258561 ... -0.314894 -0.227908  0.074638]
  ...
  [ 1.370862 -0.621346  0.600771 ... -0.105327 -0.193044  0.119152]
  [ 1.307289 -0.600573  0.620979 ... -0.167528 -0.175811  0.088565]
  [ 1.334578 -0.542157  0.558104 ... -0.188285 -0.13861   0.054478]]

 [[ 1.462484  0.174066  0.505133 ... -0.23763   0.120636  0.193254]
  [ 1.309815  0.120183  0.503046 ... -0.231087  0.121053  0.202386]
  [ 1.418207  0.015721  0.589994 ... -0.224317  0.175298  0.15667 ]
  ...
  [      nan       nan       nan ...       nan       nan       nan]
  [      nan       nan       nan ...       nan       nan       nan]
  [      nan       nan       nan ...       nan       nan       nan]]

 [[ 1.160837  0.078806  0.237706 ...  0.028707  0.07482   0.146297]
  [ 1.217979 -0.043693  0.378571 ...  0.03897   0.049702  0.164537]
  [ 1.234654 -0.107083  0.504189

In [60]:
import pandas as pd

train_inputs_2d = train_inputs.reshape(-1, 12)  # Flatten the first two dimensions
train_inputs_df = pd.DataFrame(train_inputs_2d, columns=[f'feature_{i}' for i in range(1, 13)])
train_inputs_df['time_series'] = np.repeat(np.arange(train_inputs.shape[0]), train_inputs.shape[1])
train_inputs_df['time_step'] = np.tile(np.arange(train_inputs.shape[1]), train_inputs.shape[0])
train_inputs_df_X = train_inputs_df.set_index(['time_series', 'time_step'])
train_outputs_2d = train_outputs.reshape(-1, 9)
train_outputs_df = pd.DataFrame(train_outputs_2d, columns=[f'speaker_{i}' for i in range(1, 10)])
train_outputs_df['time_series'] = np.repeat(np.arange(train_outputs.shape[0]), train_outputs.shape[1])
train_outputs_df['time_step'] = np.tile(np.arange(train_outputs.shape[1]), train_outputs.shape[0])
train_outputs_df_Y = train_outputs_df.set_index(['time_series', 'time_step'])

In [61]:
test_inputs_2d = test_inputs.reshape(-1, 12)  # Flatten the first two dimensions
test_inputs_df = pd.DataFrame(test_inputs_2d, columns=[f'feature_{i}' for i in range(1, 13)])
test_inputs_df['time_series'] = np.repeat(np.arange(test_inputs.shape[0]), test_inputs.shape[1])
test_inputs_df['time_step'] = np.tile(np.arange(test_inputs.shape[1]), test_inputs.shape[0])
test_inputs_df_X = test_inputs_df.set_index(['time_series', 'time_step'])
test_outputs_2d = test_outputs.reshape(-1, 9)
test_outputs_df = pd.DataFrame(test_outputs_2d, columns=[f'speaker_{i}' for i in range(1, 10)])
test_outputs_df['time_series'] = np.repeat(np.arange(test_outputs.shape[0]), test_outputs.shape[1])
test_outputs_df['time_step'] = np.tile(np.arange(test_outputs.shape[1]), test_outputs.shape[0])
test_outputs_df_Y = test_outputs_df.set_index(['time_series', 'time_step'])


In [51]:
train_inputs_df_X.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,feature_11,feature_12
time_series,time_step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,0,1.860936,-0.207383,0.261557,-0.214562,-0.171253,-0.118167,-0.277557,0.025668,0.126701,-0.306756,-0.213076,0.088728
0,1,1.891651,-0.193249,0.235363,-0.249118,-0.11289,-0.112238,-0.311997,-0.027122,0.171457,-0.289431,-0.247722,0.093011
0,2,1.939205,-0.239664,0.258561,-0.291458,-0.041053,-0.102034,-0.3833,0.019013,0.16951,-0.314894,-0.227908,0.074638
0,3,1.717517,-0.218572,0.217119,-0.228186,-0.018608,-0.137624,-0.403318,-0.009643,0.164607,-0.323267,-0.210105,0.098098
0,4,1.741191,-0.279891,0.196583,-0.236377,-0.032012,-0.090612,-0.363134,-0.012571,0.124298,-0.351171,-0.216545,0.113899
0,5,1.684695,-0.311977,0.195453,-0.23197,-0.06867,-0.003822,-0.34194,-0.008826,0.085097,-0.364329,-0.204794,0.101838
0,6,1.637373,-0.336227,0.152766,-0.223842,-0.026278,-0.009157,-0.363866,-0.003117,0.055479,-0.358107,-0.181643,0.082056
0,7,1.643283,-0.349773,0.131553,-0.154519,-0.035292,0.023719,-0.381399,-0.021189,0.020397,-0.340491,-0.156417,0.080884
0,8,1.60703,-0.382745,0.179038,-0.115949,-0.060406,0.0578,-0.364642,-0.06923,-0.019788,-0.355996,-0.115129,0.131928
0,9,1.617907,-0.527367,0.179878,-0.083292,0.031747,0.081424,-0.418227,-0.081175,-0.022385,-0.33766,-0.103184,0.102266


In [52]:
train_outputs_df_Y.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,speaker_1,speaker_2,speaker_3,speaker_4,speaker_5,speaker_6,speaker_7,speaker_8,speaker_9
time_series,time_step,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
mm = MinMaxScaler()
ss = StandardScaler()

# Normalize all values
X_ss = ss.fit_transform(train_inputs_df_X)
y_mm = mm.fit_transform(train_outputs_df_Y) 

ModuleNotFoundError: No module named 'sklearn'

In [53]:
train_inputs_df.to_csv('train_inputs.csv')
train_outputs_df.to_csv('train_outputs.csv')
test_inputs_df.to_csv('test_inputs.csv')
test_outputs_df.to_csv('test_outputs.csv')

In [54]:
# Convert data to PyTorch tensors
from torch.autograd import Variable 
X = train_inputs_df_X.iloc[:, :].values
Y = train_outputs_df_Y.iloc[:, :].values

train_X = Variable(torch.Tensor(X))
train_Y = Variable(torch.Tensor(Y))

#X_train_tensors_final = torch.reshape(train_X,   (X_train_tensors.shape[0], 1, X_train_tensors.shape[1]))
#X_test_tensors_final = torch.reshape(X_test_tensors,  (X_test_tensors.shape[0], 1, X_test_tensors.shape[1]))

In [55]:
class VoiceIdentificationLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(VoiceIdentificationLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        c_0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        out, _ = self.lstm(x, (h_0, c_0))
        out = self.fc(out[:, -1, :])
        return out

# Hyperparameters
input_size = 12
hidden_size = 64
num_layers = 2
output_size = 10  # Number of unique voices (classes)
learning_rate = 0.001
num_epochs = 100
batch_size = 32

# Initialize the model
model = VoiceIdentificationLSTM(input_size, hidden_size, num_layers, output_size)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [56]:

# Training loop
for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0.0
    epoch_acc = 0.0

    # Shuffle the training data
    combined = list(zip(train_inputs, train_outputs))
    np.random.shuffle(combined)
    train_inputs, train_outputs = zip(*combined)

    for i in range(0, len(train_inputs), batch_size):
        inputs = torch.nn.utils.rnn.pad_sequence([inp for inp in train_inputs[i:i+batch_size]], batch_first=True)
        targets = torch.tensor([out[0] for out in train_outputs[i:i+batch_size]], dtype=torch.long)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, outputs.size(-1)), targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item() * inputs.size(0)
        epoch_acc += (outputs.argmax(dim=1) == targets).sum().item()

    epoch_loss /= len(train_inputs)
    epoch_acc /= len(train_inputs)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.4f}')

    # Evaluation
    model.eval()
    eval_loss = 0.0
    eval_acc = 0.0

    with torch.no_grad():
        for inputs in test_inputs:
            inputs = inputs.unsqueeze(0)
            targets = torch.tensor([out[0] for out in test_outputs], dtype=torch.long)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            eval_loss += loss.item() * inputs.size(0)
            eval_acc += (outputs.argmax(dim=1) == targets).sum().item()

    eval_loss /= len(test_inputs)
    eval_acc /= len(test_inputs)
    print(f'Evaluation Loss: {eval_loss:.4f}, Accuracy: {eval_acc:.4f}')

# Save the trained model
torch.save(model.state_dict(), 'voice_identification_model.pth')

TypeError: expected Tensor as element 0 in argument 0, but got numpy.ndarray