# Fix seed for reproducibility

In [1]:
my_seed=19951008
from numpy.random import seed
seed(my_seed)
from tensorflow import random
random.set_seed(my_seed)
import torch
torch.manual_seed(my_seed)
#torch.set_deterministic(True)

<torch._C.Generator at 0x7fb6d9945f30>

# Import libraries

In [2]:
! ls ../../..

asvtorch  asvtorch_modified  src


In [3]:
import json
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import sys
sys.path.append("../../../asvtorch")
sys.path.append("../../../asvtorch/asvtorch")
from src.utterances.utterance_list import UtteranceList
import numpy as np
from asvtorch.src.backend.vector_processing import VectorProcessor
import itertools
import wandb

# Choose representation

In [4]:
! ls -lh /media/hdd1/khaled/voxceleb_ivector_outputs-correct/ivector_400/utterances

total 2,2G
-rw-rw-r-- 1 khaled khaled 2,2G huhti  1 07:13 trial_ivectors.pickle


In [5]:
folder = "/media/hdd1/khaled/voxceleb_ivector_outputs-correct/ivector_400/utterances/"
label = "trial_ivectors"

# Load and prepare data

In [6]:
%%time
plda_data = UtteranceList.load(label, folder)

Loading: /media/hdd1/khaled/voxceleb_ivector_outputs-correct/ivector_400/utterances/trial_ivectors.pickle
Loaded (45.412 s): /media/hdd1/khaled/voxceleb_ivector_outputs-correct/ivector_400/utterances/trial_ivectors.pickle
CPU times: user 26.6 s, sys: 13.2 s, total: 39.9 s
Wall time: 45.4 s


In [7]:
import pandas as pd
train_metadata = pd.read_csv('age-train.txt')
train_metadata.head()

Unnamed: 0,speaker_age,birth_year,Name,VoxCeleb_ID,video_id
0,44.0,1969.0,David_Boreanaz,id02022,Ahk1sThoa0c
1,67.0,1945.0,Ibrahim_Boubacar_Keïta,id03530,ndYJV-_3wT0
2,57.0,1955.0,Lena_Adelsohn_Liljeroth,id05103,grAhHj9q1K8
3,57.0,1959.0,Marisol_Touraine,id05666,BtjJl0sUac8
4,45.0,1972.0,Paul_Dickov,id06839,dVV1uvOhAn8


In [8]:
test_metadata = pd.read_csv('age-test.txt')
test_metadata.head()

Unnamed: 0,speaker_age,birth_year,Name,VoxCeleb_ID,video_id
0,26.0,1987.0,Arnd_Peiffer,id00778,DoazlQs250s
1,20.0,1995.0,Luke_Shaw,id05351,axdSgSJ2FFc
2,35.0,1979.0,Rose_Byrne,id07564,S-A97trX5Aw
3,42.0,1971.0,Bobby_Jindal,id01121,LLgaXyzg3-A
4,29.0,1984.0,Ryan_Lochte,id07643,VyPP1t5ZVb8


In [9]:
train_metadata['vox_yt_ids'] = train_metadata['VoxCeleb_ID'] + '-' + train_metadata['video_id']
train_vox_yt = list(train_metadata.vox_yt_ids)

In [10]:
test_metadata['vox_yt_ids'] = test_metadata['VoxCeleb_ID'] + '-' + test_metadata['video_id']
test_vox_yt = list(test_metadata.vox_yt_ids)

In [11]:
len(train_vox_yt), len(test_vox_yt)

(1690, 1127)

In [12]:
%%time
from tqdm import tqdm
train_indexes = []
plda_vox_yt_id = []
test_indexes = []
test_plda_vox_yt_id = []
for i, voxID_video_id in enumerate(tqdm(plda_data.get_utt_labels())):
    # Let's now remove the "recording" info from voxID-YT id
    current_id = "-".join(voxID_video_id.split("-")[0:-1])

    if current_id in train_vox_yt:
        train_indexes.append(i)
        plda_vox_yt_id.append(current_id)
    elif current_id in test_vox_yt:
        test_indexes.append(i)
        test_plda_vox_yt_id.append(current_id)

100%|██████████| 1128702/1128702 [01:34<00:00, 11909.07it/s]

CPU times: user 1min 31s, sys: 2.29 s, total: 1min 34s
Wall time: 1min 35s





In [13]:
len(plda_vox_yt_id), len(test_plda_vox_yt_id)

(12722, 8602)

In [14]:
%%time
train_utt_labels = np.array(plda_data.get_utt_labels())[train_indexes]
train_vectors = plda_data.embeddings[train_indexes]
test_utt_labels = np.array(plda_data.get_utt_labels())[test_indexes]
test_vectors = plda_data.embeddings[test_indexes]
y = []
y_test = []
for x in plda_vox_yt_id:
    speaker_age = train_metadata[train_metadata['vox_yt_ids'] == x].speaker_age.item()
    y.append(speaker_age)
for x in test_plda_vox_yt_id:
    speaker_age = test_metadata[test_metadata['vox_yt_ids'] == x].speaker_age.item()
    y_test.append(speaker_age)

CPU times: user 20 s, sys: 932 ms, total: 20.9 s
Wall time: 20.9 s


In [15]:
def get_correct_recordings_index(spk_labels, compute_least_freq=True):
    print("get_correct_recordings_index >>>")
    print("compute least freq:", compute_least_freq)
    if compute_least_freq:
        spk_labels_dict = {i:spk_labels.count(i) for i in set(spk_labels)}
        least_freq_spk = min(list(spk_labels_dict.values()))
    else:
        least_freq_spk = 1
    print("Least freq:", least_freq_spk)
    speaker_indexes = []
    frequency_spk_labels_dict = {}
    for x in set(spk_labels):
        frequency_spk_labels_dict[x] = 0
    for index, spk_id in enumerate(spk_labels):
        frequency_spk_labels_dict[spk_id] += 1
        if frequency_spk_labels_dict[spk_id] > least_freq_spk:
            next
        else:
            speaker_indexes.append(index)
    print("get_correct_recordings_index <<<")
    return speaker_indexes

In [16]:
unbalanced = True
if unbalanced:
    y = np.array(y)
    X = train_vectors
    y_test = np.array(y_test)
    X_test = test_vectors
else:
    #- Filtro registrazioni in modo da avere medesimo numero
    # - Faccio CV
    balanced_recordings_indexes = get_correct_recordings_index(plda_vox_yt_id)
    y = np.array(y)[balanced_recordings_indexes]
    X = train_vectors[balanced_recordings_indexes]
np.savetxt("final_results_age/" + 'train-y_true' + '.txt', np.array(y))

In [17]:
import scipy.linalg
def wccn(X, y, alpha, smooth=False, num_bins = 2):
    if num_bins >= 2:
        
        quantile = 100/num_bins

        if config['overlap_bin']:
            previous_threshold= (0,0)
            age_thresholds = []
            overlapping_age = config['age_overlap']
            for i in range(num_bins+1):
                if previous_threshold == (0,0):
                    current_threshold = (0, np.percentile(y,i*quantile))
                else:
                    current_threshold = (previous_threshold[1] - overlapping_age,  np.percentile(y,i*quantile))
                age_thresholds.append(current_threshold)
                previous_threshold = current_threshold
        else:
            age_thresholds = [(
                np.percentile(y ,i * quantile),
                np.percentile(y, (i + 1) * quantile)
            ) for i in range(num_bins)]
        
        obs = [X[(y <= up) & (y > low), :] for low,up in age_thresholds]
        W = np.zeros((X.shape[1], X.shape[1]))
        for w in obs:
            W = W + np.cov(w, rowvar=False, ddof=0)/(w.shape[0])
        W = W /len(obs)
        L = scipy.linalg.cholesky(np.linalg.inv(W), lower=True)
        return L
    else:
        print("Impossible to apply wccn!")

In [18]:
X.shape, y.shape,

(torch.Size([12722, 400]), (12722,))

In [19]:
from sklearn.model_selection import StratifiedKFold
import sklearn.linear_model

In [20]:
vox_ids = [x.split("-")[0] for x in plda_vox_yt_id]

In [21]:
del plda_data

In [22]:
def final_eval(X_train, y_train, preprocessing_strategy, model_name, spk_labels):
    X_train = X_train.cuda()
    """
    #kfold = sklearn.model_selection.KFold(n_splits=k, shuffle=True, random_state=config['seed'])
    unique_labels = list(set(spk_labels))
    unique_labels = np.array(unique_labels)

    
    train_labels = unique_labels
    
    if unbalanced:
        idx_train = [i for i, x in enumerate(spk_labels) if x in train_labels] 
    else:
        idx_train = train_index
    """

    # Filter train and test
    train_embeddings = X_train
    y_train = y

    if preprocessing_strategy == 'cwl':           
        vector_processor = VectorProcessor.train(train_embeddings, 'cwl', 'cuda:0')
        train_embeddings = vector_processor.process(train_embeddings)
        test_embeddings = vector_processor.process(test_embeddings)
    elif preprocessing_strategy == 'wccn':
        L = wccn(train_embeddings.cpu().numpy(), y_train, 0, num_bins=config['num_bins'])
        train_embeddings = torch.matmul(train_embeddings, torch.from_numpy(L).cuda().float())
        test_embeddings = torch.matmul(test_embeddings, torch.from_numpy(L).cuda().float())
    elif preprocessing_strategy == 'pca':
        from sklearn.decomposition import PCA
        pca = PCA(n_components=int(train_embeddings.shape[1]/2))
        train_embeddings = pca.fit_transform(train_embeddings.cpu().numpy())
        test_embeddings = pca.transform(test_embeddings.cpu().numpy())

    #folder = config['folder_fn'] + config['embedding'] + '_'
    #np.savetxt(folder + model_name + 'y_true' + config['timestamp'] + '.txt', np.array(y_test))


    if preprocessing_strategy != 'pca':
        train_embeddings = train_embeddings.cpu().numpy()
        #test_embeddings = test_embeddings.cpu().numpy()
    if model_name in ['cnn', 'cnn_increasing_conv']:
        train_embeddings = train_embeddings.reshape(train_embeddings.shape[0], train_embeddings.shape[1], 1)
        #test_embeddings = test_embeddings.reshape(test_embeddings.shape[0], test_embeddings.shape[1], 1)
        if model_name == 'cnn':
            model = basic_cnn(
                input_shape=(train_embeddings.shape[1], 1),
                apply_dropout=config['dropout']
                             )
        else:
            model = cnn_increasing_conv(
                input_shape=(train_embeddings.shape[1], 1),
                apply_dropout=config['dropout'])
        model_callbacks = [
            WandbCallback(),
            EarlyStopping(
                         monitor='val_loss',
                         patience=config['patience'],
                         verbose=0,
                         mode='auto',
                         baseline=None,
                         restore_best_weights=True)
        ] 
        model.fit(train_embeddings,
                  y_train,
                  #validation_data=(test_embeddings, y_test),
                  batch_size=config['batch_size'],
                  epochs=config['epochs'],
                  verbose=1,
                  callbacks=model_callbacks
                 )
    elif model_name == 'lm':
        model = sklearn.linear_model.LinearRegression().fit(train_embeddings, y_train)
    elif model_name == 'lasso':
        model = sklearn.linear_model.LassoCV(
            max_iter=50000,
            tol=0.005,
            normalize=True,
            n_jobs=-1).fit(train_embeddings, y_train)
    elif model_name == 'ridge':
        model = sklearn.linear_model.RidgeCV(normalize=True).fit(train_embeddings, y_train)
    elif model_name == 'svr_rbf':
        model = sklearn.svm.SVR().fit(train_embeddings, y_train)
    elif model_name == 'svr_poly':
        model = sklearn.svm.SVR(kernel='poly').fit(train_embeddings, y_train)
    elif model_name == 'svr_lin':
        model = sklearn.svm.SVR(kernel='linear').fit(train_embeddings, y_train)
    elif model_name == 'svr_sigm':
        model = sklearn.svm.SVR(kernel='sigmoid').fit(train_embeddings, y_train)
    y_pred = model.predict(train_embeddings)
    train_mae = sklearn.metrics.mean_absolute_error(y_pred=y_pred, y_true=y_train)
    mae_per_obs = np.abs(y_pred - y_train)
    std_mae = np.std(mae_per_obs)
    wandb.log(
        {
            'Train MAE': train_mae,
            'Scenario': 'test_mode',
            'train_std_mae': std_mae

        }
    )
    return model


In [23]:
test_vox_ids = [x.split("-")[0] for x in test_plda_vox_yt_id]
idx_test_balanced = get_correct_recordings_index(test_vox_ids, compute_least_freq=True)
X_test = X_test[idx_test_balanced]
X_test = X_test.cpu().numpy()
y_test = y_test[idx_test_balanced]
X_test.shape

get_correct_recordings_index >>>
compute least freq: True
Least freq: 1
get_correct_recordings_index <<<


(1127, 400)

# TRAINING and TEST section
## UNBALANCED - RIDGE

In [24]:
model='ridge'
strategy=''
n_bin=None
overlap=None
age=None
import time
timestr = time.strftime("%Y%m%d-%H%M%S")
config = {
    'seed' : 19951008,
    'log_interval' : 1,
    'model_name' : model,
    'feature_norm' : strategy,
    'num_bins': n_bin,
    'overlap_bin': overlap,
    'age_overlap': age,
    'dropout': None,
    'dataset' : 'age',
    'embedding' : 'i-vec',
    'folder_fn': 'ivectors/lin_reg/',
    'unbalanced': unbalanced,
    'timestamp': timestr,
    "final_train_eval": True

}
if model in ['cnn', 'cnn_increasing_conv']:
    config['filter_n'] = n_filt
    config['kernel_size'] = n_kern
    config['pool_size'] = n_pool
    config['dense_n'] = n_dense
    config['batch_norm_everywhere'] = norm_everywhere
    config['add_2nd_maxpool'] = second_max
    config['optimizer'] = optim
    config['additional_conv_max'] = add_conv_max
    config['momentum'] = momentum
    config['decay_rate'] = dec_rate

wandb.init(
    project='voxceleb_enrichment',
    name='_'.join([model,config['embedding'], strategy]),
    config=config
)
model=final_eval(X, y, strategy, model, spk_labels=plda_vox_yt_id)
wandb.run.finish()

Failed to query for notebook name, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable
[34m[1mwandb[0m: Currently logged in as: [33mhechmik[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.24 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


0,1
Train MAE,7.80493
Scenario,test_mode
train_std_mae,6.13329
_step,0
_runtime,55
_timestamp,1617258021


0,1
Train MAE,▁
train_std_mae,▁
_step,▁
_runtime,▁
_timestamp,▁


### Evaluation:

In [26]:
y_pred = model.predict(X_test)
test_mae = sklearn.metrics.mean_absolute_error(y_pred=y_pred, y_true=y_test)
mae_per_obs = np.abs(y_pred - y_test)
std_mae = np.std(mae_per_obs)
test_mae

9.539929539958408

In [27]:
import pickle
model_name="ridge_unbalanced"
# Save to file in the current working directory
pkl_filename = model_name + ".pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

In [28]:
!mkdir final_results_age

mkdir: cannot create directory ‘final_results_age’: File exists


In [29]:
np.savetxt("final_results_age/" + model_name +'-y_true' + '.txt', np.array(y_test))
np.savetxt("final_results_age/" + model_name + '-y_pred' + '.txt', np.array(y_pred))

## UNBALANCED - Lasso

In [30]:
model='lasso'
strategy=''
n_bin=None
overlap=None
age=None
import time
timestr = time.strftime("%Y%m%d-%H%M%S")
config = {
    'seed' : 19951008,
    'log_interval' : 1,
    'model_name' : model,
    'feature_norm' : strategy,
    'num_bins': n_bin,
    'overlap_bin': overlap,
    'age_overlap': age,
    'dropout': None,
    'dataset' : 'age',
    'embedding' : 'i-vec',
    'folder_fn': 'ivectors/lin_reg/',
    'unbalanced': unbalanced,
    'timestamp': timestr,
    "final_train_eval": True

}
if model in ['cnn', 'cnn_increasing_conv']:
    config['filter_n'] = n_filt
    config['kernel_size'] = n_kern
    config['pool_size'] = n_pool
    config['dense_n'] = n_dense
    config['batch_norm_everywhere'] = norm_everywhere
    config['add_2nd_maxpool'] = second_max
    config['optimizer'] = optim
    config['additional_conv_max'] = add_conv_max
    config['momentum'] = momentum
    config['decay_rate'] = dec_rate

wandb.init(
    project='voxceleb_enrichment',
    name='_'.join([model,config['embedding'], strategy]),
    config=config
)
model=final_eval(X, y, strategy, model, spk_labels=plda_vox_yt_id)
wandb.run.finish()

[34m[1mwandb[0m: wandb version 0.10.24 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


0,1
Train MAE,7.80129
Scenario,test_mode
train_std_mae,6.13506
_step,0
_runtime,24
_timestamp,1617258051


0,1
Train MAE,▁
train_std_mae,▁
_step,▁
_runtime,▁
_timestamp,▁


### Evaluation

In [31]:
y_pred = model.predict(X_test)
test_mae = sklearn.metrics.mean_absolute_error(y_pred=y_pred, y_true=y_test)
mae_per_obs = np.abs(y_pred - y_test)
std_mae = np.std(mae_per_obs)
test_mae

9.516635293969033

In [32]:
model_name="lasso_unbalanced"
# Save to file in the current working directory
pkl_filename = model_name + ".pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

In [33]:
np.savetxt("final_results_age/" + model_name +'-y_true' + '.txt', np.array(y_test))
np.savetxt("final_results_age/" + model_name + '-y_pred' + '.txt', np.array(y_pred))

## UNBALANCED - Linear regression

In [34]:
model='lm'
strategy=''
n_bin=None
overlap=None
age=None
import time
timestr = time.strftime("%Y%m%d-%H%M%S")
config = {
    'seed' : 19951008,
    'log_interval' : 1,
    'model_name' : model,
    'feature_norm' : strategy,
    'num_bins': n_bin,
    'overlap_bin': overlap,
    'age_overlap': age,
    'dropout': None,
    'dataset' : 'age',
    'embedding' : 'i-vec',
    'folder_fn': 'ivectors/lin_reg/',
    'unbalanced': unbalanced,
    'timestamp': timestr,
    "final_train_eval": True

}

wandb.init(
    project='voxceleb_enrichment',
    name='_'.join([model,config['embedding'], strategy]),
    config=config
)
model=final_eval(X, y, strategy, model, spk_labels=plda_vox_yt_id)
wandb.run.finish()

[34m[1mwandb[0m: wandb version 0.10.24 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


0,1
Train MAE,7.75568
Scenario,test_mode
train_std_mae,6.13336
_step,0
_runtime,16
_timestamp,1617258072


0,1
Train MAE,▁
train_std_mae,▁
_step,▁
_runtime,▁
_timestamp,▁


In [35]:
y_pred = model.predict(X_test)
test_mae = sklearn.metrics.mean_absolute_error(y_pred=y_pred, y_true=y_test)
mae_per_obs = np.abs(y_pred - y_test)
std_mae = np.std(mae_per_obs)
test_mae

9.443467508297424

In [36]:
model_name="lm_unbalanced"
# Save to file in the current working directory
pkl_filename = model_name + ".pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

In [37]:
np.savetxt("final_results_age/" + model_name +'-y_true' + '.txt', np.array(y_test))
np.savetxt("final_results_age/" + model_name + '-y_pred' + '.txt', np.array(y_pred))

## Balanced - Ridge
### Prepare balanced train set

In [38]:
balanced_recordings_indexes = get_correct_recordings_index(plda_vox_yt_id)
y = np.array(y)[balanced_recordings_indexes]
X = train_vectors[balanced_recordings_indexes]

get_correct_recordings_index >>>
compute least freq: True
Least freq: 1
get_correct_recordings_index <<<


In [39]:
unbalanced=False

### Train model

In [40]:
model='ridge'
strategy=''
n_bin=None
overlap=None
age=None
import time
timestr = time.strftime("%Y%m%d-%H%M%S")
config = {
    'seed' : 19951008,
    'log_interval' : 1,
    'model_name' : model,
    'feature_norm' : strategy,
    'num_bins': n_bin,
    'overlap_bin': overlap,
    'age_overlap': age,
    'dropout': None,
    'dataset' : 'age',
    'embedding' : 'i-vec',
    'folder_fn': 'ivectors/lin_reg/',
    'unbalanced': unbalanced,
    'timestamp': timestr,
    "final_train_eval": True

}
if model in ['cnn', 'cnn_increasing_conv']:
    config['filter_n'] = n_filt
    config['kernel_size'] = n_kern
    config['pool_size'] = n_pool
    config['dense_n'] = n_dense
    config['batch_norm_everywhere'] = norm_everywhere
    config['add_2nd_maxpool'] = second_max
    config['optimizer'] = optim
    config['additional_conv_max'] = add_conv_max
    config['momentum'] = momentum
    config['decay_rate'] = dec_rate

wandb.init(
    project='voxceleb_enrichment',
    name='_'.join([model,config['embedding'], strategy]),
    config=config
)
model=final_eval(X, y, strategy, model, spk_labels=plda_vox_yt_id)
wandb.run.finish()

[34m[1mwandb[0m: wandb version 0.10.24 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


0,1
Train MAE,8.69758
Scenario,test_mode
train_std_mae,6.3138
_step,0
_runtime,19
_timestamp,1617258098


0,1
Train MAE,▁
train_std_mae,▁
_step,▁
_runtime,▁
_timestamp,▁


In [41]:
y_pred = model.predict(X_test)
test_mae = sklearn.metrics.mean_absolute_error(y_pred=y_pred, y_true=y_test)
mae_per_obs = np.abs(y_pred - y_test)
std_mae = np.std(mae_per_obs)
test_mae

10.402044942891498

In [42]:
model_name="ridge_balanced"
# Save to file in the current working directory
pkl_filename = model_name + ".pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

In [43]:
np.savetxt("final_results_age/" + model_name +'-y_true' + '.txt', np.array(y_test))
np.savetxt("final_results_age/" + model_name + '-y_pred' + '.txt', np.array(y_pred))

## Balanced Lasso

In [44]:
model='lasso'
strategy=''
n_bin=None
overlap=None
age=None
import time
timestr = time.strftime("%Y%m%d-%H%M%S")
config = {
    'seed' : 19951008,
    'log_interval' : 1,
    'model_name' : model,
    'feature_norm' : strategy,
    'num_bins': n_bin,
    'overlap_bin': overlap,
    'age_overlap': age,
    'dropout': None,
    'dataset' : 'age',
    'embedding' : 'i-vec',
    'folder_fn': 'ivectors/lin_reg/',
    'unbalanced': unbalanced,
    'timestamp': timestr,
    "final_train_eval": True

}

wandb.init(
    project='voxceleb_enrichment',
    name='_'.join([model,config['embedding'], strategy]),
    config=config
)
model=final_eval(X, y, strategy, model, spk_labels=plda_vox_yt_id)
wandb.run.finish()

[34m[1mwandb[0m: wandb version 0.10.24 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


0,1
Train MAE,8.04339
Scenario,test_mode
train_std_mae,5.97349
_step,0
_runtime,8
_timestamp,1617258110


0,1
Train MAE,▁
train_std_mae,▁
_step,▁
_runtime,▁
_timestamp,▁


In [45]:
y_pred = model.predict(X_test)
test_mae = sklearn.metrics.mean_absolute_error(y_pred=y_pred, y_true=y_test)
mae_per_obs = np.abs(y_pred - y_test)
std_mae = np.std(mae_per_obs)
test_mae

10.126262786648606

In [46]:
model_name="lasso_balanced"
# Save to file in the current working directory
pkl_filename = model_name + ".pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

In [47]:
np.savetxt("final_results_age/" + model_name +'-y_true' + '.txt', np.array(y_test))
np.savetxt("final_results_age/" + model_name + '-y_pred' + '.txt', np.array(y_pred))

## Balanced - Linear regression

In [48]:
model='lm'
strategy=''
n_bin=None
overlap=None
age=None
import time
timestr = time.strftime("%Y%m%d-%H%M%S")
config = {
    'seed' : 19951008,
    'log_interval' : 1,
    'model_name' : model,
    'feature_norm' : strategy,
    'num_bins': n_bin,
    'overlap_bin': overlap,
    'age_overlap': age,
    'dropout': None,
    'dataset' : 'age',
    'embedding' : 'i-vec',
    'folder_fn': 'ivectors/lin_reg/',
    'unbalanced': unbalanced,
    'timestamp': timestr,
    "final_train_eval": True

}

wandb.init(
    project='voxceleb_enrichment',
    name='_'.join([model,config['embedding'], strategy]),
    config=config
)
model=final_eval(X, y, strategy, model, spk_labels=plda_vox_yt_id)
wandb.run.finish()

[34m[1mwandb[0m: wandb version 0.10.24 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


0,1
Train MAE,7.53145
Scenario,test_mode
train_std_mae,5.73478
_step,0
_runtime,12
_timestamp,1617258128


0,1
Train MAE,▁
train_std_mae,▁
_step,▁
_runtime,▁
_timestamp,▁


In [49]:
y_pred = model.predict(X_test)
test_mae = sklearn.metrics.mean_absolute_error(y_pred=y_pred, y_true=y_test)
mae_per_obs = np.abs(y_pred - y_test)
std_mae = np.std(mae_per_obs)
test_mae

10.332942989725082

In [50]:
model_name="lm_balanced"
# Save to file in the current working directory
pkl_filename = model_name + ".pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)

In [51]:
np.savetxt("final_results_age/" + model_name +'-y_true' + '.txt', np.array(y_test))
np.savetxt("final_results_age/" + model_name + '-y_pred' + '.txt', np.array(y_pred))