# 0. Functions import

In [3]:
!apt-get install libsndfile1 -y

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libsndfile1 is already the newest version (1.0.28-4ubuntu0.18.04.1).
0 upgraded, 0 newly installed, 0 to remove and 8 not upgraded.


In [1]:
import sys,os
sys.path.append('../src/')
sys.path.append('../src_charly/')

import numpy as np
#import glob2
#import joblib
import pathlib
from scipy.io.wavfile import write as write_waveform
from collections import OrderedDict
import matplotlib.pyplot as plt
#import seaborn as sns
import pickle
import pandas as pd

import torch
import torch.nn.functional as F
import torch.nn as nn
from torch import optim

from sklearn.metrics import accuracy_score

from spectrogram_stream import SpectrogramStream
from autoencoders import ConvolutionalAutoencoder
from encoders import ConvolutionalEncoder
from bottlenecks import ConvolutionalBottleneck
from reconstructors import ConvolutionalDecoder
from visualization import spectrogram_to_waveform, compute_reconstruction_plot

In [2]:
from models import spectrogram_model

# 1. Data loading

In [3]:
data_path = '../data'
models_path = '../models'
experiment_name = 'dataset2filtered_b64_baseline_larger_l1'
results_path = os.path.join('results', experiment_name)
frame_step = 46
n_iter = 300
sampling_rate = 16000
n_batch = 10 # batch size
n_epochs = 10 # epochs for training
test_train_split = 0.8
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [82]:
#load labels
label_df = pd.read_csv("../data/labels.tsv", sep='\t').drop(columns="Unnamed: 0").set_index("sound_id")

In [87]:
# load data stream
stream = SpectrogramStream(root_path=data_path, batch_size=n_batch,test_train_split=test_train_split, val_train_split=0.0, subsets_to_load=['train','test'], sr=sampling_rate)

#for w in range(len(stream.train_loader)) :#get training data (step by step)
    #data = next(iter(stream.train_loader))['sound'].unsqueeze(1).float()
    #spectrograms = data['sound'].unsqueeze(1).float().to(device)

Loading Spectrogram dataset...
Number of samples detected: 39718
Spectrogram dataset initialized.
Building Spectrogram dataloaders...
Building the data splitter...
Building the train data loader...
Train loader size: 3178
Building the test data loader...
Test loader size: 795
Spectrogram dataloaders built successfully


# 2. Model training

In [17]:
# instantiate model
model = spectrogram_model(2)
model.to(device)

# optimization and loss
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [18]:
# model training
for epoch in range(n_epochs):
    running_loss = 0
    for w in range(len(stream.train_loader)) :#get training data (step by step)
        data = next(iter(stream.train_loader))
        sound = data['sound'].unsqueeze(1).float().to(device)
        sound_id = data['sound_id']

        # target values for the chosen variable 
        var = "gender"
        target_mod = pd.DataFrame([],index=sound_id).merge(label_df[[var]], left_index=True, right_index=True)[var].values
        #target = torch.Tensor(np.zeros((10,3)))
        #for i in range(len(target_mod)):
        #    target[i][target_mod[i]] = 1
        target_mod = torch.Tensor(target_mod).long().to(device)
        
        # zero the parameter gradients
        optimizer.zero_grad()

        # compute output
        output = model(sound)

        # loss computation and propagation
        loss = criterion(output, target_mod)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        
        if w%500 == 0:
            print("Epoch N°", epoch, " batch n°", w, " running loss = ", running_loss/500)
            running_loss = 0
        

Epoch N° 0  batch n° 0  running loss =  0.0014781877994537354


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f0cdf422c10>
Traceback (most recent call last):
  File "/home/klegoff/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1203, in __del__
    self._shutdown_workers()
  File "/home/klegoff/.local/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1177, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.8/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.8/multiprocessing/popen_fork.py", line 44, in wait
    if not wait([self.sentinel], timeout):
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/usr/lib/python3.8/selectors.py", line 415, in select
    fd_event_list = self._selector.poll(timeout)
KeyboardInterrupt: 


KeyboardInterrupt: 

In [64]:
save = True
if save :
    torch.save(model.state_dict(), "spectrogram_" + var + "_classifier.statedict")
else :
    state_dict = torch.load("spectrogram_" + var +"_classifier.statedict")

# 3. Model Evaluation

## 1. Load Model

In [15]:
!ls

data				sound_id.pickle
latent_space_exploration.ipynb	spectrogram_age_classifier.statedict
models				spectrogram_classifier.ipynb
models.py			spectrogram_gender_classifier.statedict
projection.pickle		src
__pycache__			tsne_df.pickle
results


In [83]:
var = "gender" # "age", "accent"

In [11]:
# load the corresponding model
label_df[var + "_prediction"] = -1
model = spectrogram_model(2) # for gender : 2, for age : 3, for accent 15
model.load_state_dict(torch.load("models/spectrogram_" + var +"_classifier.statedict"))
device = "cpu"

# load data
stream = SpectrogramStream(root_path=data_path, batch_size=n_batch,test_train_split=test_train_split, val_train_split=0.0, subsets_to_load=['train','test'], sr=sampling_rate)

Loading Spectrogram dataset...
Number of samples detected: 39718
Spectrogram dataset initialized.
Building Spectrogram dataloaders...
Building the data splitter...
Building the train data loader...
Train loader size: 3178
Building the test data loader...
Test loader size: 795
Spectrogram dataloaders built successfully


## 2. Predict on test

In [85]:
# Make prediction to test the model

for w in range(len(stream.test_loader)) :#get test data (step by step)
    data = next(iter(stream.test_loader))
    sound = data['sound'].unsqueeze(1).float().to(device)
    sound_id = data['sound_id']

    # target values for the chosen variable 
    target_mod = pd.DataFrame([],index=sound_id).merge(label_df[[var]], left_index=True, right_index=True)[var].values
    #target = torch.Tensor(np.zeros((10,3)))
    #for i in range(len(target_mod)):
    #    target[i][target_mod[i]] = 1
    #target_mod = torch.Tensor(target_mod).long().to(device)

    # compute output
    output = model(sound)
    
    pred = torch.argmax(output, dim = 1)
    
    for i in range(len(sound_id)):
        label_df[var + "_prediction"].loc[label_df.index == sound_id[i]] = pred[i].item()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 235, in _feed
    close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/usr/lib/python3.8/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor


## 3. Analyze predictions

### 1. Gender prediction

In [86]:
label_df.gender_prediction.value_counts() # -1 values for prediction, correspond to the element of the training set (on which we do not predict)

-1    34707
 0     2998
 1     2013
Name: gender_prediction, dtype: int64

In [87]:
# confusion matrix
pd.crosstab(label_df.gender, label_df.gender_prediction).iloc[:,1:]

gender_prediction,0,1
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2779,400
1,219,1613


In [88]:
# accuracy
y_true = label_df.gender.loc[label_df.gender_prediction != -1]
y_pred = label_df.gender_prediction.loc[label_df.gender_prediction != -1]

accuracy_score(y_true, y_pred)

0.8764717621233287

### 2. Age prediction

In [75]:
label_df.age_prediction.value_counts() # -1 values for prediction, correspond to the element of the training set (on which we do not predict)

-1    34694
 1     4629
 0      395
Name: age_prediction, dtype: int64

In [76]:
# confusion matrix
pd.crosstab(label_df.age, label_df.age_prediction).iloc[:,1:]

age_prediction,0,1
age,Unnamed: 1_level_1,Unnamed: 2_level_1
0,213,1999
1,179,2558
2,3,72


In [78]:
# accuracy
y_true = label_df.age.loc[label_df.age_prediction != -1]
y_pred = label_df.age_prediction.loc[label_df.age_prediction != -1]

accuracy_score(y_true, y_pred) # accuracy is not very good (probably the age feature is irrelevant or not well defined)

0.5515525477707006

# 4. Predict on rigged data

## 4.1. Compute prediction over rigged data

In [4]:
def predict(var, shifted_file):
    
    # load model to predict on the variable
    if var == "gender":
        model = spectrogram_model(2) # for gender : 2, for age : 3, for accent 15
    else : 
        model = spectrogram_model(3) # for gender : 2, for age : 3, for accent 15

    model.load_state_dict(torch.load("../models/spectrogram_" + var +"_classifier.statedict"))
    device = "cuda:0"
    model = model.to(device)
    
    # load shifted dataset
    spectrogram = pickle.load(open("../data/reconstructed/" + shifted_file,"rb"))
    
    if spectrogram.shape[0] != 39700:
        print("len problem")
        return None
    
    spectrogram = torch.tensor(spectrogram)
    
    pred = []
    for n in range(80):
        if n%10==0:
            print(n)

        torch.cuda.empty_cache()

        data = spectrogram[500*n:500*(n+1)].to(device)

        with torch.no_grad():
            out = model.forward(data)

        pred.append(torch.argmax(out, dim = 1).to("cpu"))
    return torch.cat(pred)

In [18]:
files = os.listdir("../data/reconstructed/")
ids = pickle.load(open("../data/sound_id.pickle","rb"))
#load labels
label_df = pd.read_csv("../data/labels.tsv", sep='\t').drop(columns="Unnamed: 0").set_index("sound_id")

In [7]:
pred_list = []

In [8]:
for file in files:
    
    if "gender" in file:
        var = "gender"
    else :
        var = "age"
    
    #label_df[var + "_shift" + file[-9:-7]]
    pred = predict(var, file)
    if type(pred) == type(None):
        pass
    else :
        # add the predicted attributes in the label_df
        pred_list.append(pd.DataFrame(pred, index = ids[:len(pred)],columns=[var + "_shift" + file[-9:-7]]))
    
        #label_df = label_df.merge(pred_df,right_index=True,left_index=True)

0
10
20
30
40
50
60
70
0
10
20
30
40
50
60
70
0
10
20
30
40
50
60
70
0
10
20
30
40
50
60
70
len problem
0
10
20
30
40
50
60
70
0
10
20
30
40
50
60
70


In [10]:
#pickle.dump(pred_list, open("../data/pred_list.pickle","wb"))

In [59]:
for pred in pred_list:
    new_col = pred.columns[0]
    label_df[new_col] = -1
    
    label_df[new_col].iloc[:39700] = pred.values.reshape(39700)

In [63]:
label_df.to_csv("label_df.csv")

## 4.2. Analyze prediction

In [77]:
label_df.age_shift12.value_counts().iloc[:2]

1    36525
0     3175
Name: age_shift12, dtype: int64

In [80]:
# count values of shifted pred vs. real attributes

display(label_df.age.value_counts())

#display(label_df.age_shift12.value_counts().iloc[:2])

display(label_df.age_shift10.value_counts().iloc[:2])

#display(label_df.age_shift21.value_counts().iloc[:2])

display(label_df.age_shift01.value_counts().iloc[:2])

#display(label_df.age_shift20.value_counts().iloc[:2])

1    21480
0    17612
2      626
Name: age, dtype: int64

1    36531
0     3169
Name: age_shift10, dtype: int64

1    37274
0     2426
Name: age_shift01, dtype: int64

In [79]:
display(label_df.gender.value_counts())

display(label_df.gender_shift10.value_counts().iloc[:2])

0    25190
1    14528
Name: gender, dtype: int64

0    21391
1    18309
Name: gender_shift10, dtype: int64

In [66]:
for col in label_df.columns:
    if "age" in col:
        pd.crosstab

client_id
accent
age
gender
original_sentence
sentence
intervals
useful_signal_proportions
age_shift12
age_shift10
age_shift21
age_shift01
age_shift20
gender_shift10


In [70]:
pd.crosstab(label_df["age"], label_df["age_shift12"]).iloc[]

age_shift12,-1,0,1
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,18,1438,16156
1,0,1686,19794
2,0,51,575


In [72]:
pd.crosstab(label_df["gender"], label_df["gender_shift10"]).iloc[]

gender_shift10,-1,0,1
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,18,13579,11593
1,0,7812,6716


In [118]:
# fichier incomplet : 'reconstruction_shifted_age12.pickle'
# probleme : les ids de nos données ne sont pas uniques