<a href="https://colab.research.google.com/github/inderpreetsingh01/Speech/blob/main/Speaker_Identification_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets evaluate transformers[sentencepiece]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 5.1 MB/s 
[?25hCollecting evaluate
  Downloading evaluate-0.2.2-py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 10.3 MB/s 
[?25hCollecting transformers[sentencepiece]
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 60.5 MB/s 
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
[K     |████████████████████████████████| 141 kB 64.1 MB/s 
[?25hCollecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.8 MB/s 
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-

## Importing Model and libs 
Wav2Vec2 model from huggingface transformers has been used.

In [2]:
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor

In [3]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

## Loading Dataset 

*   The model is fine-tuned on the `librispeech_asr` dataset.
*   Only a small subset is loaded to test the pipeline: the first 2,000 utterances of `train.clean.100` (18 speakers).
*   The dataset is loaded in streaming mode because the full training set is large.

In [8]:
# Stream the AMI single-headset "train" split and keep only the first 4 records.
# Note: `dataset_streamed` and `dataset` are both reassigned by the
# librispeech_asr loading cell further down in this notebook.
dataset_streamed = load_dataset("ami-wav2vec2/ami_single_headset_segmented_and_chunked", split="train", streaming=True)
# next(iter(dataset_streamed))
dataset = list(dataset_streamed.take(4))
dataset[0]



{'builder_name': None,
 'citation': "@inproceedings{10.1007/11677482_3,\nauthor = {Carletta, Jean and Ashby, Simone and Bourban, Sebastien and Flynn, Mike and Guillemot, Mael and Hain, Thomas and Kadlec, Jaroslav and Karaiskos, Vasilis and Kraaij, Wessel and Kronenthal, Melissa and Lathoud, Guillaume and Lincoln, Mike and Lisowska, Agnes and McCowan, Iain and Post, Wilfried and Reidsma, Dennis and Wellner, Pierre},\ntitle = {The AMI Meeting Corpus: A Pre-Announcement},\nyear = {2005},\nisbn = {3540325492},\npublisher = {Springer-Verlag},\naddress = {Berlin, Heidelberg},\nurl = {https://doi.org/10.1007/11677482_3},\ndoi = {10.1007/11677482_3},\nabstract = {The AMI Meeting Corpus is a multi-modal data set consisting of 100 hours of meeting\nrecordings. It is being created in the context of a project that is developing meeting\nbrowsing technology and will eventually be released publicly. Some of the meetings\nit contains are naturally occurring, and some are elicited, particularly using 

In [11]:
# Inspect the record at index 1 of `dataset`.
dataset[1]

{'_data_files': [{'filename': 'dataset.arrow'}],
 '_fingerprint': '68622b9eeb68aedd',
 '_format_columns': ['audio', 'text'],
 '_format_kwargs': {},
 '_format_type': None,
 '_indexes': {},
 '_indices_data_files': None,
 '_output_all_columns': False,
 '_split': None}

In [15]:
# Check the Python type of the 'filename' value of the first '_data_files' entry.
type(dataset[1]['_data_files'][0]['filename'])

str

In [4]:
# Stream the LibriSpeech "train.clean.100" split and materialise only the
# first 2000 examples as a list, then display the first one.
dataset_streamed = load_dataset("librispeech_asr", split="train.clean.100", streaming=True)
# next(iter(dataset_streamed))
dataset = list(dataset_streamed.take(2000))
dataset[0]



{'file': '374-180298-0000.flac',
 'audio': {'path': '374-180298-0000.flac',
  'array': array([ 7.01904297e-04,  7.32421875e-04,  7.32421875e-04, ...,
         -2.74658203e-04, -1.83105469e-04, -3.05175781e-05]),
  'sampling_rate': 16000},
 'text': 'CHAPTER SIXTEEN I MIGHT HAVE TOLD YOU OF THE BEGINNING OF THIS LIAISON IN A FEW LINES BUT I WANTED YOU TO SEE EVERY STEP BY WHICH WE CAME I TO AGREE TO WHATEVER MARGUERITE WISHED',
 'speaker_id': 374,
 'chapter_id': 180298,
 'id': '374-180298-0000'}

## Preparing Train and Test Dataset

In [5]:
# Gather the speaker id of every loaded example, then tabulate how many
# examples each speaker contributes.
speaker_id = [data['speaker_id'] for data in dataset]
speaker_id_df = pd.DataFrame(speaker_id)
speaker_id_df.value_counts()

3240    127
8238    123
7635    122
5750    122
587     117
1246    117
7505    115
7800    115
374     113
1088    112
5456    112
5789    112
1263    109
2514    108
226     107
1743    103
5390     85
4214     81
dtype: int64

In [6]:
# Distinct speaker ids present in the loaded examples (column 0 holds the ids)
speaker_id_df[0].unique()

array([ 374, 7800, 2514, 3240, 1088, 5456, 5750, 1246, 8238, 1263, 7505,
        587,  226, 1743, 4214, 5789, 7635, 5390])

In [7]:
# Speakers with fewer than 100 audio samples are removed.
utterance_counts = speaker_id_df.value_counts()
rare_speakers = utterance_counts[utterance_counts < 100]
# Element 0 of each index entry is the bare speaker id.
speakers_to_remove = [entry[0] for entry in rare_speakers.index.values]
speakers_to_remove

[5390, 4214]

In [8]:
# Per speaker, the first TRAIN_SAMPLES_PER_SPEAKER examples (in dataset order)
# go to the training split; every remaining example goes to the test split.
TRAIN_SAMPLES_PER_SPEAKER = 90

train_dataset = []
test_dataset = []
speaker_id2label = {}

# Group the waveforms by speaker in one pass instead of rescanning the whole
# dataset once per speaker.
audio_by_speaker = {}
for sample in dataset:
  audio_by_speaker.setdefault(sample['speaker_id'], []).append(sample['audio']['array'])

# Labels are the 1-based position of a speaker in the first-appearance order of
# *all* speakers, so a removed speaker leaves a gap in the label range.
# The loop variable is deliberately not called `speaker_id`, so the list of ids
# built in the earlier cell is not overwritten.
for label, current_speaker in enumerate(speaker_id_df[0].unique(), start=1):
  if current_speaker in speakers_to_remove:
    continue
  speaker_id2label[current_speaker] = label
  for position, waveform in enumerate(audio_by_speaker[current_speaker]):
    target_split = train_dataset if position < TRAIN_SAMPLES_PER_SPEAKER else test_dataset
    target_split.append({'label': label, 'audio': waveform})

In [9]:
# Number of speakers that received a label, i.e. were not filtered out.
print(f"Number of Speakers: {len(speaker_id2label)}")

Number of Speakers: 16


In [10]:
# Mapping from original speaker id to the integer class label used for training.
speaker_id2label

{374: 1,
 7800: 2,
 2514: 3,
 3240: 4,
 1088: 5,
 5456: 6,
 5750: 7,
 1246: 8,
 8238: 9,
 1263: 10,
 7505: 11,
 587: 12,
 226: 13,
 1743: 14,
 5789: 16,
 7635: 17}

In [11]:
# Check which container type holds each raw waveform.
type(test_dataset[0]['audio'])

numpy.ndarray

In [12]:
# Report how many examples ended up in each split.
print(f"""
Number of training samples: {len(train_dataset)}
Number of test samples: {len(test_dataset)}
""")


Number of training samples: 1440
Number of test samples: 394



In [13]:
# Tabular view of the training split: one row per example, columns 'label' and 'audio'.
train_df = pd.DataFrame(train_dataset)
train_df.head()

Unnamed: 0,label,audio
0,1,"[0.000701904296875, 0.000732421875, 0.00073242..."
1,1,"[-9.1552734375e-05, -0.000152587890625, -0.000..."
2,1,"[-0.000244140625, -0.000244140625, -0.00018310..."
3,1,"[-0.000244140625, -0.000396728515625, -0.00057..."
4,1,"[0.000274658203125, 0.00030517578125, 0.000213..."


In [14]:
# Tabular view of the test split: one row per example, columns 'label' and 'audio'.
test_df = pd.DataFrame(test_dataset)
test_df.head()

Unnamed: 0,label,audio
0,1,"[-0.000213623046875, -0.0008544921875, -0.0013..."
1,1,"[6.103515625e-05, 6.103515625e-05, 3.051757812..."
2,1,"[0.000396728515625, -0.000274658203125, -0.000..."
3,1,"[-0.00067138671875, -0.000518798828125, -0.000..."
4,1,"[0.0, 3.0517578125e-05, 0.0, 6.103515625e-05, ..."


## Loading pretrained Model and FeatureExtractor

In [15]:
# The same checkpoint id is used for both the sequence-classification model
# and its feature extractor, so preprocessing matches the pretrained weights.
checkpoint = "superb/wav2vec2-base-superb-sid"
model = Wav2Vec2ForSequenceClassification.from_pretrained(checkpoint)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(checkpoint)

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [16]:
# Move the model to the selected device; the returned module's repr is displayed.
model.to(device)

Wav2Vec2ForSequenceClassification(
  (wav2vec2): Wav2Vec2Model(
    (feature_extractor): Wav2Vec2FeatureEncoder(
      (conv_layers): ModuleList(
        (0): Wav2Vec2GroupNormConvLayer(
          (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
          (activation): GELUActivation()
          (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
        )
        (1): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (2): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (3): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
          (activation): GELUActivation()
        )
        (4): Wav2Vec2NoLayerNormConvLayer(
          (conv): Conv1d(512, 512, kernel_size=(3,), strid

## Preparing DataLoader
*   A DataLoader is prepared for both the train and test datasets.
*   It makes it easy to iterate over the data and load it in the format the model expects.
*   A custom collate function performs dynamic padding: each batch is padded only to the length of its longest sequence.
*  This reduces the memory requirement compared to uniform padding.


 

In [17]:
def collate_batch(batch):
  """Collate ``(label, audio)`` pairs into a single padded model input.

  The waveforms are handed to the notebook-level ``feature_extractor`` with
  ``padding='longest'``, so each batch is padded only up to its own longest
  example.

  Args:
    batch: sequence of ``(label, audio)`` pairs.

  Returns:
    dict with ``input_values`` and ``attention_mask`` (both taken from the
    feature extractor output) and ``labels``, a 1-D tensor holding one label
    per pair.
  """
  waveforms = [audio for _, audio in batch]
  targets = [label for label, _ in batch]

  features = feature_extractor(waveforms, sampling_rate=16000, padding='longest', return_tensors="pt")
  return {
      'input_values': features['input_values'],
      'attention_mask': features['attention_mask'],
      # Flatten to shape (len(batch),) so there is exactly one label per example.
      'labels': torch.from_numpy(np.array(targets)).reshape((len(batch),)),
  }

In [18]:
from torch.utils.data import DataLoader

# Re-pack each DataFrame as (label, audio) tuples, the item layout that
# collate_batch iterates over.
train_dataset = list(zip(train_df['label'].values, train_df['audio'].values))
test_dataset = list(zip(test_df['label'].values, test_df['audio'].values))

# Only the training loader shuffles; both batch 8 examples through collate_batch.
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=8, collate_fn=collate_batch)

In [19]:
# for data in train_dataloader:
  # print(data)
  # print(data['input_values'].shape)
  # print(data['attention_mask'].shape)
  # print(data['labels'].shape)
  # break

## Training

### Optimizer

In [20]:
# transformers.AdamW is deprecated in favour of the PyTorch implementation.
# eps and weight_decay are set explicitly to the deprecated class's defaults
# (1e-6 and 0.0), because torch.optim.AdamW defaults to 1e-8 and 0.01.
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



### Learning rate Scheduler

In [21]:
from transformers import get_scheduler

num_epochs = 3
# The scheduler is stepped once per batch, so the schedule spans every batch
# of every epoch.
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

540


### Training loop
1. Fetching a batch
2. Forward Pass
3. Computing loss
4. Calculating gradients of the loss w.r.t. the params
5. Updating weights using gradients
6. Step of learning rate scheduler
7. Resetting gradients to zero to prevent accumulation 

In [22]:
from tqdm.auto import tqdm

# Print the current batch loss once every this many batches.
LOG_EVERY_N_STEPS = 10

# One progress-bar tick per optimizer step, across all epochs.
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    print(f'Epoch {epoch}')
    # The counter is named `step`, not `iter`, so the built-in iter() is not
    # overwritten at notebook scope (a later `next(iter(...))` would then fail).
    for step, batch in enumerate(train_dataloader):
        # Move every tensor of the batch to the same device as the model
        batch = {k: v.to(device) for k, v in batch.items()}
        # Forward pass; the batch includes `labels`, and the loss is read from the output
        outputs = model(**batch)
        loss = outputs.loss
        if step % LOG_EVERY_N_STEPS == 0:
            print(f'loss: {loss.item()}, iter: {step+1}')
        # Gradients of the loss w.r.t. the parameters
        loss.backward()
        # Update the weights using the gradients
        optimizer.step()
        # Advance the learning-rate schedule by one step
        lr_scheduler.step()
        # Reset gradients so they do not accumulate into the next batch
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/540 [00:00<?, ?it/s]

Epoch 0
loss: 29.98179054260254, iter: 1
loss: 9.77797794342041, iter: 11
loss: 3.824599266052246, iter: 21
loss: 0.0009927225764840841, iter: 31
loss: 0.0022532951552420855, iter: 41
loss: 2.0710842609405518, iter: 51
loss: 4.371391296386719, iter: 61
loss: 1.7398827075958252, iter: 71
loss: 6.5265398916380946e-06, iter: 81
loss: 0.004136262461543083, iter: 91
loss: 0.7181386947631836, iter: 101
loss: 2.6596339012030512e-05, iter: 111
loss: 0.0, iter: 121
loss: 1.4901152667334827e-07, iter: 131
loss: 3.843908416456543e-05, iter: 141
loss: 0.0, iter: 151
loss: 8.493644259033317e-07, iter: 161
loss: 0.0, iter: 171
Epoch 1
loss: 2.086160861836106e-07, iter: 1
loss: 9.089675359064131e-07, iter: 11
loss: 2.682206172721635e-07, iter: 21
loss: 0.0, iter: 31
loss: 1.996739683818305e-06, iter: 41
loss: 0.0, iter: 51
loss: 0.0, iter: 61
loss: 0.0, iter: 71
loss: 0.0, iter: 81
loss: 0.0, iter: 91
loss: 22.162242889404297, iter: 101
loss: 1.3388437032699585, iter: 111
loss: 8.940693874137651e-08,

## Evaluation


In [23]:
predictions = []
labels = []

model.eval()
# No gradients are needed for evaluation.
with torch.no_grad():
    for batch in test_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        logits = model(**batch).logits
        # Collect flat per-batch arrays on the CPU; they are joined after the loop.
        labels.append(batch['labels'].reshape(-1).cpu().numpy())
        predictions.append(logits.argmax(dim=-1).reshape(-1).cpu().numpy())

labels = np.concatenate(labels, axis=0)
predictions = np.concatenate(predictions, axis=0)
# Fraction of test examples whose highest-scoring class equals the true label
accuracy = (labels == predictions).sum() / len(predictions)
print(f"Accuracy: {accuracy*100}")

Accuracy: 100.0
