In [1]:
# Make necessary directories

!mkdir /content/code
!mkdir /content/data
%cd /content/code

mkdir: cannot create directory ‘/content/code’: File exists
mkdir: cannot create directory ‘/content/data’: File exists
/content/code


Please place all code in `code/` and all data in `data/`.


The following cell copies files from Drive to data

In [2]:
!cp -r /content/drive/MyDrive/data/* /content/data

In [3]:
# Extract data (dev)

%cd /content/data
!tar -x -f /content/data/dev-clean.tar.gz
%cd /content/code

/content/data
/content/code


## Model

In [4]:
import numpy as np
import tensorflow as tf

from librispeech import get_librispeech_data
from model import Wav2VecPretraining

### Loading data

In [5]:
names, audio, transcripts = get_librispeech_data('../data/LibriSpeech/dev-clean')
print(np.shape(names), np.shape(audio), np.shape(transcripts))

Loading book ../data/LibriSpeech/dev-clean/3081
Loading book ../data/LibriSpeech/dev-clean/3576
Loading book ../data/LibriSpeech/dev-clean/652
Loading book ../data/LibriSpeech/dev-clean/2428
Loading book ../data/LibriSpeech/dev-clean/1993
Loading book ../data/LibriSpeech/dev-clean/2803
Loading book ../data/LibriSpeech/dev-clean/251
Loading book ../data/LibriSpeech/dev-clean/6319
Loading book ../data/LibriSpeech/dev-clean/1462
Loading book ../data/LibriSpeech/dev-clean/84
Loading book ../data/LibriSpeech/dev-clean/2412
Loading book ../data/LibriSpeech/dev-clean/5694
Loading book ../data/LibriSpeech/dev-clean/8842
Loading book ../data/LibriSpeech/dev-clean/2277
Loading book ../data/LibriSpeech/dev-clean/1988
Loading book ../data/LibriSpeech/dev-clean/1673
Loading book ../data/LibriSpeech/dev-clean/2035
Loading book ../data/LibriSpeech/dev-clean/5895
Loading book ../data/LibriSpeech/dev-clean/3853
Loading book ../data/LibriSpeech/dev-clean/3170
Loading book ../data/LibriSpeech/dev-clean/3

  result = asarray(a).shape


In [6]:
import IPython

index = 1
print(names[index], transcripts[index])
IPython.display.Audio(audio[index], rate=16000)

3081-166546-0001 INSTANTLY THEY ABSORBED ALL MY ATTENTION THOUGH I DARED NOT GIVE THEM A DIRECT LOOK AND CONTINUED TO OBSERVE THEM ONLY IN THE GLASS



In [7]:
# Group and reshape audio
# 10 seconds of audio
audio_window_size =  10 * 16000
audio_input = tf.keras.utils.pad_sequences(audio,
                                           maxlen=audio_window_size,
                                           padding='post',
                                           truncating='post',
                                           dtype='float32')
# Turn each sample into a 1-D vector
audio_input = tf.expand_dims(audio_input, -1)

In [8]:
import onehot as oh

import importlib
importlib.reload(oh)

transcript_window_size = 499
encoded_transcripts = oh.convert_to_chars(transcripts)
encoded_transcripts = tf.keras.utils.pad_sequences(encoded_transcripts,
                                                   maxlen=transcript_window_size,
                                                   padding='post',
                                                   truncating='post',
                                                   dtype='int')
# encoded_transcripts = tf.one_hot(encoded_transcripts, 27)
encoded_transcripts = tf.constant(encoded_transcripts)
# To test: decode the first encoded transcript
oh.decode_transcripts(encoded_transcripts[:10])

['WHEN WE TOOK OUR SEATS AT THE BREAKFAST TABLE IT WAS WITH THE FEELING OF BEING NO LONGER LOOKED UPON AS CONNECTED IN ANY WAY WITH THIS CASE                                                                                                                                                                                                                                                                                                                                                                        ',
 'INSTANTLY THEY ABSORBED ALL MY ATTENTION THOUGH I DARED NOT GIVE THEM A DIRECT LOOK AND CONTINUED TO OBSERVE THEM ONLY IN THE GLASS                                                                                                                                                                                                                                                                                                                                                                           

### Pre-training the Model for the first time

Skip this section when refining the model

In [9]:
!rm -r /content/*.py

rm: cannot remove '/content/*.py': No such file or directory


In [10]:
import cnn
import transformer
import quantization
import mask
import losses
import gumbelsoftmax
import model as m

import importlib
importlib.reload(cnn)
importlib.reload(transformer)
importlib.reload(quantization)
importlib.reload(m)
importlib.reload(mask)
importlib.reload(losses)
importlib.reload(gumbelsoftmax)

model = m.Wav2VecPretraining(127)

output = model(audio_input[:2])

tf.config.run_functions_eagerly(True)
model.compile(optimizer=tf.keras.optimizers.Adam(5e-4), run_eagerly=True)
model.fit(audio_input[:2], batch_size=2)





<keras.callbacks.History at 0x7fe1dc4b9c40>

In [11]:
from losses import contrastive_loss, diversity_loss

audio_sample = audio_input[:2]

norm_audio = model.normalization(audio_sample)
audio_features = model.cnn(norm_audio)

quantized_features = model.quantization(audio_features)

# Mask audio features
mask, masked_features = model.mask(audio_features)
# TODO: prediction should be same size as quantization. Not sure
# what the correct size for either would be.
prediction = model.transformer(masked_features, masked_features)

# Compare prediction with actual quantization
c_loss = contrastive_loss(quantized_features, prediction, mask)
# Add diversity loss
d_loss = diversity_loss(prediction)

loss = c_loss + model.diversity_weight * d_loss

In [12]:
model.summary()
model.transformer.summary()

Model: "wav2_vec_pretraining"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 normalization (Normalizatio  multiple                 3         
 n)                                                              
                                                                 
 speech_cnn (SpeechCNN)      multiple                  4204032   
                                                                 
 mask (Mask)                 multiple                  512       
                                                                 
 speech_transformer (SpeechT  multiple                 47342620  
 ransformer)                                                     
                                                                 
 quantization_module (Quanti  multiple                 512447    
 zationModule)                                                   
                                              

### Train the model for more time

In [13]:
tf.config.run_functions_eagerly(False)
num_examples=2000
model.fit(audio_input[:num_examples], batch_size=10)



<keras.callbacks.History at 0x7fe1dc23ad00>

## Fine-tuning

In [14]:
import cnn
import transformer
import quantization
import mask
import losses
import model as m

import importlib
importlib.reload(cnn)
importlib.reload(transformer)
importlib.reload(quantization)
importlib.reload(m)
importlib.reload(mask)
importlib.reload(losses)

fine_tuned = m.Wav2VecFineTuning(pretrained_model=model,
                                 num_classes=27,
                                 hidden_size=512)
fine_tuned.compile(optimizer=tf.keras.optimizers.Adam(5e-4),
                   loss=tf.keras.losses.SparseCategoricalCrossentropy(
                       from_logits=True))
fine_tuned.fit(audio_input[:1], encoded_transcripts[:1], batch_size=1)





<keras.callbacks.History at 0x7fe1dc2178e0>

In [15]:
num_examples=2703
fine_tuned.fit(audio_input[:num_examples], encoded_transcripts[:num_examples],
               batch_size=10,
               epochs=1)



<keras.callbacks.History at 0x7fdf5f501100>

In [16]:
import onehot as oh
from gumbelsoftmax import gumbel_softmax

import importlib
importlib.reload(oh)

num_preds = 5
temperature = 1.5

preds = fine_tuned(audio_input[:num_preds])
print(tf.shape(preds))
preds = preds / temperature
decoded = oh.decode_prob_transcripts(preds)

for a,t,d in zip(audio, transcripts, decoded):
  IPython.display.display(IPython.display.Audio(a, rate=16000))
  print(t)
  print(d)

tf.Tensor([  5 499  27], shape=(3,), dtype=int32)


WHEN WE TOOK OUR SEATS AT THE BREAKFAST TABLE IT WAS WITH THE FEELING OF BEING NO LONGER LOOKED UPON AS CONNECTED IN ANY WAY WITH THIS CASE

WX  DI MY R FT DLE GMA   AAN DE   P  S   DS    RYH TFBWKD  TET HOASMDGE  H   OM    MRNEOS  N Z LSSIUPC NO GZICGIAEEI  Y OR P  VFP TT HII GI   HKRET  MNS       UIWH T    EHN    RO SCRE ECH  C   YPRRYO  D  SR G    YDER ASN HAO    ZLE CCR E   ES E   IIRI SE    L B IINTTRP AH  FICN      HO   SA FP  RTNC  R L  IUS  SM F   NT  NAOSN QOAAANA EH  SN   N  RAYA M A EC  RTW   EAXE  D   L  EYS  TEAFB TH DBAL UTDHOWIIEA  EFN    RTW  F   DVU  R YDS W M   SL  O    GM  HSM PM EANPH UI SCO   P     PMTB  M A NT 


INSTANTLY THEY ABSORBED ALL MY ATTENTION THOUGH I DARED NOT GIVE THEM A DIRECT LOOK AND CONTINUED TO OBSERVE THEM ONLY IN THE GLASS

M RA BNPCELIE    HI E CI I  D C  Y Y EY  A DE         N   Y O T DT PMNS  U NWSH  GPFP AL  A WFSP SSA    Y R M F    UK T UTI DDAGIP JSLS OR  S  S IGL   TEO  N  ED A   OOLIR   DGGUUM ARH  OI R TLRMYY F S DSHCMVTHO  NP F     T OKYLHSRZ TAGSDG D  T   D  O   EBH BNE   EB   A  ED Q INNI B  SRS TSN RE   ESE  G HI    SW NM    TOSIY  M T     C  Y        V L  DD W  T FA F U  YFVA  LT TMA N   HGRFM RA BK  PUCA  WBN     TLRSN S HQ VSC   S S   NZ   DEEK   R SE SDEB  H O    SCE M W IHH    DG ROCGMA T WU   D 


YES AND A VERY RESPECTABLE ONE

WSLD DM ND FASOF PPV    R W  I F  WD T      R KC L OIP YAPCFG R  H  E N LLA  RR  AV  C P   Y  N RHMESI   D    NE    DDRISAJUS  I  TQ K  E HPWLHI CB EE    P H HHNEGAFSTM  A  ETEME  CP HDL  G T  HG H NY    M L B YLEXE  TR HMS EENSEB I      SEW ST OYWN  P WJULCF  D   LHAY  IS C   MNUMWGOOW       I B     W  F T  DO  E  E  J GI     EIS   I MRLAF   RJT  L L D T LAO MRTESUU H  EIB OWEGRAUID AN L   LE H R  DA  A E RL U LMHIAO E C IFH    R F NU  M EO C  DT EN  Y   Y  TEVIC ANHD D MT E    HITL  C NL AO  


THE LADY IS NOT THE MOTHER OF THE BOYS BUT THEIR AUNT

U H ESKEIIPIL WAAOUO TYH AD LS  B    HH  E  OHY ONE W    T  WLHAT   I    MS N RT AHG  N  OI G  U GIH  S YL    K QMNRSWOBHE L AWN     DC   SI D   N  IE    HR  O RDY I HM  NUEDUI  D SMDOD  HHTWHDO    N  DN N B LOATA   N ND O E X   N  SWO    L A  N IV BG YB A   GD  LTSH   O H  KNDLC E  B D  O Z    R EYM LRS  T N  E W H YYGA   RS  UDD RDRNIUW TSOAHPRND   T  INE F HD U C H     O  D  CHCUSOCU Y SEBL      DL S  PGSI  R W UE    O N  D NE NAT E Y AE LB AOH    DUSOR   UH       I  ME   G F H     TUO I   O


THE BOYS BELONG TO THE GENTLEMAN WHO IS A WIDOWER

       D IXDN  I  E DSHHFAERD    RN W  O    T HL FC I H EAP R  W SUDLFT OAEYASF F  E  ON T  D     O  RNA L   AYEN DY UD  AD  R N  DFE   E CAGY  UA  TFT A   AI B ST  INN RTC  OV   T D    LL   R  OS KT   UA   TTTN  RL O U  S   YS DR OI   ATPRBRR  HEASR WF   L W E S WE V ES  GH  SFVIO   IR  G TDSER S IH  Y H      OPS  ENI   HR NGL IF A  NHSPAALOS  YT  GYAOO H TEOW J HF EE R  BIA  R       PAEL   F   UM     G  Y EP S     RAGIY NCF  W N H H E ARDH Y WN    I H        J   STXO O    TQYEANI  H T   S    


In [17]:
tf.keras.models.save_model(
    model,
    'pretrain')
tf.keras.models.save_model(
    fine_tuned, 
    'fine_tuned')
!tar --create --gzip --file pretrain.tar.gz pretrain
!tar --create --gzip --file fine_tuned.tar.gz fine_tuned
!cp pretrain.tar.gz fine_tuned.tar.gz /content/drive/MyDrive/dl



In [9]:
del model

NameError: ignored

## Continue Training

Currently, this block only trains on labeled data.

In [9]:
!cp /content/drive/MyDrive/dl/pretrain.tar.gz /content/drive/MyDrive/dl/fine_tuned.tar.gz .
!tar --extract --file fine_tuned.tar.gz
!tar --extract --file pretrain.tar.gz

In [10]:
import numpy as np
import tensorflow as tf
from losses import FlattenedSparseCategoricalCrossentropy

In [11]:
pretrained = tf.keras.models.load_model('pretrain')
fine_tuned = tf.keras.models.load_model('fine_tuned',
  custom_objects={
      'FlattenedSparseCategoricalCrossentropy': FlattenedSparseCategoricalCrossentropy
  })

### Fine-tuning

In [12]:
fine_tuned.compile(optimizer=tf.keras.optimizers.Adam(5e-4),
                   loss=FlattenedSparseCategoricalCrossentropy(
                       from_logits=True))

num_examples=2703
fine_tuned.fit(audio_input[:num_examples], encoded_transcripts[:num_examples],
               batch_size=5,
               epochs=1)





<keras.callbacks.History at 0x7fe651acf520>

In [13]:
tf.keras.models.save_model(
    pretrained,
    'pretrain')
tf.keras.models.save_model(
    fine_tuned, 
    'fine_tuned')
!tar --create --gzip --file pretrain.tar.gz pretrain
!tar --create --gzip --file fine_tuned.tar.gz fine_tuned
!cp pretrain.tar.gz fine_tuned.tar.gz /content/drive/MyDrive/dl



In [14]:
import onehot as oh
from gumbelsoftmax import gumbel_softmax

import importlib
importlib.reload(oh)

num_preds = 5
temperature = 1.5

preds = fine_tuned(audio_input[:num_preds])
print(tf.shape(preds))
preds = preds / temperature
decoded = oh.decode_prob_transcripts(preds)

for a,t,d in zip(audio, transcripts, decoded):
  IPython.display.display(IPython.display.Audio(a, rate=16000))
  print(t)
  print(d)

tf.Tensor([  5 499  27], shape=(3,), dtype=int32)


WHEN WE TOOK OUR SEATS AT THE BREAKFAST TABLE IT WAS WITH THE FEELING OF BEING NO LONGER LOOKED UPON AS CONNECTED IN ANY WAY WITH THIS CASE

 R ST    AGH   CA JV  S  N TH ID  EN    U HNEE       O  O  V UH I I   R DT I   DHG  CI   APS TH    AME TS   RL T  HU  TA X     D  HI  IO SD       HTI V NOE    XS L HTR O    AXI HSETEN     WB S  RNO  LDEB S RL  E   BOUCHN  RDP XOT  P ECP R EBCS O  ASW MA FW   F    EY TIP W   AU PW HSA RO       O  TTIHC   RE O  YLS O  VAIKOES D D      TS SN EEWE  W  RT       YCG C    Y WC E   NH S   DWO    B    O A   TM  F YOENN   KN  L  I HGEOC ER HTH   U JE     H T  LOTC        EVTU LA       O RI IJA  UAENH JH 


INSTANTLY THEY ABSORBED ALL MY ATTENTION THOUGH I DARED NOT GIVE THEM A DIRECT LOOK AND CONTINUED TO OBSERVE THEM ONLY IN THE GLASS

A  DS HKY TU OZF     EI DE  M EU A  H A    B      U IDNOE  O   NFD X M E    DA    N  U   C BSUTHO ESED ITO E  NL    ANM    PI A   HH F N   A UF S I H   I    ESWF  I  A FN A  A  WO        R I   RR N  EEDNRS O   E E  IM   RO  F P AET  I I  E L KRT  B D T       CQO  IAASMAW    T  EG  SD  TP BDP AR TF   R T       NRO IVR  T  OEV   HERNR    N U G       W  AIYE  T  ONUV RH   E  AI     H  T   D A VY  TS B IR  UV  ME H    W   G  K  W H IY O    DE   SL   KL ODK     T    RMT   W UV FT OF VA H  OVD H M   


YES AND A VERY RESPECTABLE ONE

O   IIRMI   PC  K  HIM M MMML  ON T ST FV DH O  H  C N L R E  YTO  C     T   CERRBL R   V   ENT   HE T  MO E     B E    G   OY H  AL HF  AOG TPT RM EY  O  D  X HG FCO EAW   H T I    C        E M  X O  RC CWIANTT  TKIR   RE YEBVC     ME H U O C BE W ANSTTLEI RIET SLYCYS E A  H TCU  SDO D   L A R    FE  ENM  MAGW R  NDT Y E  LEH  EMN SKIIHL S  I  BT O PL  EN    H   H I  M R  HRFO   L     BUNET  OS O  T EF   A  V KL  HE RUNRTMA EN V TMBDLYMTO  N   PS    O    S    H  LE    U   M  I    LST YAEOUH SD


THE LADY IS NOT THE MOTHER OF THE BOYS BUT THEIR AUNT

N  TFNAEH RS   E X UOSO   T S  EO CE AN  C P H U  HEF     C F  L  I O   S           KDE   C   T G   DT SI MS  K  NDIIM  EY URT  E   F    H FT  O WTMD O DA F A T   D   T   O    O  AH     DIT   EE   OW MNGGN  WTBNLO  EA H SDA  KY     FOO  ZR DS  C STE IL S    B  YDO RT NN  BA  C   A F  B  LO  TBS L  O AUMI    GT  SOG UHS   PU          GODO HT MS H HS    E AWH          O    LM  H  ALC TE   U   M   M  E  NN     A T F  D UR C EH F  OWTELM  F  EHI A         ULLARVI  VW OMN      Y H  LUCHE  IWNAM     


THE BOYS BELONG TO THE GENTLEMAN WHO IS A WIDOWER

  FM  RE Q HI R OT AEL IE D NEHRHG   WP TF  OENPA U      YT  NENMNTHCT UNOOAW MU A  ND RO  ISE SII P M  TLTD IO N  RI  HL   LT NE   D EOC A  LYR DEQ Y JC  AOIG AH   N  TFE R  T  NHFE   O  RTMT   U      UDIRR CS  RN  I F US   CSPB     H DS G   RT  IRA    L    H    YO IF N UECYH  DCRUT  T RE H   ZKEV   A DH E DEK   O NTD I  HCI   G OSIDNG EVY  E     F LU    H C D H BN  S  RD VCA TS F    F  E  NIDCEC H  D O  A  L BH WVD VG   IN  IBU OR  OI   E  S RPOQG   JY  VEJ  H  AA T C   DE F  GKH  R E L   E  
