In [1]:
#Core Python, Pandas, and kaldi_io
import numpy as np
import pandas as pd
import string
from collections import Counter
import kaldi_io

#Scikit
from sklearn import manifold
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import pairwise_distances,average_precision_score
from sklearn.metrics.pairwise import pairwise_kernels
from scipy import stats

#Plotting
from matplotlib import pyplot as plt
import seaborn as sns
from scipy.spatial.distance import pdist


#Torch and utilities
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset,DataLoader,random_split,ConcatDataset

from data_helpers import DataHelper

################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################



Stats of the MuliView Trainset

In [2]:
filepath = "Data/multi_view/testset.list"

In [3]:
count = 0
words = []
with open(filepath) as fp:
    lines = fp.readlines()
    for line in lines: 
        count += 1
        #print("Line{}: {}".format(count, line.strip()))
        word = line.strip().split("_")[0]
        #print(word)
        words.append(word)

In [4]:
w = Counter(words)

In [5]:
print("Number of Examples {}".format(len(words)))
print("Number of Unique Words {}".format(len(w.keys())))
print("Maximum frequency {}".format(max(w.values())))
print("Minimum frequency {}".format(min(w.values())))
print("Maximum Character Length {}".format(max(list(map(len,w.keys())))))
print("Minimum Character Length {}".format(min(list(map(len,w.keys())))))

Number of Examples 11024
Number of Unique Words 3391
Maximum frequnecy 155
Minimum frequnecy 1
Maximum Character Length 16
Minimum Character Length 4


In [6]:
w.most_common()

[('recycling', 155),
 ('because', 111),
 ('benefits', 109),
 ('punishment', 107),
 ('insurance', 82),
 ('something', 71),
 ('company', 64),
 ('recycle', 63),
 ('plastic', 63),
 ('exactly', 63),
 ('people', 62),
 ('vacation', 61),
 ('really', 58),
 ('probably', 57),
 ('program', 50),
 ('things', 50),
 ('everything', 49),
 ('anything', 48),
 ('society', 47),
 ('sometimes', 46),
 ('newspapers', 45),
 ('expenses', 42),
 ('retirement', 42),
 ('employees', 41),
 ('problem', 40),
 ('companies', 40),
 ('actually', 40),
 ('interesting', 38),
 ('important', 37),
 ('situation', 35),
 ('expensive', 35),
 ('testing', 34),
 ('somebody', 34),
 ('savings', 34),
 ('problems', 33),
 ('especially', 33),
 ('absolutely', 33),
 ("that's", 32),
 ('understand', 31),
 ('basically', 31),
 ('anyway', 28),
 ('recycled', 28),
 ('government', 27),
 ('definitely', 27),
 ('college', 27),
 ('aluminum', 26),
 ('penalty', 26),
 ('newspaper', 25),
 ('organization', 25),
 ('university', 25),
 ('dollars', 25),
 ('family', 

In [7]:
w_c = Counter(w.values())
w_c.most_common()

[(1, 1838),
 (2, 591),
 (3, 275),
 (4, 189),
 (5, 100),
 (6, 73),
 (7, 65),
 (8, 44),
 (9, 32),
 (10, 24),
 (12, 20),
 (11, 15),
 (14, 12),
 (13, 9),
 (17, 8),
 (20, 7),
 (25, 6),
 (21, 6),
 (23, 6),
 (16, 5),
 (18, 5),
 (15, 5),
 (19, 4),
 (63, 3),
 (22, 3),
 (34, 3),
 (40, 3),
 (27, 3),
 (33, 3),
 (26, 2),
 (28, 2),
 (31, 2),
 (50, 2),
 (24, 2),
 (35, 2),
 (42, 2),
 (155, 1),
 (45, 1),
 (64, 1),
 (49, 1),
 (46, 1),
 (71, 1),
 (48, 1),
 (58, 1),
 (111, 1),
 (62, 1),
 (57, 1),
 (32, 1),
 (47, 1),
 (38, 1),
 (37, 1),
 (107, 1),
 (109, 1),
 (41, 1),
 (82, 1),
 (61, 1)]

Stats of AMI Dataset

In [11]:
load_list = ['Data/feats_cmvn.ark']
num_examples = np.Inf
dh = DataHelper(load_list,num_examples)
dh.load_data(char_threshold = 4, frequency_bounds = (0,np.Inf))
dh.process_data()
c,word_to_num,num_to_word = dh.generate_key_dicts()

inputs,labels = dh.give_inputs_and_labels()

Length before filtering on char length 317927
Length after filtering on char length 244494
Length before filtering on frequency_bounds 244494
Length after filtering on frequency_bounds 244494
Finished Loading the Data, 244494 examples
Number of Unique words  11177


In [12]:
c = Counter(dh.keys)

In [13]:
print("Number of Examples {}".format(len(dh.keys)))
print("Number of Unique Words {}".format(len(c.keys())))
print("Maximum frequency {}".format(max(c.values())))
print("Minimum frequency {}".format(min(c.values())))
print("Maximum Character Length {}".format(max(list(map(len,c.keys())))))
print("Minimum Character Length {}".format(min(list(map(len,c.keys())))))

Number of Examples 244494
Number of Unique Words 11177
Maximum frequency 2946
Minimum frequency 1
Maximum Character Length 20
Minimum Character Length 4


In [14]:
c.most_common()

[('yeah', 2946),
 ('okay', 2359),
 ('well', 2148),
 ('just', 1802),
 ('thats', 1682),
 ('right', 1673),
 ('that', 1546),
 ('like', 1519),
 ('think', 1378),
 ('because', 1329),
 ('maybe', 1271),
 ('mmhmm', 1264),
 ('something', 1205),
 ('this', 1184),
 ('have', 1156),
 ('then', 1105),
 ('about', 1088),
 ('remote', 1081),
 ('what', 1045),
 ('know', 987),
 ('buttons', 967),
 ('dont', 950),
 ('control', 913),
 ('really', 885),
 ('also', 882),
 ('some', 880),
 ('very', 877),
 ('with', 861),
 ('actually', 839),
 ('more', 826),
 ('should', 825),
 ('thing', 821),
 ('people', 816),
 ('button', 809),
 ('things', 788),
 ('here', 782),
 ('want', 769),
 ('there', 765),
 ('design', 745),
 ('different', 744),
 ('make', 741),
 ('probably', 729),
 ('time', 706),
 ('mean', 704),
 ('five', 664),
 ('which', 664),
 ('meeting', 658),
 ('good', 648),
 ('could', 632),
 ('alright', 622),
 ('kind', 620),
 ('screen', 618),
 ('from', 613),
 ('three', 598),
 ('next', 584),
 ('idea', 579),
 ('much', 562),
 ('would'

In [15]:
f_c = Counter(c.values())
f_c.most_common()

[(1, 4398),
 (2, 1552),
 (3, 872),
 (4, 542),
 (5, 377),
 (6, 318),
 (7, 233),
 (8, 215),
 (9, 160),
 (10, 138),
 (11, 119),
 (12, 102),
 (13, 88),
 (14, 86),
 (17, 71),
 (15, 70),
 (16, 66),
 (19, 58),
 (22, 55),
 (18, 50),
 (20, 49),
 (21, 49),
 (24, 48),
 (26, 44),
 (25, 36),
 (23, 36),
 (29, 34),
 (27, 34),
 (28, 31),
 (31, 28),
 (35, 26),
 (32, 25),
 (33, 24),
 (36, 24),
 (48, 22),
 (34, 22),
 (30, 21),
 (44, 19),
 (38, 19),
 (51, 18),
 (43, 18),
 (41, 18),
 (42, 18),
 (46, 15),
 (47, 15),
 (40, 15),
 (45, 15),
 (39, 15),
 (57, 14),
 (54, 14),
 (37, 14),
 (73, 13),
 (61, 11),
 (56, 11),
 (50, 11),
 (70, 10),
 (60, 10),
 (55, 9),
 (74, 9),
 (59, 9),
 (72, 9),
 (58, 9),
 (97, 8),
 (63, 8),
 (82, 8),
 (71, 8),
 (66, 8),
 (52, 8),
 (80, 7),
 (69, 7),
 (86, 7),
 (67, 7),
 (90, 7),
 (53, 7),
 (108, 7),
 (79, 7),
 (88, 7),
 (62, 7),
 (81, 6),
 (78, 6),
 (65, 6),
 (109, 6),
 (101, 6),
 (102, 6),
 (64, 6),
 (137, 5),
 (121, 5),
 (126, 5),
 (91, 5),
 (147, 5),
 (141, 5),
 (115, 5),
 (159, 5

In [18]:
from siamese_dataset import SiameseTriplets
num_examples = np.Inf
frequency_bounds = (5,np.Inf)

In [19]:
train_sm_dataset = SiameseTriplets(num_examples = num_examples, split_set = "train", frequency_bounds = frequency_bounds)

Length before filtering on char length 317927
Length after filtering on char length 173657
Length before filtering on frequency_bounds 173657
Length after filtering on frequency_bounds 162522
Finished Loading the Data, 162522 examples
Number of Unique words  3280
[1968, 656, 656]
For trainset number of unique words are 1968
torch.Size([19680, 3, 40, 100])
