In [0]:
from google.colab import drive
drive.mount('/content/drive')
# !ln -s 

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
!cp /content/drive/'My Drive'/project/project/trim/sample_submission.csv .
!cp /content/drive/'My Drive'/project/trim/orig/export.pkl .

In [0]:
!ln -s /content/drive/'My Drive'/project/trim/test/ /content/

In [0]:
!awk '{sub(/[^,]*/,"");sub(/,/,"")} 1' sample_submission.csv > audio.csv

In [0]:
from fastai.vision import *
from tqdm import tqdm_notebook
import IPython
import IPython.display
import PIL

In [0]:
ROOT = Path('/content')
FOLDER = 'test'
SOURCE = ROOT/FOLDER
LIST = ROOT/'audio.csv'
CATMODEL = 'export.pkl'
df = pd.read_csv(LIST)

In [0]:
import librosa
import librosa.display

def read_audio(conf, pathname, trim_long_data):
    y, sr = librosa.load(pathname, sr=conf.sampling_rate)
    # trim silence
    if 0 < len(y): # workaround: 0 length causes error
        y, _ = librosa.effects.trim(y) # trim, top_db=default(60)
    # make it unified length to conf.samples
    if len(y) > conf.samples: # long enough
        if trim_long_data:
            y = y[0:0+conf.samples]
    else: # pad blank
        padding = conf.samples - len(y)    # add padding at both ends
        offset = padding // 2
        y = np.pad(y, (offset, conf.samples - len(y) - offset), 'constant')
    return y

def audio_to_melspectrogram(conf, audio):
    spectrogram = librosa.feature.melspectrogram(audio, 
                                                 sr=conf.sampling_rate,
                                                 n_mels=conf.n_mels,
                                                 hop_length=conf.hop_length,
                                                 n_fft=conf.n_fft,
                                                 fmin=conf.fmin,
                                                 fmax=conf.fmax)
    spectrogram = librosa.power_to_db(spectrogram)
    spectrogram = spectrogram.astype(np.float32)
    return spectrogram

def show_melspectrogram(conf, mels, title='Log-frequency power spectrogram'):
    librosa.display.specshow(mels, x_axis='time', y_axis='mel', 
                             sr=conf.sampling_rate, hop_length=conf.hop_length,
                            fmin=conf.fmin, fmax=conf.fmax)
    plt.colorbar(format='%+2.0f dB')
    plt.title(title)
    plt.show()

def read_as_melspectrogram(conf, pathname, trim_long_data, debug_display=False):
    x = read_audio(conf, pathname, trim_long_data)
    mels = audio_to_melspectrogram(conf, x)
    if debug_display:
        IPython.display.display(IPython.display.Audio(x, rate=conf.sampling_rate))
        show_melspectrogram(conf, mels)
    return mels


class conf:
    # Preprocessing settings
    sampling_rate = 44100
    duration = 2
    hop_length = 347*duration # to make time steps 128
    fmin = 20
    fmax = sampling_rate // 2
    n_mels = 128
    n_fft = n_mels * 20
    samples = sampling_rate * duration

In [0]:
def mono_to_color(X, mean=None, std=None, norm_max=None, norm_min=None, eps=1e-6):
    # Stack X as [X,X,X]
    X = np.stack([X, X, X], axis=-1)

    # Standardize
    mean = mean or X.mean()
    std = std or X.std()
    Xstd = (X - mean) / (std + eps)
    _min, _max = Xstd.min(), Xstd.max()
    norm_max = norm_max or _max
    norm_min = norm_min or _min
    if (_max - _min) > eps:
        # Scale to [0, 255]
        V = Xstd
        V[V < norm_min] = norm_min
        V[V > norm_max] = norm_max
        V = 255 * (V - norm_min) / (norm_max - norm_min)
        V = V.astype(np.uint8)
    else:
        # Just zero
        V = np.zeros_like(Xstd, dtype=np.uint8)
    return V

def convert_wav_to_image(df, source, img_dest=''):
    X = []
    # for row in df.iterrows():
    for i, row in tqdm_notebook(df.iterrows()):
        x = read_as_melspectrogram(conf, source/str(row.fname), trim_long_data=False)
        x_color = mono_to_color(x)
        X.append(x_color)
    return X

Xval = convert_wav_to_image(df, source=SOURCE)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [0]:
from fastai import *
from fastai.vision import *
from fastai.vision.data import *
import random

CUR_X_FILES, CUR_X = list(df.fname.values), Xval

def open_fat2019_image(fn, convert_mode, after_open)->Image:
    # print(fn, "FN")
    # open
    idx = CUR_X_FILES.index(fn.split('/')[-1])
    # idx = int(fn.split('/')[-1])
    x = PIL.Image.fromarray(CUR_X[idx])
    # crop
    time_dim, base_dim = x.size
    crop_x = random.randint(0, time_dim - base_dim)
    x = x.crop([crop_x, 0, crop_x+base_dim, base_dim])    
    # standardize
    return Image(pil2tensor(x, np.float32).div_(255))

vision.data.open_image = open_fat2019_image

In [0]:
def _one_sample_positive_class_precisions(scores, truth):
    """Calculate precisions for each true class for a single sample.

    Args:
      scores: np.array of (num_classes,) giving the individual classifier scores.
      truth: np.array of (num_classes,) bools indicating which classes are true.

    Returns:
      pos_class_indices: np.array of indices of the true classes for this sample.
      pos_class_precisions: np.array of precisions corresponding to each of those
        classes.
    """
    num_classes = scores.shape[0]
    pos_class_indices = np.flatnonzero(truth > 0)
    # Only calculate precisions if there are some true classes.
    if not len(pos_class_indices):
        return pos_class_indices, np.zeros(0)
    # Retrieval list of classes for this sample.
    retrieved_classes = np.argsort(scores)[::-1]
    # class_rankings[top_scoring_class_index] == 0 etc.
    class_rankings = np.zeros(num_classes, dtype=np.int)
    class_rankings[retrieved_classes] = range(num_classes)
    # Which of these is a true label?
    retrieved_class_true = np.zeros(num_classes, dtype=np.bool)
    retrieved_class_true[class_rankings[pos_class_indices]] = True
    # Num hits for every truncated retrieval list.
    retrieved_cumulative_hits = np.cumsum(retrieved_class_true)
    # Precision of retrieval list truncated at each hit, in order of pos_labels.
    precision_at_hits = (
            retrieved_cumulative_hits[class_rankings[pos_class_indices]] /
            (1 + class_rankings[pos_class_indices].astype(np.float)))
    return pos_class_indices, precision_at_hits


def calculate_per_class_lwlrap(truth, scores):
    """Calculate label-weighted label-ranking average precision.

    Arguments:
      truth: np.array of (num_samples, num_classes) giving boolean ground-truth
        of presence of that class in that sample.
      scores: np.array of (num_samples, num_classes) giving the classifier-under-
        test's real-valued score for each class for each sample.

    Returns:
      per_class_lwlrap: np.array of (num_classes,) giving the lwlrap for each
        class.
      weight_per_class: np.array of (num_classes,) giving the prior of each
        class within the truth labels.  Then the overall unbalanced lwlrap is
        simply np.sum(per_class_lwlrap * weight_per_class)
    """
    assert truth.shape == scores.shape
    num_samples, num_classes = scores.shape
    # Space to store a distinct precision value for each class on each sample.
    # Only the classes that are true for each sample will be filled in.
    precisions_for_samples_by_classes = np.zeros((num_samples, num_classes))
    for sample_num in range(num_samples):
        pos_class_indices, precision_at_hits = (
            _one_sample_positive_class_precisions(scores[sample_num, :],
                                                  truth[sample_num, :]))
        precisions_for_samples_by_classes[sample_num, pos_class_indices] = (
            precision_at_hits)
    labels_per_class = np.sum(truth > 0, axis=0)
    weight_per_class = labels_per_class / float(np.sum(labels_per_class))
    # Form average of each column, i.e. all the precisions assigned to labels in
    # a particular class.
    per_class_lwlrap = (np.sum(precisions_for_samples_by_classes, axis=0) /
                        np.maximum(1, labels_per_class))
    # overall_lwlrap = simple average of all the actual per-class, per-sample precisions
    #                = np.sum(precisions_for_samples_by_classes) / np.sum(precisions_for_samples_by_classes > 0)
    #           also = weighted mean of per-class lwlraps, weighted by class label prior across samples
    #                = np.sum(per_class_lwlrap * weight_per_class)
    return per_class_lwlrap, weight_per_class


# Wrapper for fast.ai library
def lwlrap(scores, truth, **kwargs):
    score, weight = calculate_per_class_lwlrap(to_np(truth), to_np(scores))
    return torch.Tensor([(score * weight).sum()])

In [0]:
CUR_X_FILES, CUR_X = list(df.fname.values), Xval

test = ImageList.from_csv(ROOT, LIST, folder=FOLDER)
learn = load_learner(ROOT, CATMODEL, test=test)
preds, _ = learn.TTA(ds_type=DatasetType.Test)



In [0]:
preds.shape

torch.Size([300, 80])

In [0]:
df[learn.data.classes] = preds

In [0]:
df.head()

Unnamed: 0,fname,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),Bicycle_bell,Burping_and_eructation,Bus,Buzz,Car_passing_by,Cheering,Chewing_and_mastication,Child_speech_and_kid_speaking,Chink_and_clink,Chirp_and_tweet,Church_bell,Clapping,Computer_keyboard,Crackle,Cricket,Crowd,Cupboard_open_or_close,Cutlery_and_silverware,Dishes_and_pots_and_pans,Drawer_open_or_close,Drip,Electric_guitar,Fart,Female_singing,Female_speech_and_woman_speaking,Fill_(with_liquid),Finger_snapping,Frying_(food),Gasp,Glockenspiel,Gong,...,Harmonica,Hi-hat,Hiss,Keys_jangling,Knock,Male_singing,Male_speech_and_man_speaking,Marimba_and_xylophone,Mechanical_fan,Meow,Microwave_oven,Motorcycle,Printer,Purr,Race_car_and_auto_racing,Raindrop,Run,Scissors,Screaming,Shatter,Sigh,Sink_(filling_or_washing),Skateboard,Slam,Sneeze,Squeak,Stream,Strum,Tap,Tick-tock,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,4260ebea.wav,1.6e-05,1.301009e-07,1.791e-05,9e-06,0.000396,0.0006765875,5.999716e-06,0.025446,7.103402e-06,0.0003968805,2.2e-05,0.0006239201,3e-06,8e-06,0.022601,2.746397e-05,0.002044,0.0003295713,7.272942e-05,0.00017,0.01923914,0.015638,0.001087899,2.2e-05,0.000227,0.007139,0.000517,4.2e-05,0.1075511,9.608684e-06,6e-06,4.238789e-05,0.0005959356,0.021438,0.0001295589,0.017614,1.2e-05,5.605094e-06,9.44953e-06,...,3.715312e-05,0.004131241,0.002148,0.001181251,5.2e-05,6.142961e-06,3.796283e-05,0.000195068,3.282991e-05,0.000634,2.1e-05,2.7e-05,0.004553,0.000231,8.306584e-07,0.02370699,0.000538,0.028998,1.066365e-06,0.001034,1.17446e-06,0.286765,1.1e-05,4.1e-05,0.000183,0.000903,0.001613018,7.44852e-06,0.01436,0.142096,0.000443,5e-06,0.129391,0.000424,0.285869,8e-05,0.0002,0.028328,4.8e-05,0.001024
1,426eb1e0.wav,2e-06,5.988592e-09,4.120077e-10,0.352521,0.00019,5.139839e-07,5.831533e-07,0.006901,1.024968e-08,9.377709e-07,3.1e-05,8.370474e-06,5.3e-05,0.192986,0.006885,2.298369e-06,6e-06,0.000296175,4.137635e-06,0.055701,0.0001673141,0.036197,3.005337e-06,0.173045,1.5e-05,3.4e-05,1.1e-05,4.8e-05,2.58622e-05,3.539825e-08,0.000156,8.928399e-08,1.623092e-07,0.010126,5.88977e-07,0.000552,2e-06,2.669847e-08,9.692362e-09,...,1.866751e-06,7.502555e-07,7.1e-05,3.017919e-05,7.6e-05,1.704177e-07,3.652291e-07,1.846693e-09,2.452682e-05,1.8e-05,2e-06,3.2e-05,0.000138,0.002209,4.905682e-07,2.505451e-06,0.050219,0.001153,7.36628e-07,0.00029,7.382902e-07,9.7e-05,0.032293,4.7e-05,4.3e-05,0.000278,0.06030676,1.654383e-10,0.000287,7.9e-05,1.8e-05,0.000294,0.006659,0.047669,0.000322,0.013891,4.2e-05,0.001746,5e-06,0.013451
2,428d70bb.wav,0.004635,1.373146e-05,7.225299e-06,3.6e-05,0.001789,0.0007508134,5.35163e-06,0.003111,0.001358622,0.01684369,0.000479,0.002190173,0.003539,4.7e-05,0.003861,0.001295488,0.02144,0.001688127,0.0003249056,0.011242,0.0007232074,0.000284,0.001824447,0.00034,0.00353,0.012438,0.012565,0.024025,0.0163711,3.374201e-05,0.000687,0.0001751012,0.0002547792,0.000864,0.005639917,0.001213,0.02683,9.452584e-05,0.004911627,...,0.00072865,0.008193563,0.122601,0.002835424,0.001426,3.825687e-05,0.0006285759,0.0003637198,0.0004160294,0.002831,0.013509,0.011789,0.010729,0.000537,8.552849e-05,0.02029289,0.000736,0.010823,0.001962265,0.064268,0.1524536,0.002713,0.000347,0.009296,0.056323,0.008914,0.001221619,7.127979e-06,0.001746,0.003255,0.006596,0.001406,0.001147,0.001707,0.003774,0.000654,0.01003,0.001833,0.001804,0.016847
3,4292b1c9.wav,2.6e-05,2.694748e-08,5.906003e-06,6.8e-05,3.5e-05,9.226314e-08,6.661804e-08,0.00013,2.960586e-10,0.4766417,8e-06,2.18642e-07,4e-06,2.3e-05,4e-06,7.902773e-07,3e-06,5.71476e-08,5.79842e-07,0.000238,4.549833e-08,1e-05,3.425768e-09,0.000646,0.001013,4.4e-05,1.4e-05,0.001205,6.053093e-09,3.782321e-06,0.065347,2.52192e-07,2.860907e-07,1.6e-05,1.333445e-08,4e-06,0.004861,5.814206e-10,4.327004e-06,...,1.207932e-07,5.689253e-06,0.014436,1.198241e-08,3.8e-05,0.0005572794,0.003498761,5.636853e-10,7.300709e-08,0.024978,1.5e-05,7e-06,0.000869,0.000309,7.226987e-06,8.380139e-10,1e-06,2e-06,0.0003199444,9.5e-05,0.003728718,1.7e-05,0.010168,0.000213,0.05265,0.000794,5.608883e-07,2.571149e-06,2e-06,1e-06,3e-06,5.8e-05,1.2e-05,0.000124,3e-06,2.3e-05,0.000651,0.000109,0.002237,0.006589
4,429c5071.wav,0.002832,8.944511e-05,0.000826267,0.000288,0.005887,0.02261876,0.0002156085,0.005728,0.03052366,0.006425518,0.003208,0.001596837,0.005799,0.000224,0.013515,0.0006055528,0.007963,0.003476025,0.001215813,0.004213,0.003155947,0.012492,0.0007722504,0.000926,0.020993,0.00798,0.004207,0.16746,0.004311579,0.0003059989,0.007362,0.0003483712,0.0006290712,0.002892,0.001415574,0.006153,0.002902,0.001105773,0.00453375,...,0.0006175287,0.006451563,0.018794,0.009363737,0.088759,0.0001281071,0.003116667,0.003151969,0.0008621545,0.008212,0.03836,0.008775,0.017013,0.007702,0.0007246559,0.00487539,0.031859,0.01703,0.008419545,0.043317,0.01644225,0.00987,0.005963,0.114319,0.038765,0.02916,0.001700379,0.0007618472,0.031388,0.004961,0.00482,0.002247,0.003867,0.038572,0.005983,0.003021,0.001547,0.029281,0.002336,0.023473


In [0]:
df_res=df.drop(columns=['fname'])

In [0]:
for c in df_res.columns:
  df_res[c] = pd.to_numeric(df_res[c])

In [0]:
df_res['Sound'] = df_res.idxmax(axis=1)

In [0]:
df_res=pd.concat([df['fname'],df_res['Sound']], axis=1)

In [0]:

df_res.to_csv('category.csv', index=False)

In [0]:
df.head()

Unnamed: 0,fname,Accelerating_and_revving_and_vroom,Accordion,Acoustic_guitar,Applause,Bark,Bass_drum,Bass_guitar,Bathtub_(filling_or_washing),Bicycle_bell,Burping_and_eructation,Bus,Buzz,Car_passing_by,Cheering,Chewing_and_mastication,Child_speech_and_kid_speaking,Chink_and_clink,Chirp_and_tweet,Church_bell,Clapping,Computer_keyboard,Crackle,Cricket,Crowd,Cupboard_open_or_close,Cutlery_and_silverware,Dishes_and_pots_and_pans,Drawer_open_or_close,Drip,Electric_guitar,Fart,Female_singing,Female_speech_and_woman_speaking,Fill_(with_liquid),Finger_snapping,Frying_(food),Gasp,Glockenspiel,Gong,...,Harmonica,Hi-hat,Hiss,Keys_jangling,Knock,Male_singing,Male_speech_and_man_speaking,Marimba_and_xylophone,Mechanical_fan,Meow,Microwave_oven,Motorcycle,Printer,Purr,Race_car_and_auto_racing,Raindrop,Run,Scissors,Screaming,Shatter,Sigh,Sink_(filling_or_washing),Skateboard,Slam,Sneeze,Squeak,Stream,Strum,Tap,Tick-tock,Toilet_flush,Traffic_noise_and_roadway_noise,Trickle_and_dribble,Walk_and_footsteps,Water_tap_and_faucet,Waves_and_surf,Whispering,Writing,Yell,Zipper_(clothing)
0,4260ebea.wav,1.6e-05,1.301009e-07,1.791e-05,9e-06,0.000396,0.0006765875,5.999716e-06,0.025446,7.103402e-06,0.0003968805,2.2e-05,0.0006239201,3e-06,8e-06,0.022601,2.746397e-05,0.002044,0.0003295713,7.272942e-05,0.00017,0.01923914,0.015638,0.001087899,2.2e-05,0.000227,0.007139,0.000517,4.2e-05,0.1075511,9.608684e-06,6e-06,4.238789e-05,0.0005959356,0.021438,0.0001295589,0.017614,1.2e-05,5.605094e-06,9.44953e-06,...,3.715312e-05,0.004131241,0.002148,0.001181251,5.2e-05,6.142961e-06,3.796283e-05,0.000195068,3.282991e-05,0.000634,2.1e-05,2.7e-05,0.004553,0.000231,8.306584e-07,0.02370699,0.000538,0.028998,1.066365e-06,0.001034,1.17446e-06,0.286765,1.1e-05,4.1e-05,0.000183,0.000903,0.001613018,7.44852e-06,0.01436,0.142096,0.000443,5e-06,0.129391,0.000424,0.285869,8e-05,0.0002,0.028328,4.8e-05,0.001024
1,426eb1e0.wav,2e-06,5.988592e-09,4.120077e-10,0.352521,0.00019,5.139839e-07,5.831533e-07,0.006901,1.024968e-08,9.377709e-07,3.1e-05,8.370474e-06,5.3e-05,0.192986,0.006885,2.298369e-06,6e-06,0.000296175,4.137635e-06,0.055701,0.0001673141,0.036197,3.005337e-06,0.173045,1.5e-05,3.4e-05,1.1e-05,4.8e-05,2.58622e-05,3.539825e-08,0.000156,8.928399e-08,1.623092e-07,0.010126,5.88977e-07,0.000552,2e-06,2.669847e-08,9.692362e-09,...,1.866751e-06,7.502555e-07,7.1e-05,3.017919e-05,7.6e-05,1.704177e-07,3.652291e-07,1.846693e-09,2.452682e-05,1.8e-05,2e-06,3.2e-05,0.000138,0.002209,4.905682e-07,2.505451e-06,0.050219,0.001153,7.36628e-07,0.00029,7.382902e-07,9.7e-05,0.032293,4.7e-05,4.3e-05,0.000278,0.06030676,1.654383e-10,0.000287,7.9e-05,1.8e-05,0.000294,0.006659,0.047669,0.000322,0.013891,4.2e-05,0.001746,5e-06,0.013451
2,428d70bb.wav,0.004635,1.373146e-05,7.225299e-06,3.6e-05,0.001789,0.0007508134,5.35163e-06,0.003111,0.001358622,0.01684369,0.000479,0.002190173,0.003539,4.7e-05,0.003861,0.001295488,0.02144,0.001688127,0.0003249056,0.011242,0.0007232074,0.000284,0.001824447,0.00034,0.00353,0.012438,0.012565,0.024025,0.0163711,3.374201e-05,0.000687,0.0001751012,0.0002547792,0.000864,0.005639917,0.001213,0.02683,9.452584e-05,0.004911627,...,0.00072865,0.008193563,0.122601,0.002835424,0.001426,3.825687e-05,0.0006285759,0.0003637198,0.0004160294,0.002831,0.013509,0.011789,0.010729,0.000537,8.552849e-05,0.02029289,0.000736,0.010823,0.001962265,0.064268,0.1524536,0.002713,0.000347,0.009296,0.056323,0.008914,0.001221619,7.127979e-06,0.001746,0.003255,0.006596,0.001406,0.001147,0.001707,0.003774,0.000654,0.01003,0.001833,0.001804,0.016847
3,4292b1c9.wav,2.6e-05,2.694748e-08,5.906003e-06,6.8e-05,3.5e-05,9.226314e-08,6.661804e-08,0.00013,2.960586e-10,0.4766417,8e-06,2.18642e-07,4e-06,2.3e-05,4e-06,7.902773e-07,3e-06,5.71476e-08,5.79842e-07,0.000238,4.549833e-08,1e-05,3.425768e-09,0.000646,0.001013,4.4e-05,1.4e-05,0.001205,6.053093e-09,3.782321e-06,0.065347,2.52192e-07,2.860907e-07,1.6e-05,1.333445e-08,4e-06,0.004861,5.814206e-10,4.327004e-06,...,1.207932e-07,5.689253e-06,0.014436,1.198241e-08,3.8e-05,0.0005572794,0.003498761,5.636853e-10,7.300709e-08,0.024978,1.5e-05,7e-06,0.000869,0.000309,7.226987e-06,8.380139e-10,1e-06,2e-06,0.0003199444,9.5e-05,0.003728718,1.7e-05,0.010168,0.000213,0.05265,0.000794,5.608883e-07,2.571149e-06,2e-06,1e-06,3e-06,5.8e-05,1.2e-05,0.000124,3e-06,2.3e-05,0.000651,0.000109,0.002237,0.006589
4,429c5071.wav,0.002832,8.944511e-05,0.000826267,0.000288,0.005887,0.02261876,0.0002156085,0.005728,0.03052366,0.006425518,0.003208,0.001596837,0.005799,0.000224,0.013515,0.0006055528,0.007963,0.003476025,0.001215813,0.004213,0.003155947,0.012492,0.0007722504,0.000926,0.020993,0.00798,0.004207,0.16746,0.004311579,0.0003059989,0.007362,0.0003483712,0.0006290712,0.002892,0.001415574,0.006153,0.002902,0.001105773,0.00453375,...,0.0006175287,0.006451563,0.018794,0.009363737,0.088759,0.0001281071,0.003116667,0.003151969,0.0008621545,0.008212,0.03836,0.008775,0.017013,0.007702,0.0007246559,0.00487539,0.031859,0.01703,0.008419545,0.043317,0.01644225,0.00987,0.005963,0.114319,0.038765,0.02916,0.001700379,0.0007618472,0.031388,0.004961,0.00482,0.002247,0.003867,0.038572,0.005983,0.003021,0.001547,0.029281,0.002336,0.023473


# DEEPSPEECH

In [0]:
!pip3 install deepspeech
# Download pre-trained English model and extract
!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.6.1/deepspeech-0.6.1-models.tar.gz
!tar xvf deepspeech-0.6.1-models.tar.gz
# Download example audio files
!curl -LO https://github.com/mozilla/DeepSpeech/releases/download/v0.6.1/audio-0.6.1.tar.gz
!tar xvf audio-0.6.1.tar.gz
!apt install sox

Collecting deepspeech
[?25l  Downloading https://files.pythonhosted.org/packages/62/8b/82512dbb0a70a6b3ce11d3ee9165a8a2d5830f9e42f631b640685e5045dc/deepspeech-0.6.1-cp36-cp36m-manylinux1_x86_64.whl (9.6MB)
[K     |████████████████████████████████| 9.6MB 5.5MB/s 
Installing collected packages: deepspeech
Successfully installed deepspeech-0.6.1
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   620    0   620    0     0   1610      0 --:--:-- --:--:-- --:--:--  1610
100 1172M  100 1172M    0     0  33.9M      0  0:00:34  0:00:34 --:--:-- 35.9M
._deepspeech-0.6.1-models
deepspeech-0.6.1-models/
deepspeech-0.6.1-models/._lm.binary
deepspeech-0.6.1-models/lm.binary
deepspeech-0.6.1-models/._output_graph.pbmm
deepspeech-0.6.1-models/output_graph.pbmm
deepspeech-0.6.1-models/._output_graph.pb
deepspeech-0.6.1-models/output_graph.pb
deepspeech-0.6.1-models/._trie
deepspeech-0.6.1-

In [0]:
def transcribe(fn):
  # Transcribe an audio file
  os.system(f'sox audio/{fn} --bits 16 --channels 1 --rate 16000 --encoding signed-integer --endian little --compression 0.0 --no-dither .temp.wav')
  os.system(f"deepspeech --model deepspeech-0.6.1-models/output_graph.pbmm --lm deepspeech-0.6.1-models/lm.binary --trie deepspeech-0.6.1-models/trie --audio temp.wav > .transcript")
  with open('transcript', 'r') as file:
    data = file.read()
  return data

In [0]:
!cd blah && sox /content/test/4260ebea.wav file_out.wav trim 0 1 : newfile : restart

sox WARN trim: Last 1 position(s) not reached (audio shorter than expected).


In [0]:
!mkdir blah

In [0]:
transcribe('2830-3980-0043.wav')

'experience proof less\n'

## LOOP

In [0]:
with open('transcript', 'r') as file:
    data = file.read()
data

''