In [87]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In this notebook we get to know our data better. We then split the provided training dataset into smaller training and testing subsets, taking a fixed number of samples from each selected category.

In [88]:
# Dataset folder. It keeps a trailing slash so that file names can be appended
# to it directly (the CSV export at the end of the notebook does the same).
mypath = 'freesound-audio-tagging/'

# mypath already ends with '/', so no extra separator is added here
# (the previous '/train.csv' produced a '//' in the path).
data = pd.read_csv(mypath + 'train.csv')
data.head()

Unnamed: 0,fname,label,manually_verified
0,00044347.wav,Hi-hat,0
1,001ca53d.wav,Saxophone,1
2,002d256b.wav,Trumpet,0
3,0033e230.wav,Glockenspiel,1
4,00353774.wav,Cello,1


In [89]:
# Number of labelled rows, and number of distinct labels among them.
ds_size = len(data['label'])
n_categories = data['label'].nunique()

summary_template = "Dataset size: %d \nNumber of categories: %d"
print(summary_template % (ds_size, n_categories))

Dataset size: 9473 
Number of categories: 41


In [90]:
# Count the number of entries for each category.
# By default value_counts() sorts the result from the most to the least
# frequent label. NOTE(review): the relative order of labels that share the
# same count is not something to rely on - confirm before indexing this
# result by position.
data['label'].value_counts()

Trumpet                  300
Flute                    300
Squeak                   300
Saxophone                300
Fart                     300
Shatter                  300
Acoustic_guitar          300
Hi-hat                   300
Clarinet                 300
Double_bass              300
Laughter                 300
Fireworks                300
Tearing                  300
Violin_or_fiddle         300
Bass_drum                300
Applause                 300
Snare_drum               300
Cello                    300
Oboe                     299
Gong                     292
Knock                    279
Writing                  270
Cough                    243
Bark                     239
Tambourine               221
Burping_or_eructation    210
Cowbell                  191
Harmonica                165
Drawer_open_or_close     158
Meow                     155
Electric_piano           150
Gunshot_or_gunfire       147
Microwave_oven           146
Keys_jangling            139
Telephone     

In [91]:
# We'll use the categories that have the full 300 samples.
#
# The selection is done by count and by name instead of by position:
# the order in which value_counts() lists labels with an equal count has
# differed between pandas versions, so positional slices such as [:4] and
# [5:18] can silently pick a different set of labels.
label_counts = data['label'].value_counts()

# 'Fart' also has 300 samples, but the original positional slice skipped it
# (index 4). That exclusion is kept, now stated explicitly.
# errors='ignore' keeps the cell working if the label is absent.
categories = label_counts[label_counts == 300].drop('Fart', errors='ignore')
print(categories)

Trumpet             300
Flute               300
Squeak              300
Saxophone           300
Shatter             300
Acoustic_guitar     300
Hi-hat              300
Clarinet            300
Double_bass         300
Laughter            300
Fireworks           300
Tearing             300
Violin_or_fiddle    300
Bass_drum           300
Applause            300
Snare_drum          300
Cello               300
Name: label, dtype: int64


In [92]:
# Turn the selected categories (the index of the counts Series) into a plain
# Python list of label names.
categories = list(categories.index)
print(categories)

['Trumpet', 'Flute', 'Squeak', 'Saxophone', 'Shatter', 'Acoustic_guitar', 'Hi-hat', 'Clarinet', 'Double_bass', 'Laughter', 'Fireworks', 'Tearing', 'Violin_or_fiddle', 'Bass_drum', 'Applause', 'Snare_drum', 'Cello']


In [93]:
# Number of training samples taken from each category.
n_training = 30
# Number of testing samples taken from each category.
n_testing = 5

# For every selected category, the first n_training rows go to the training
# set and the next n_testing rows go to the testing set, so the two sets
# never share a row.
#
# The per-category slices are collected in lists and concatenated once:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# and growing a frame inside a loop copies it on every iteration.
train_parts = []
test_parts = []
for category in categories:
    category_rows = data[data.label == category]
    train_parts.append(category_rows.iloc[:n_training])
    test_parts.append(category_rows.iloc[n_training:n_training + n_testing])

# ignore_index=True yields a fresh 0..N-1 index, so no reset_index is needed.
train_df = pd.concat(train_parts, ignore_index=True)
test_df = pd.concat(test_parts, ignore_index=True)

print(train_df)
print(test_df)

            fname    label  manually_verified
0    002d256b.wav  Trumpet                  0
1    034e4ffa.wav  Trumpet                  0
2    03e13ae7.wav  Trumpet                  0
3    04335030.wav  Trumpet                  0
4    04490642.wav  Trumpet                  0
..            ...      ...                ...
505  147f4395.wav    Cello                  0
506  14fd19da.wav    Cello                  0
507  154df401.wav    Cello                  1
508  161be325.wav    Cello                  0
509  188acf6a.wav    Cello                  1

[510 rows x 3 columns]
           fname    label  manually_verified
0   16d7ab94.wav  Trumpet                  1
1   17cbf99c.wav  Trumpet                  0
2   180cd1d7.wav  Trumpet                  1
3   1902bf23.wav  Trumpet                  0
4   1a13793b.wav  Trumpet                  0
..           ...      ...                ...
80  18c3695b.wav    Cello                  0
81  195f544d.wav    Cello                  1
82  199b1adf.wav   

In [94]:
# Write both subsets next to the original data as CSV files, without the
# row index column.
for frame, filename in ((train_df, "training_new.csv"), (test_df, "testing_new.csv")):
    frame.to_csv(mypath + filename, index=False)