keunwoochoi · keunwoochoi · Aug 15, 2020 · Aug 2, 2020 · Aug 3, 2020 · Aug 13, 2020
diff --git a/README.md b/README.md
diff --git a/examples/example_codes.ipynb b/examples/example_codes.ipynb
diff --git a/examples/how-to-use-Kapre.ipynb b/examples/how-to-use-Kapre.ipynb
@@ -0,0 +1,304 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# How to use Kapre - example"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "2020/8/14\n",
+      "Tensorflow: 2.3.0\n",
+      "Librosa: 0.8.0\n",
+      "Image data format: channels_last\n",
+      "Kapre: 0.3.0-rc\n"
+     ]
+    }
+   ],
+   "source": [
+    "import librosa\n",
+    "import kapre\n",
+    "import tensorflow as tf\n",
+    "from tensorflow.keras.models import Sequential\n",
+    "import numpy as np\n",
+    "\n",
+    "from datetime import datetime\n",
+    "now = datetime.now()\n",
+    "\n",
+    "print('%s/%s/%s' % (now.year, now.month, now.day))\n",
+    "print('Tensorflow: {}'.format(tf.__version__))\n",
+    "print('Librosa: {}'.format(librosa.__version__))\n",
+    "print('Image data format: {}'.format(tf.keras.backend.image_data_format()))\n",
+    "print('Kapre: {}'.format(kapre.__version__))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Loading an mp3 file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Audio length: 453888 samples, 10.29 seconds. \n",
+      "Audio sample rate: 44100 Hz\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/keunwoochoi/miniconda3/envs/kapre/lib/python3.7/site-packages/librosa/core/audio.py:162: UserWarning: PySoundFile failed. Trying audioread instead.\n",
+      "  warnings.warn(\"PySoundFile failed. Trying audioread instead.\")\n"
+     ]
+    }
+   ],
+   "source": [
+    "src, sr = librosa.load('../srcs/bensound-cute.mp3', sr=None, mono=True)\n",
+    "print('Audio length: %d samples, %04.2f seconds. \\n' % (len(src), len(src) / sr) +\n",
+    "      'Audio sample rate: %d Hz' % sr)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Trim it and make it a 2d.\n",
+    "\n",
+    "If your file is mono, librosa.load returns a 1D array. Kapre always expects 2d array, so make it 2d.\n",
+    "\n",
+    "On my computer, I use default `image_data_format == 'channels_last'`. I'll keep it in that way for the audio data, too."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The shape of an item (44100, 1)\n"
+     ]
+    }
+   ],
+   "source": [
+    "len_second = 1.0 # Let's trim it to make it quick\n",
+    "src = src[:int(sr*len_second)]\n",
+    "src = np.expand_dims(src, axis=1)\n",
+    "input_shape = src.shape\n",
+    "print('The shape of an item', input_shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Let's make it a batch of 4 items\n",
+    "\n",
+    "to make it more like a proper dataset. You should have many files indeed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The shape of a batch:  (4, 44100, 1)\n"
+     ]
+    }
+   ],
+   "source": [
+    "x = np.array([src] * 4)\n",
+    "print('The shape of a batch: ',x.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# A Keras model\n",
+    "\n",
+    "A simple model with 10-class and single-label classification."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model: \"sequential_5\"\n",
+      "_________________________________________________________________\n",
+      "Layer (type)                 Output Shape              Param #   \n",
+      "=================================================================\n",
+      "stft-layer (STFT)            (None, 42, 1025, 1)       0         \n",
+      "=================================================================\n",
+      "Total params: 0\n",
+      "Trainable params: 0\n",
+      "Non-trainable params: 0\n",
+      "_________________________________________________________________\n"
+     ]
+    }
+   ],
+   "source": [
+    "from kapre.time_frequency import STFT, Magnitude, MagnitudeToDecibel\n",
+    "\n",
+    "\n",
+    "model = Sequential()\n",
+    "# A STFT layer\n",
+    "model.add(STFT(n_fft=2048, win_length=2018, hop_length=1024,\n",
+    "               window_fn=None, pad_end=False,\n",
+    "               input_data_format='channels_last', output_data_format='channels_last',\n",
+    "               input_shape=input_shape,\n",
+    "              name='stft-layer'))\n",
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- The model has no trainable parameters because `STFT` layer uses `tf.signal.stft()` function which is just an implementation of FFT-based short-time Fourier transform.\n",
+    "- The output shape is `(batch, time, frequency, channels)`. \n",
+    "  - `42` (time) is the number of STFT frames. A shorter hop length would make it (nearly) proportionally longer. If `pad_end=True`, the input audio signals become a little longer, hence the number of frames would get longer, too.\n",
+    "  - `1025` is the number of STFT bins and decided as `n_fft // 2 + 1`. \n",
+    "  - `1` channel: because the input signal was single-channel.\n",
+    "- The output of `STFT` layer is `complex` number."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's add more layers like a real model!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from tensorflow.keras.layers import Conv2D, BatchNormalization, ReLU, GlobalAveragePooling2D, Dense, Softmax"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "model.add(Magnitude())\n",
+    "model.add(MagnitudeToDecibel())\n",
+    "model.add(Conv2D(32, (3, 3), strides=(2, 2)))\n",
+    "model.add(BatchNormalization())\n",
+    "model.add(ReLU())\n",
+    "model.add(GlobalAveragePooling2D())\n",
+    "model.add(Dense(10))\n",
+    "model.add(Softmax())\n",
+    "\n",
+    "# Compile the model\n",
+    "model.compile('adam', 'categorical_crossentropy') # if single-label classification\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Model: \"sequential_5\"\n",
+      "_________________________________________________________________\n",
+      "Layer (type)                 Output Shape              Param #   \n",
+      "=================================================================\n",
+      "stft-layer (STFT)            (None, 42, 1025, 1)       0         \n",
+      "_________________________________________________________________\n",
+      "magnitude (Magnitude)        (None, 42, 1025, 1)       0         \n",
+      "_________________________________________________________________\n",
+      "magnitude_to_decibel (Magnit (None, 42, 1025, 1)       0         \n",
+      "_________________________________________________________________\n",
+      "conv2d_1 (Conv2D)            (None, 20, 512, 32)       320       \n",
+      "_________________________________________________________________\n",
+      "batch_normalization (BatchNo (None, 20, 512, 32)       128       \n",
+      "_________________________________________________________________\n",
+      "re_lu (ReLU)                 (None, 20, 512, 32)       0         \n",
+      "_________________________________________________________________\n",
+      "global_average_pooling2d (Gl (None, 32)                0         \n",
+      "_________________________________________________________________\n",
+      "dense (Dense)                (None, 10)                330       \n",
+      "_________________________________________________________________\n",
+      "softmax (Softmax)            (None, 10)                0         \n",
+      "=================================================================\n",
+      "Total params: 778\n",
+      "Trainable params: 714\n",
+      "Non-trainable params: 64\n",
+      "_________________________________________________________________\n"
+     ]
+    }
+   ],
+   "source": [
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "- I added `Magnitude()` which is a simple `abs()` operation on the complex numbers.\n",
+    "- `MagnitudeToDecibel` maps the numbers to a decibel scale."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}