<a href="https://colab.research.google.com/github/jasperSha/cloud_music/blob/main/neuralnet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Authenticate the current Colab user. The gcloud/gsutil commands in the
# following cells presumably rely on these credentials -- confirm if reordering cells.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Set `cluster-music` as the active project in the gcloud configuration.
!gcloud config set project cluster-music

In [None]:
import os

# Local directories that hold the downloaded image data and frequency data.
# `exist_ok=True` keeps the cell safe to re-run.
for data_dir in ('song_images', 'song_freqs'):
  os.makedirs(data_dir, exist_ok=True)

In [None]:
# load metadata
!gsutil cp gs://deepclustermusic/gcp_meta.csv .

# load images
!gsutil cp gs://deepclustermusic/song_images './song_images'

# load frequency data
!gsutil cp gs://deepclustermusic/song_freqs './song_freqs'

In [None]:
'''
spectrogram image size=(224, 224)

ideal params (leveraged with resnet18 model)
  batch_size:
    value: 64
  sample_rate:
    value: 44100
  hop_length:
    value: 308 
  win_length:
    value: 2205
  n_mels:
    value: 224
  n_fft:
    value: 4096
  normalize:
    value: True
  mix_up:
    value: 0.1
  f_max:
    value: 18000
  arch:
    value: resnet18
  n_epochs:
    values: [10, 20, 80] 
  trial_num:
    values: [1, 2, 3, 4, 5]
  fold:
    values: [1, 2, 3, 4, 5]

Reference result for audio classification with a pre-trained resnet18:
about 89.54% accuracy in 80 epochs, taking a little over 14 minutes, although I think this came with some heavy compute rental fees

'''

In [None]:
# TensorFlow / Keras implementation
def self_attention(in_shape, ch, k=8):
  """Build a Keras model implementing a residual self-attention block.

  The input is projected with 1x1 convolutions into three feature maps:
  `f` and `g` (each `ch // k` filters) and `h` (`ch // 2` filters). `f` and
  `h` are 2x2 max-pooled before their spatial positions are flattened; `g`
  keeps the full resolution. The attention map is softmax(g @ f^T), which is
  applied to `h`, reshaped back to (height, width, ch // 2), projected to
  `channel` filters, scaled by the trainable `Scalar` layer and added to the
  input.

  Args:
    in_shape: (height, width, channel) of the input, without the batch axis.
    ch: Base channel count used to size the internal projections.
    k: Divisor applied to `ch` for the `f`/`g` projections. Defaults to 8.

  Returns:
    A `keras.Model` whose output has the same shape as its input.
  """
  height, width, channel = in_shape
  x = layers.Input(shape=(height, width, channel))

  # f: 1x1 projection, pooled, flattened to (positions / 4, ch // k).
  f = layers.Conv2D(ch // k, kernel_size=(1, 1), strides=(1, 1), padding='same', use_bias=True)(x)
  f = layers.MaxPooling2D()(f)
  f = layers.Reshape((-1, f.shape[-1]))(f)

  # g: 1x1 projection at full resolution, flattened to (positions, ch // k).
  g = layers.Conv2D(ch // k, kernel_size=(1, 1), strides=(1, 1), padding='same', use_bias=True)(x)
  g = layers.Reshape((-1, g.shape[-1]))(g)

  # h: 1x1 projection, pooled, flattened to (positions / 4, ch // 2).
  h = layers.Conv2D(ch // 2, kernel_size=(1, 1), strides=(1, 1), padding='same', use_bias=True)(x)
  h = layers.MaxPooling2D()(h)
  h = layers.Reshape((-1, h.shape[-1]))(h)

  # Attention scores g @ f^T. Expressed as a Dot layer instead of a raw
  # tf.matmul so the graph can be built from symbolic Keras tensors on both
  # Keras 2 and Keras 3 (which rejects TensorFlow ops on KerasTensors).
  s = layers.Dot(axes=(2, 2))([g, f])
  s = keras.layers.Softmax()(s)

  # Weighted sum of the h features: s @ h -> (positions, ch // 2).
  o = layers.Dot(axes=(2, 1))([s, h])

  o = layers.Reshape((height, width, ch // 2))(o)
  o = layers.Conv2D(channel, kernel_size=(1, 1), strides=(1, 1), padding='same', use_bias=True)(o)
  o = Scalar()(o)
  # Residual connection back to the block input.
  o = layers.Add()([o, x])

  return keras.Model(inputs=x, outputs=o)


class Scalar(layers.Layer):
  """Layer that multiplies its input by one trainable scalar, `gamma`.

  `gamma` has shape (1,) and is initialised to zero, so the layer outputs
  zeros until training updates the weight.
  """

  def __init__(self, **kwargs):
    # Forward standard Layer kwargs (e.g. `name`) to the base class.
    super(Scalar, self).__init__(**kwargs)

  def build(self, input_shape):
    # Register the weight through the public API instead of assigning the
    # private `_trainable_weights` list, so Keras tracks, trains and saves it.
    self.gamma = self.add_weight(
        name='gamma', shape=(1,), initializer='zeros', trainable=True)

  def call(self, inputs):
    # Scale directly; building a new Rescaling layer on every call is
    # unnecessary for a plain multiply.
    return inputs * self.gamma

In [None]:
def conv_model():
  input = layers.Input(shape=(256, 128))

  image = SpectralNorm(layers.Conv2D(128, (5, 5), strides=(1, 1), padding='same', input_shape=[256, 128, 1]))(input)
  image = layers.LeakyReLU(image)
  image = layers.Dropout(0.3)(image)

