### Basic help resource: https://github.com/WillKoehrsen/recurrent-neural-networks

### Extracting data from the source and preparing the dataset file

In [1]:
from urllib.request import *
import json
import pandas as pd

In [2]:
# Query the PatentsView API for patents whose title matches "music",
# requesting only the title and abstract fields (per_page is set to 3000).
datasetURL = "https://api.patentsview.org/patents/query?q={%22_and%22:[{%22_text_all%22:{%22patent_title%22:%22music%22}}]}&f=[%22patent_title%22,%22patent_abstract%22]&o={%22per_page%22:3000}"
# Open the connection in a `with` block so the HTTP response is closed as
# soon as the JSON payload has been parsed, even if parsing raises.
with urlopen(datasetURL) as connection:
    response = json.load(connection)
print(response["total_patent_count"])

2729


In [3]:
# Build the frame directly from the parsed "patents" payload. Serializing it
# back to a JSON string just to feed pd.read_json is unnecessary, and passing
# a literal JSON string to pd.read_json is deprecated in recent pandas.
df = pd.DataFrame(response["patents"])
# Discard rows containing any missing value before persisting.
df = df.dropna()
# Persist without the index so the CSV holds only the data columns.
df.to_csv("./dataset/patents_music.csv", index=False)
df.head()

Unnamed: 0,patent_title,patent_abstract
0,System for electronically generating music,A musical instrument for electronically produc...
1,Systems and methods for target training includ...,Systems and methods for target training includ...
2,"Method for obtaining music data, earphone and ...",The present disclosure relates to a method for...
3,Smart gallery and automatic music video creati...,Various embodiments provide a so-called smart ...
4,Apparatus and method for providing streaming m...,An apparatus and a method for sharing contents...


### Cleaning dataset

In [4]:
# Reload the saved patents CSV, then drop every row that has at least one
# missing value, printing the frame's shape before and after the drop.
df = pd.read_csv("./dataset/patents_music.csv")
print(df.shape)
df = df.dropna(axis=0, how="any")
print(df.shape)

(2602, 2)
(2454, 2)


In [7]:
# Collect the abstract column into a plain Python list and preview the first
# 300 characters of the entry at position 100.
abstracts = df["patent_abstract"].tolist()
abstracts[100][0:300]

'Tools and techniques are provided for identifying, collecting, and processing music-related content within a radio broadcast environment. In one embodiment, a method is provided for processing music-related broadcast radio data. The method includes receiving a plurality of broadcast radio station si'

In [14]:
from keras.preprocessing.text import Tokenizer

# Word-level tokenizer configuration: no cap passed for the vocabulary size
# (num_words=None), text lowercased, tokens split on single spaces, and the
# characters listed in `filters` stripped out.
tokenizer = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?\\[]{}\t\n|@_^~`',
    lower=True,
    split=' ',
)

# Learn the vocabulary from the abstracts, then encode each abstract as a
# list of integer token ids.
tokenizer.fit_on_texts(abstracts)
sequences = tokenizer.texts_to_sequences(abstracts)

# Preview the first 15 token ids of the abstract at position 100.
sequences[100][0:15]

[1847, 4, 706, 17, 48, 9, 300, 1701, 4, 102, 6, 191, 49, 118, 2]

In [15]:
# Token-id -> word lookup taken from the fitted tokenizer; use it to rebuild
# the first 30 tokens of the abstract at position 100 as readable text.
idx_words = tokenizer.index_word
' '.join([idx_words[token_id] for token_id in sequences[100][0:30]])

'tools and techniques are provided for identifying collecting and processing music related content within a radio broadcast environment in one embodiment a method is provided for processing music related broadcast'

### Training data preparation

In [17]:
import numpy as np

In [20]:
# Slide a fixed-length window over every tokenized abstract: each run of
# `training_length` consecutive tokens becomes one feature row, and the token
# immediately following that run becomes its label.
features = []
labels = []

training_length = 50

for seq in sequences:
    # Sequences with at most `training_length` tokens produce no windows,
    # because the range below is empty for them.
    for target_pos in range(training_length, len(seq)):
        window_start = target_pos - training_length
        features.append(seq[window_start:target_pos])
        labels.append(seq[target_pos])

# Stack the windows into a 2-D array (one row per training example).
features = np.array(features)
print(features.shape)

(185839, 50)


In [22]:
# Number of output classes: one more than the number of entries in the
# index -> word map (presumably because tokenizer indices start at 1, leaving
# index 0 unused — confirm against the tokenizer).
num_words = len(idx_words) + 1
print(num_words)

# One-hot encode the labels: row i holds a single 1 in column labels[i].
# int8 keeps this large (examples x vocabulary) matrix as small as possible.
label_array = np.zeros((len(features), num_words), dtype=np.int8)

# Set every 1 with a single vectorized fancy-indexing assignment instead of a
# Python-level loop over each example. The explicit integer dtype keeps the
# indexing valid even when `labels` is empty.
label_array[np.arange(len(labels)), np.asarray(labels, dtype=np.intp)] = 1
print(label_array.shape)

8878
(185839, 8878)


### Model building & training

In [None]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding

# Next-word prediction network: frozen embedding -> masking -> LSTM ->
# dense/dropout head -> softmax over `num_words` classes.
# NOTE(review): `embedding_matrix` is not defined in any cell visible here —
# confirm it is created (with shape matching input_dim x output_dim) before
# this cell runs, otherwise this raises NameError on a fresh kernel.
model = Sequential([
    # Embedding weights are supplied from `embedding_matrix` and frozen
    # (trainable=False); mask_zero=True is passed so index 0 is masked.
    Embedding(input_dim=num_words,
              input_length=training_length,
              output_dim=100,
              weights=[embedding_matrix],
              trainable=False,
              mask_zero=True),
    Masking(mask_value=0.0),
    # Single LSTM layer returning only its final output (return_sequences=False).
    LSTM(64, return_sequences=False, dropout=0.1, recurrent_dropout=0.1),
    Dense(64, activation='relu'),
    Dropout(0.5),
    # One output unit per class, matching the one-hot label width.
    Dense(num_words, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Training callbacks:
#  - stop once val_loss has not improved for 5 consecutive epochs;
#  - checkpoint the full model (not just weights) to disk, keeping only the
#    best one seen so far.
# The ModelCheckpoint call previously closed its parenthesis right after the
# path, which left the keyword arguments dangling and made the cell a
# SyntaxError.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5),
    ModelCheckpoint('../models/model.h5',
                    save_best_only=True,
                    save_weights_only=False),
]

# NOTE(review): X_train, y_train, X_valid and y_valid are not defined in any
# cell visible here (only `features` and `label_array` are) — confirm that a
# train/validation split is created before this cell runs.
history = model.fit(X_train, y_train,
                    batch_size=2048, epochs=150,
                    callbacks=callbacks,
                    validation_data=(X_valid, y_valid))