In [31]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import json
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
import en_core_web_sm

from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, GRU, Embedding, Bidirectional, Activation
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import SimpleRNN
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import GlobalMaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
class LoadingData():
            
    def __init__(self):
        train_file_path = os.path.join("..","input","nlp-benchmarking-data-for-intent-and-entity","benchmarking_data","Train")
        validation_file_path = os.path.join("..","input","nlp-benchmarking-data-for-intent-and-entity","benchmarking_data","Validate")
        category_id = 0
        self.cat_to_intent = {}
        self.intent_to_cat = {}
        
        for dirname, _, filenames in os.walk(train_file_path):
            for filename in filenames:
                file_path = os.path.join(dirname, filename)
                intent_id = filename.replace(".json","")
                self.cat_to_intent[category_id] = intent_id
                self.intent_to_cat[intent_id] = category_id
                category_id+=1
        print(self.cat_to_intent)
        print(self.intent_to_cat)
        '''Training data'''
        training_data = list() 
        for dirname, _, filenames in os.walk(train_file_path):
            for filename in filenames:
                file_path = os.path.join(dirname, filename)
                intent_id = filename.replace(".json","")
                training_data+=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id])
        self.train_data_frame = pd.DataFrame(training_data, columns =['query', 'intent','category'])   
        
        np.random.shuffle(self.train_data_frame.values)

        
        '''Validation data'''
        validation_data = list()    
        for dirname, _, filenames in os.walk(validation_file_path):
            for filename in filenames:
                file_path = os.path.join(dirname, filename)
                intent_id = filename.replace(".json","")
                validation_data +=self.make_data_for_intent_from_json(file_path,intent_id,self.intent_to_cat[intent_id])                
        self.validation_data_frame = pd.DataFrame(validation_data, columns =['query', 'intent','category'])
        
        np.random.shuffle(self.validation_data_frame.values)
        
    def make_data_for_intent_from_json(self,json_file,intent_id,cat):
        json_d = json.load(open(json_file))         
        
        json_dict = json_d[intent_id]

        sent_list = list()
        for i in json_dict:
            each_list = i['data']
            sent =""
            for i in each_list:
                sent = sent + i['text']+ " "
            sent =sent[:-1]
            for i in range(3):
                sent = sent.replace("  "," ")
            sent_list.append((sent,intent_id,cat))
        return sent_list
            

In [3]:
load_data_obj = LoadingData()

{0: 'RateBook', 1: 'SearchScreeningEvent', 2: 'PlayMusic', 3: 'SearchCreativeWork', 4: 'GetWeather', 5: 'BookRestaurant', 6: 'AddToPlaylist'}
{'RateBook': 0, 'SearchScreeningEvent': 1, 'PlayMusic': 2, 'SearchCreativeWork': 3, 'GetWeather': 4, 'BookRestaurant': 5, 'AddToPlaylist': 6}


In [4]:
load_data_obj.train_data_frame

Unnamed: 0,query,intent,category
0,rate The Lotus and the Storm zero of 6 \n,RateBook,0
1,Rate The Fall-Down Artist 5 stars .,RateBook,0
2,Rate the current novel one points,RateBook,0
3,rate The Ape-Man Within 4,RateBook,0
4,I give The Penalty three stars,RateBook,0
...,...,...,...
13779,I'd like to put Ryo Yamazaki onto my sylvia pl...,AddToPlaylist,6
13780,add sweets edison to relaxing playlist,AddToPlaylist,6
13781,Add ana carolina to chill,AddToPlaylist,6
13782,Add decade in the sun best of stereophonics to...,AddToPlaylist,6


In [5]:
load_data_obj.validation_data_frame

Unnamed: 0,query,intent,category
0,rate this album four out of 6 stars,RateBook,0
1,Give this textbook four stars .,RateBook,0
2,rate A Twist in the Tale zero out of 6 points,RateBook,0
3,Rate The children of Niobe 1 out of 6 points .,RateBook,0
4,Give zero stars to Halo: Ghosts of Onyx,RateBook,0
...,...,...,...
695,add we have a theme song to my House Afterwork...,AddToPlaylist,6
696,add the song to my We Everywhere playlist,AddToPlaylist,6
697,Add Roel van Velzen to my party of the century...,AddToPlaylist,6
698,Add the artist to the political punks playlist.,AddToPlaylist,6


In [27]:
class Preprocessing():
    def __init__(self):
        self.x_train = None
        self.y_train = None
        self.x_valid = None
        self.y_valid = None
        self.spacy_model = en_core_web_sm.load()
        self.tokenizer = None

    def createData(self):
        self.tokenizer = Tokenizer(num_words=None)
        self.max_len = 50
        self.x_train, self.x_valid, self.y_train, self.y_valid = train_test_split(load_data_obj.train_data_frame['query'].tolist(),load_data_obj.train_data_frame['category'].tolist(),test_size=0.1)
        self.tokenizer.fit_on_texts(list(self.x_train) + list(self.x_valid))
        self.x_train = self.tokenizer.texts_to_sequences(self.x_train)
        self.x_valid = self.tokenizer.texts_to_sequences(self.x_valid)

        #zero pad the sequences
        self.x_train = pad_sequences(self.x_train, maxlen=self.max_len)
        self.x_valid = pad_sequences(self.x_valid, maxlen=self.max_len)
        self.y_train = to_categorical(self.y_train)
        self.y_valid = to_categorical(self.y_valid)
        self.word_index = self.tokenizer.word_index
        
    def getSpacyEmbeddings(self,sentneces):
        sentences_vectors = list()
        for item in sentneces:
            query_vec = self.spacy_model(item) 
            sentences_vectors.append(query_vec.vector)
        return sentences_vectors
    
    
    
    
        

In [28]:
preprocess_obj = Preprocessing()
preprocess_obj.createData()

In [16]:
preprocess_obj.y_train.shape

(12405, 7)

In [17]:
preprocess_obj.y_valid.shape

(1379, 7)

In [33]:
class DesignModel():
    def __init__(self):
        self.model = None
        self.x_train = preprocess_obj.x_train
        self.y_train = preprocess_obj.y_train
        self.x_valid = preprocess_obj.x_valid
        self.y_valid = preprocess_obj.y_valid
        
    def simple_rnn(self):
        self.model = Sequential()
        self.model.add(Embedding(len(preprocess_obj.word_index) + 1,100,input_length=preprocess_obj.max_len))
        self.model.add(SimpleRNN(100))
        self.model.add(Dense(len(load_data_obj.cat_to_intent), activation='sigmoid'))
        self.model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
        
    def model_train(self,batch_size,num_epoch):
        print("Fitting to model")
        self.model.fit(self.x_train, self.y_train, batch_size=batch_size, epochs=num_epoch, validation_data=[self.x_valid, self.y_valid])
        print("Model Training complete.")

    def save_model(self,model,model_name):    
        self.model.save("intent_models/"+model_name+".h5")
        print("Model saved to Model folder.")

In [34]:
model_obj = DesignModel()
model_obj.simple_rnn()
model_obj.model_train(64,5)

Fitting to model


ValueError: A target array with shape (12405, 7) was passed for an output of shape (None, 1) while using as loss `categorical_crossentropy`. This loss expects targets to have the same shape as the output.