In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
import re

from nltk.corpus import stopwords

# English stop words from NLTK, stored as a set for fast membership checks.
# Needs the NLTK "stopwords" corpus to be available locally.
STOPWORDS = set(stopwords.words("english"))

In [None]:
# Load only the two columns used below: the e-mail text and its category label.
df = pd.read_excel(
    '../input/WD_Cleaned_Dataset.xlsx',
    engine="openpyxl",
    usecols=["Email Query", "New - Category"],
)

In [None]:
# Switch to the short column names the rest of the notebook refers to.
df = df.rename(
    columns={
        "Email Query": "Queries",
        "New - Category": "Categories",
    }
)
df.head()

In [None]:
# Integer-encode each distinct category, then derive a string label of the form
# "category_<code>" from that integer.
df['Encoded'] = df['Categories'].astype('category').cat.codes
new_cat = ["category_" + str(code) for code in df['Encoded']]
df['NewCategory'] = new_cat

In [None]:
# Preview the first rows, now including the Encoded and NewCategory columns.
df.head(n=5)

In [None]:
# Number of rows per derived category label, most frequent first.
df["NewCategory"].value_counts().sort_values(ascending=False)

In [None]:
# Replace the existing index with a fresh 0..n-1 range.
df = df.reset_index(drop=True)

# Raw strings keep the regex escapes (\[ \] \|) away from Python's own escape
# processing, which otherwise reports them as invalid escape sequences.

# Separator punctuation: clean_text replaces each match with a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Any character other than a digit, lowercase letter, space, '#', '+' or '_':
# clean_text deletes each match.
BAD_SYBMOLS_RE = re.compile(r'[^0-9a-z #+_]')

# STOPWORDS is already built in the import cell at the top of the notebook, so
# it is not recomputed here.

def clean_text(text):
    """Normalise one query string before tokenisation.

    Steps, in order:
      1. lowercase the text;
      2. replace separator punctuation (REPLACE_BY_SPACE_RE) with spaces;
      3. delete every remaining character matched by BAD_SYBMOLS_RE;
      4. drop whole tokens that consist only of repeated "x" (e.g. "xx", "xxxx");
      5. remove English stop words (STOPWORDS).

    Args:
        text: The raw query as a ``str``.

    Returns:
        The cleaned text, with the surviving words joined by single spaces.
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYBMOLS_RE.sub('', text)
    # Remove only whole "xx..." tokens. Deleting every "x" character, as this
    # step used to, also mangled ordinary words ("tax" -> "ta", "next" -> "net").
    # NOTE(review): presumably these tokens are masking placeholders such as
    # "xxxx" -- confirm against the dataset.
    text = re.sub(r'\bx{2,}\b', ' ', text)
    return " ".join(word for word in text.split() if word not in STOPWORDS)

In [None]:
# Matches "http://" / "https://" links and bare "www." links, each running up to
# the next whitespace character. Compiled once instead of on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')


def remove_url(text):
    """Return ``text`` with every URL deleted.

    Args:
        text: Input string.

    Returns:
        The string with each URL match replaced by the empty string. Whitespace
        around a removed URL is left untouched.
    """
    return _URL_RE.sub("", text)

def remove_html(text):
    """Strip HTML-style tags from ``text``.

    A tag is the shortest run starting at "<" and ending at the next ">" on the
    same line; each such run is replaced by the empty string.

    Args:
        text: Input string.

    Returns:
        The string with every tag removed.
    """
    tag_pattern = r"<.*?>"
    return re.sub(tag_pattern, "", text)

In [None]:
# Convert each query to a string individually (missing values become ""). Calling
# str() on the whole column would produce one repr of the Series and assign that
# same text to every row.
df['Queries'] = df.Queries.fillna('').astype(str)

# Strip markup and links before clean_text: clean_text removes '<', '>', ':',
# '/' and '.', after which the HTML and URL patterns can no longer match.
# Tags go first so a URL inside an attribute does not swallow the text after it.
df["Queries"] = df.Queries.apply(remove_html)
df["Queries"] = df.Queries.apply(remove_url)

df['Queries'] = df.Queries.apply(clean_text)

In [None]:
# Vocabulary size: passed to the tokenizer as num_words and to the Embedding
# layer as its input dimension.
MAX_NB_WORDS = 50000
# Length every tokenised query is padded or truncated to.
MAX_SEQUENCE_LENGTH = 600
# Size of each embedding vector; also reused as the LSTM unit count.
EMBEDDING_DIM = 250

# The filter list is a raw string so that "\]" is not read as an (invalid) Python
# escape sequence; the characters passed to Tokenizer are unchanged.
# NOTE(review): the filters include '#', '+' and '_', which clean_text keeps.
tokenizer = Tokenizer(
    num_words=MAX_NB_WORDS,
    filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~',
    lower=True,
)
tokenizer.fit_on_texts(df['Queries'].values)

# Mapping from token to integer index built by fit_on_texts.
word_index = tokenizer.word_index

print(f'Found {len(word_index)} unique token.')

In [None]:
# Turn each query into a list of token indices, then pad/truncate every list to
# MAX_SEQUENCE_LENGTH so the result is a single 2-D array.
X = pad_sequences(
    tokenizer.texts_to_sequences(df['Queries'].values),
    maxlen=MAX_SEQUENCE_LENGTH,
)
print("Shape of Data tensor:", X.shape)

#### CONVERTING CATEGORICAL TO NUMERICAL LABELS

In [None]:
# One-hot encode the category labels: one column per distinct NewCategory value.
Y = pd.get_dummies(df["NewCategory"]).to_numpy()
print("Shape of label tensor: ", Y.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
for features, labels in ((X_train, Y_train), (X_test, Y_test)):
    print(features.shape, labels.shape)

In [None]:
# Architecture: embedding -> dropout -> single LSTM -> softmax over the classes.
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(Dropout(0.5))
model.add(LSTM(EMBEDDING_DIM, recurrent_dropout=0.2))
# One output unit per one-hot label column. A hard-coded width makes training
# fail with a shape mismatch whenever the number of categories in the data
# differs from it.
model.add(Dense(Y.shape[1], activation='softmax'))

model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

epochs=2
batch_size=64

# 10% of the training rows are set aside for validation during fitting.
# NOTE(review): with epochs=2 the patience=3 early-stopping rule can never
# trigger; increase epochs if early stopping is meant to have an effect.
history = model.fit(
    X_train,
    Y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)],
)