In [None]:
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import re
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.externals import joblib 

from keras.preprocessing import sequence
from keras.models import Model, Input, Sequential
from keras.layers import Flatten ,Dense, Embedding, GlobalMaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam

# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# from nltk.stem import WordNetLemmatizer

from random import random
import numpy as np
%matplotlib inline


# Read in data

In [None]:
# Load the raw keyword table and keep only the query text and its label string.
df = pd.read_csv("../data/data2.csv", encoding="utf-8")
df = df[['Keyword', 'google_class']].rename(columns={'Keyword': 'keyword'})

# Make label names identifier-friendly: " / " and plain spaces both become "_".
df['google_class'] = (
    df['google_class']
    .astype(str)
    .map(lambda label: label.replace(" / ", "_").replace(" ", "_"))
)

# Transform the label string into one binary column per label

In [None]:
# A query may carry several comma-separated labels; turn the string into a list.
df['class_list'] = df['google_class'].str.split(",")

In [None]:
# One-hot encode the label lists: one 0/1 column per distinct label.
# `df` is left untouched (no `pop`), so this cell can be re-run safely.
mlb = MultiLabelBinarizer()
label_matrix = pd.DataFrame(mlb.fit_transform(df['class_list']),
                            columns=mlb.classes_,
                            index=df.index)
# Drop the helper list column from the joined frame only; the first two
# columns of df2 remain `keyword` and `google_class`.
df2 = df.drop(columns=['class_list']).join(label_matrix)

# Show data stats

In [None]:
# Count how many queries carry each label and show the most frequent ones.
df_count = df2.drop(columns=['keyword', 'google_class'])
categories = df_count.columns.tolist()
counts = [(category, df_count[category].sum()) for category in categories]
df_stats = pd.DataFrame(counts, columns=['category', 'number_of_queries'])
df_stats.sort_values('number_of_queries', ascending=False).head()

### How many queries have more than 1 label

In [None]:
# Number of labels attached to each query (sum across the 0/1 label columns).
# Selecting by name instead of position keeps this correct if column order changes.
rowsums = df2.drop(columns=['keyword', 'google_class']).sum(axis=1)
x = rowsums.value_counts()

# Bar chart: how many queries have 1, 2, 3, ... labels.
plt.figure(figsize=(8, 5))
# Keyword arguments are required by recent seaborn releases and accepted by old ones.
ax = sns.barplot(x=x.index, y=x.values)
plt.title("Multiple categories per query")
plt.ylabel('# of Occurrences', fontsize=12)
plt.xlabel('# of categories', fontsize=12)

### Number of characters per query

In [None]:
# Character length of each raw query, shown with one histogram bin per length.
lens = df2['keyword'].str.len()
lens.hist(bins=np.arange(89))

# clean text

In [None]:
# Strip all punctuation and extra whitespace, except dollar signs and hashtags.
def clean_text(text):
    """Normalise a query string for tokenisation.

    The text is lower-cased, common English contractions are expanded,
    every character that is not a word character, ``$`` or ``#`` is
    replaced by a space, and runs of whitespace are collapsed.

    Parameters
    ----------
    text : str
        Raw query text.

    Returns
    -------
    str
        Cleaned text with single spaces and no leading/trailing spaces.
    """
    removelist = '$#'
    text = text.lower()

    # Order matters: specific patterns must run before the generic ones that
    # would otherwise consume part of them ("'scuse" before "'s",
    # "can't" before "n't").
    contractions = [
        (r"what's", "what is "),
        (r"'scuse", " excuse "),
        (r"'s", " "),
        (r"'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"'re", " are "),
        (r"'d", " would "),
        (r"'ll", " will "),
    ]
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    # Escape the keep-list so its characters are always literal inside the class.
    text = re.sub(r"[^\w" + re.escape(removelist) + r"]", " ", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip(' ')

In [None]:
# Store the cleaned query next to the raw one, then spot-check a single row.
df2['keyword2'] = df2['keyword'].map(clean_text)
df2.loc[13, 'keyword2']

In [None]:
# Only the cleaned text and the 0/1 label columns are needed from here on.
df2 = df2.drop(columns=['keyword', 'google_class'])

# Export final DF

In [None]:
# Persist the cleaned text plus label columns (without the row index) to the
# current working directory so later sections can start from this file.
df2.to_csv("label_encoded_data.csv",index=False)

In [None]:
# Reload the encoded data set written by the export cell.
# keep_default_na=False stops pandas from turning query text such as "null",
# "na" or an empty string into NaN, which would break tokenisation later on.
df = pd.read_csv('label_encoded_data.csv', keep_default_na=False)
df.head()

# Tokenize the data

In [None]:
# Reproducible 67/33 split; `keyword2` is the model input, every other column is a label.
train, test = train_test_split(df, test_size=0.33, shuffle=True, random_state=42)

X_train, y_train = train['keyword2'], train.drop(columns=['keyword2'])
X_test, y_test = test['keyword2'], test.drop(columns=['keyword2'])

# Shapes, printed in the order: X_train, X_test, y_train, y_test.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

In [None]:
# Number of label columns; this is the width of the model's output layer.
number_feature_cols = y_test.shape[1]
print(number_feature_cols)

In [None]:
# Hyperparameters shared by the tokenizer, the padding step and the first model.
max_features = 20000  # vocabulary cap: passed to Tokenizer(num_words=...) and used as the Embedding input size
maxlen = 100  # length every token sequence is padded to by pad_sequences
batch_size = 64  # batch size for the model
embedding_dims = 20  # dimension of the hidden variable, i.e. the embedding dimension

In [None]:
# Build the vocabulary from the training queries only, so that nothing about
# the held-out test set leaks into the model's inputs. Test words that were
# never seen in training are dropped by texts_to_sequences.
tok = Tokenizer(num_words=max_features)
tok.fit_on_texts(list(X_train))

# Convert each query into a list of integer word ids.
x_train = tok.texts_to_sequences(X_train)
x_test = tok.texts_to_sequences(X_test)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

# Pad tokens to desired length

In [None]:
# Bring every id sequence to exactly `maxlen` entries so they stack into a 2-D array.
x_train, x_test = (
    sequence.pad_sequences(id_lists, maxlen=maxlen)
    for id_lists in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

# Build Model

In [None]:
# Input: one padded sequence of `maxlen` word ids per query.
comment_input = Input((maxlen,))

# Embedding layer: maps each vocabulary index to a trainable
# `embedding_dims`-dimensional vector.
comment_emb = Embedding(max_features, embedding_dims, input_length=maxlen,
                        embeddings_initializer="uniform")(comment_input)

# GlobalMaxPooling1D keeps, for every embedding dimension, the maximum value
# over all word positions of the query.
h = GlobalMaxPooling1D()(comment_emb)

# One sigmoid unit per label column: labels are scored independently, which
# is what a multi-label problem needs.
output = Dense(number_feature_cols, activation='sigmoid')(h)

model = Model(inputs=comment_input, outputs=output)

# Metrics are given by name so this cell does not depend on an import that
# only happens further down the notebook.
model.compile(loss='binary_crossentropy',
              optimizer=Adam(0.01),
              metrics=['accuracy', 'categorical_accuracy', 'binary_accuracy'])

# Train Model

In [None]:
# Train for three epochs, holding out 10% of the training rows for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=3,
    validation_split=0.1,
)

# Export Model

In [None]:
# Serialise the model to JSON and write it next to the notebook.
# NOTE(review): only the output of model.to_json() is stored here; no weights
# file is written in this cell — confirm whether weights need saving too.
model_json = model.to_json()
with open("keras_model1.json", "w") as json_file:
    json_file.write(model_json)

### Read model

In [None]:
from keras.models import model_from_json

In [None]:
# Rebuild a model from the exported JSON. The context manager guarantees the
# file is closed even if reading fails.
# NOTE(review): no weights are loaded here, so `loaded_model` presumably
# starts with fresh weights — confirm before using it for prediction.
with open('keras_model1.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# Evaluate model

In [None]:
# Peek at the first few cleaned test queries.
X_test.head()

In [None]:
# Tokenise and pad every test query. The queries must be passed as a flat
# list of strings: wrapping them in another list makes the tokenizer treat
# the whole test set as a single document whose "words" are entire queries.
X_test_array = sequence.pad_sequences(tok.texts_to_sequences(list(X_test)), maxlen=maxlen)

# Per-label probabilities for the first test query (one value per label column).
predicted = model.predict(X_test_array)[0]

In [None]:
# Re-number the test labels from 0 and expose them as a list of 0/1 rows.
y_test2 = y_test.reset_index(drop=True)
n = y_test2.values.tolist()

In [None]:
# Evaluate on the whole test set instead of on the first row only.
# Every (query, label) pair becomes one binary decision, so both arrays are
# flattened to 1-D vectors of 0/1 that line up element by element.
y_true = y_test.values.ravel()

# A label counts as predicted when its sigmoid probability exceeds 0.2.
y_pred = np.where(model.predict(x_test) > 0.2, 1, 0).ravel()
y_pred

In [None]:
from keras.metrics import binary_accuracy,categorical_accuracy

In [None]:
# Precision / recall / F1 summary, followed by the confusion matrix as a heatmap.
print(classification_report(y_true, y_pred))
conf_mat = confusion_matrix(y_true, y_pred)

binary_ticks = [0, 1]
heatmap_ax = sns.heatmap(conf_mat, annot=True, fmt='d',
                         xticklabels=binary_ticks, yticklabels=binary_ticks)
heatmap_ax.set_ylabel('Actual')
heatmap_ax.set_xlabel('Predicted')
heatmap_ax.set_title('Model Confusion Matrix')



# Visualize model training performance

In [None]:
# Older Keras logs accuracy under 'acc', newer releases under 'accuracy';
# use whichever key this run actually produced.
metrics_log = history.history
acc_key = 'acc' if 'acc' in metrics_log else 'accuracy'

acc = metrics_log[acc_key]
val_acc = metrics_log['val_' + acc_key]
loss = metrics_log['loss']
val_loss = metrics_log['val_loss']

epochs = range(len(acc))

# Accuracy curves: dots for training, line for validation.
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()

# Loss curves on a separate figure.
plt.figure()

plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()


# Now let's predict on a few test queries

In [None]:
# Reload the raw file so the original label strings can be shown next to the
# predictions.
df_raw = pd.read_csv("../data/data2.csv", encoding="utf-8")

# Test rows are looked up in df_raw by their original row index, which only
# works if the raw and the encoded frames describe the same rows.
assert len(df_raw) == len(df), "raw and encoded data sets differ in length"

df_raw.google_class = df_raw.google_class.astype(str)

# Turn the original row index of the test split into a regular 'index' column.
reset_test = test.reset_index()

In [None]:
# First ten test queries paired with their row index in the raw data.
reset_test = reset_test.rename(columns={'index': 'idx'})
test_queries = reset_test.keyword2.tolist()[:10]
test_indexes = reset_test.idx.tolist()[:10]

# Materialise the pairs: a bare zip object is exhausted after one pass, so
# re-running the cell that iterates over `t` would silently print nothing.
t = list(zip(test_queries, test_indexes))

In [None]:
# Show the raw model output currently held in `predicted`.
print(predicted)

In [None]:
# The order of the label columns matches the order of the model's output units.
selected_categories = y_test.columns

for query, row_idx in t:
    print("query : {0}".format(query))
    textArray = np.array(sequence.pad_sequences(tok.texts_to_sequences([query]), maxlen=maxlen))

    # One probability per label column. A separate name is used so the
    # notebook-level `predicted` from the evaluation section is not overwritten.
    probs = model.predict(textArray)[0]

    # A label is predicted when its probability exceeds the 0.2 threshold.
    predicted_list = [selected_categories[i]
                      for i, prob in enumerate(probs)
                      if prob > 0.2]

    # Normalise the raw label string exactly like the training labels
    # (" / " and spaces become "_"); otherwise multi-word labels can never
    # match the predicted column names.
    raw_tags = str(df_raw.google_class.loc[row_idx])
    true_tags = raw_tags.replace(" / ", "_").replace(" ", "_").split(",")

    print("predicted tags : {0}".format(predicted_list))
    print("true tags : {0}".format(true_tags))

    # Number of predicted labels that are true labels, as a share of the
    # number of true labels.
    count = sum(1 for tag in predicted_list if tag in true_tags)
    percent = round(100.0 * count / len(true_tags), 2)
    print("percentage of predicted in true tags: {0} %".format(str(percent)))
    print()
    print("****************************************")



# Try word2vec trained on biz_chat data


In [None]:
# Load the chat corpus that the word2vec model below is trained on.
df_bz = pd.read_csv("../data/biz_chat_pre_npi.csv")
df_bz.head()

In [None]:
# gensim / nltk helpers for the word2vec section.
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from gensim.models import Word2Vec, Phrases
# English stop-word list from nltk.
stop_words = stopwords.words('english')
import logging
# Emit INFO-level log records with a timestamped format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

### Simple cleaning and tokenization of chat utterances

In [None]:
# Tokenise every chat utterance with gensim's simple_preprocess.
df_bz['clean_text'] = df_bz['text'].apply(simple_preprocess)

In [None]:
# Quick inspection; only the last expression (the row count) is displayed.
df_bz.head()
len(df_bz)

In [None]:
# First five tokenised utterances.
df_bz['clean_text'].head(5).tolist()

### Build and train a W2V model

In [None]:
# Train a word2vec model on the tokenised chat utterances.
docs = df_bz.clean_text.tolist()
gs_model = Word2Vec(
        docs,
        size=150,     # requested vector size; later cells must use the same dimension
        window=10,
        min_count=2,
        workers=10)

# gs_model.train(docs, total_examples=len(docs), epochs=10)
 


### Try out word2vec similarity

In [None]:
# Sanity check of the embeddings: ask the model for the words most similar to 'iwatch'.
gs_model.wv.most_similar('iwatch')

### Instantiate word_vectors

In [None]:
# Shorthand for the model's word-vector object, used by the cells below.
word_vectors = gs_model.wv

In [None]:
# Words known to the word2vec model, and how many there are.
vocab = list(word_vectors.vocab)
len(vocab)

# Now we can start feeding w2v to keras

In [None]:
# Report the size of the word2vec vocabulary.
print("Number of word vectors: {}".format(len(word_vectors.vocab)))

In [None]:
# Upper bound on the number of words used for the embedding matrix
# (here: the full word2vec vocabulary).
MAX_NB_WORDS = len(word_vectors.vocab)
# Length passed as `maxlen` to pad_sequences and as `input_length` to the Embedding layers.
MAX_SEQUENCE_LENGTH = 200

In [None]:
# All cleaned queries as a plain Python list, in the row order of `df`.
documents = df['keyword2'].tolist()
documents[:3]

In [None]:
# train, test = train_test_split(df, random_state=42, test_size=0.33, shuffle=True)

In [None]:
# Sizes of the existing train/test split; only the last expression is displayed.
len(train)
len(test)

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Map every word2vec vocabulary word to an integer id. Id 0 is reserved for
# padding and for words that are not in the vocabulary, so real ids start at 1.
word_index = {word: i + 1 for i, word in enumerate(vocab)}

# Encode the queries of the *train* and *test* frames themselves. Slicing
# `documents` by position would not line up with the shuffled split, and the
# labels taken from `train` below would belong to different rows.
# Queries are split on whitespace so that ids are looked up per word.
sequences = [[word_index.get(token, 0) for token in doc.split()]
             for doc in train.keyword2.fillna('').astype(str)]

test_sequences = [[word_index.get(token, 0) for token in doc.split()]
                  for doc in test.keyword2.fillna('').astype(str)]

# pad
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH,
                     padding="pre", truncating="post")

# Label columns are taken from the split itself rather than from a variable
# that only exists after the sample-prediction loop has been run.
list_classes = y_train.columns
y = train[list_classes].values
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)

test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="pre",
                          truncating="post")
print('Shape of test_data tensor:', test_data.shape)

In [None]:
# Number of label columns the word2vec-based models have to predict.
len(list_classes)

In [None]:
# The embedding width must equal the dimensionality of the trained word2vec
# vectors. With a mismatched hard-coded value every row assignment below
# fails, and the matrix would silently stay random.
WV_DIM = word_vectors.vector_size

# Word ids start at 1 (0 is padding / unknown), so the matrix needs one more
# row than there are words.
nb_words = min(MAX_NB_WORDS, len(word_vectors.vocab)) + 1

# Start from small random values; rows of words without a pretrained vector
# (and row 0) keep this random initialisation.
wv_matrix = (np.random.rand(nb_words, WV_DIM) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= nb_words:
        continue
    # Explicit membership test instead of a bare `except`, so real errors
    # (for example a shape mismatch) are no longer swallowed.
    if word in word_vectors:
        wv_matrix[i] = word_vectors[word]

In [None]:
from keras.layers import Dense, Input,CuDNNLSTM, Embedding, Dropout,SpatialDropout1D, Bidirectional
from keras.models import Model
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization

In [None]:
# Baseline on top of the pretrained vectors: frozen embedding -> flatten ->
# one sigmoid unit per label.
model = Sequential()
wv_layer = Embedding(nb_words,
                     WV_DIM,
                     mask_zero=False,
                     weights=[wv_matrix],
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=False)
model.add(wv_layer)
model.add(Flatten())
model.add(Dense(len(list_classes), activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()


In [None]:
# Frozen embedding layer initialised from the word2vec matrix.
wv_layer = Embedding(
    nb_words,
    WV_DIM,
    weights=[wv_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    mask_zero=False,
    trainable=False,
)

# Input: one padded sequence of word ids per query.
comment_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# Embedding lookup, spatial dropout, then a bidirectional LSTM that returns
# only its final output.
embedded_sequences = SpatialDropout1D(0.2)(wv_layer(comment_input))
x = Bidirectional(CuDNNLSTM(64, return_sequences=False))(embedded_sequences)

# Output head: dropout, batch normalisation, one sigmoid unit per label.
x = BatchNormalization()(Dropout(0.2)(x))
preds = Dense(number_feature_cols, activation='sigmoid')(x)

# Assemble and compile the model (no extra metrics are tracked).
model = Model(inputs=[comment_input], outputs=preds)
adam = Adam(lr=0.001, clipnorm=.25, beta_1=0.7, beta_2=0.99)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=[])

In [None]:
# Inputs and labels must have the same number of rows; only the last
# expression (the label count) is displayed.
len(data)
len(y)

In [None]:
# Train the word2vec-based model, holding out 10% of the rows for validation.
hist = model.fit(
    [data],
    y,
    batch_size=256,
    epochs=10,
    shuffle=True,
    validation_split=0.1,
)