In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dropout, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split 

In [2]:
vocab_size = 10000
embedding_dim = 16
max_length = 500
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size = 35000

In [3]:
with open('corpus.txt', 'r', encoding='utf-8') as file:
    description = list(file.readlines())
    
with open('final_categories.txt', 'rt') as file:
    primary_category = list(file.readlines())
    
df = pd.DataFrame()
df["description"] = description 
df["primary_category"] = primary_category

df.head(3)

Unnamed: 0,description,primary_category
0,alisha solid woman cycling short cotton lycra ...,Clothing\n
1,fabhomedecor fabric double sofa bed finish col...,Furniture\n
2,belly sandal wedge heel casuals belly price ma...,Footwear\n


In [4]:
df["description"] = df["description"].str.replace('\n', '') 
df["primary_category"] = df["primary_category"].str.replace('\n', '')

df.head(3)

Unnamed: 0,description,primary_category
0,alisha solid woman cycling short cotton lycra ...,Clothing
1,fabhomedecor fabric double sofa bed finish col...,Furniture
2,belly sandal wedge heel casuals belly price ma...,Footwear


In [5]:
X = np.array(df["description"])
y = np.array(df["primary_category"])

training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(X, y, test_size=0.20, random_state=73)

print(training_sentences.shape)
print(testing_sentences.shape)
print(training_labels.shape)
print(testing_labels.shape)

(15732,)
(3934,)
(15732,)
(3934,)


In [6]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
training_labels = le.fit_transform(training_labels)

training_labels

array([ 3, 20,  5, ..., 13, 26, 17])

In [7]:
testing_labels = le.transform(testing_labels)

testing_labels

array([17,  6,  3, ..., 24, 17, 13])

In [8]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)

word_index = tokenizer.word_index

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [9]:
print(len(training_sequences))
print(training_padded)

15732
[[1940  232 2460 ...    0    0    0]
 [  13    1  728 ...    0    0    0]
 [2599   10  184 ...    0    0    0]
 ...
 [2766  401  223 ...    0    0    0]
 [ 975  126   94 ...    0    0    0]
 [ 233   58   24 ...    0    0    0]]


In [10]:
training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)

In [14]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='softmax')
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [15]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 16)           160000    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 24)                408       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 25        
Total params: 160,433
Trainable params: 160,433
Non-trainable params: 0
_________________________________________________________________


In [16]:
num_epochs = 10
history = model.fit(training_padded, training_labels, epochs=num_epochs, 
                    validation_data=(testing_padded, testing_labels), verbose=2)

Epoch 1/10
492/492 - 3s - loss: -2.9233e+02 - accuracy: 0.0230 - val_loss: -1.2199e+03 - val_accuracy: 0.0308
Epoch 2/10
492/492 - 1s - loss: -4.5135e+03 - accuracy: 0.0230 - val_loss: -9.5691e+03 - val_accuracy: 0.0308
Epoch 3/10
492/492 - 1s - loss: -1.8412e+04 - accuracy: 0.0230 - val_loss: -2.9817e+04 - val_accuracy: 0.0308
Epoch 4/10
492/492 - 1s - loss: -4.5574e+04 - accuracy: 0.0230 - val_loss: -6.4707e+04 - val_accuracy: 0.0308
Epoch 5/10
492/492 - 1s - loss: -8.8291e+04 - accuracy: 0.0230 - val_loss: -1.1614e+05 - val_accuracy: 0.0308
Epoch 6/10
492/492 - 2s - loss: -1.4827e+05 - accuracy: 0.0230 - val_loss: -1.8559e+05 - val_accuracy: 0.0308
Epoch 7/10
492/492 - 1s - loss: -2.2656e+05 - accuracy: 0.0230 - val_loss: -2.7390e+05 - val_accuracy: 0.0308
Epoch 8/10
492/492 - 1s - loss: -3.2412e+05 - accuracy: 0.0230 - val_loss: -3.8225e+05 - val_accuracy: 0.0308
Epoch 9/10
492/492 - 1s - loss: -4.4196e+05 - accuracy: 0.0230 - val_loss: -5.1138e+05 - val_accuracy: 0.0308
Epoch 10/1

In [37]:
print(history)

<tensorflow.python.keras.callbacks.History object at 0x0000023B47313670>


In [38]:
model.predict([["style foot belly ballerina shoe ballerina flat style foot belly price ballet shoe fit perfectly casual party wear specification style foot belly general occasion casual ideal woman shoe heel height inch outer material color black slipper"]])



UnimplementedError:  Cast string to float is not supported
	 [[node sequential_3/Cast (defined at <ipython-input-38-17486c8be718>:1) ]] [Op:__inference_predict_function_27545]

Function call stack:
predict_function


In [8]:
def TFIDF(X_train, X_test, MAX_NB_WORDS=75000):
    tfidf_vec = TfidfVectorizer(max_features = MAX_NB_WORDS)
    X_train = tfidf_vec.fit_transform(X_train).toarray()
    X_test = tfidf_vec.transform(X_test).toarray()
    print("tf-idf with", str(np.array(X_train).shape[1]), "features")
    return (X_train, X_test)

In [11]:
def Build_Model_DNN_Text(shape, nClasses, dropout=0.5):
    """
    buildModel_DNN_Tex(shape, nClasses,dropout)
    Build Deep neural networks Model for text classification
    Shape is input feature space
    nClasses is number of classes
    """
    model = Sequential()
    node = 512 # number of nodes
    nLayers = 4 # number of  hidden layer
    model.add(Dense(node,input_dim=shape,activation='relu'))
    model.add(Dropout(dropout))
    for i in range(0,nLayers):
        model.add(Dense(node, input_dim=node, activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(nClasses, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

In [12]:
X_train_tfidf, X_test_tfidf = TFIDF(training_sentences, testing_sentences)

model_DNN = Build_Model_DNN_Text(X_train_tfidf.shape[1], 28)
history = model_DNN.fit(X_train_tfidf, training_labels,
                              validation_data=(X_test_tfidf, testing_labels),
                              epochs=100,
                              batch_size=128,
                              verbose=2)

predicted = model_DNN.predict_classes(X_test_tfidf)
print(metrics.classification_report(testing_labels, predicted))

tf-idf with 13595 features
Epoch 1/50
123/123 - 10s - loss: 1.6849 - accuracy: 0.4729 - val_loss: 0.9535 - val_accuracy: 0.6769
Epoch 2/50
123/123 - 9s - loss: 0.6458 - accuracy: 0.7916 - val_loss: 0.3558 - val_accuracy: 0.9026
Epoch 3/50
123/123 - 9s - loss: 0.3112 - accuracy: 0.9006 - val_loss: 0.2966 - val_accuracy: 0.9164
Epoch 4/50
123/123 - 9s - loss: 0.2084 - accuracy: 0.9305 - val_loss: 0.2407 - val_accuracy: 0.9385
Epoch 5/50
123/123 - 9s - loss: 0.1532 - accuracy: 0.9512 - val_loss: 0.2381 - val_accuracy: 0.9459
Epoch 6/50
123/123 - 9s - loss: 0.1167 - accuracy: 0.9647 - val_loss: 0.1874 - val_accuracy: 0.9583
Epoch 7/50
123/123 - 10s - loss: 0.0861 - accuracy: 0.9723 - val_loss: 0.1863 - val_accuracy: 0.9642
Epoch 8/50
123/123 - 10s - loss: 0.0787 - accuracy: 0.9762 - val_loss: 0.1801 - val_accuracy: 0.9652
Epoch 9/50
123/123 - 9s - loss: 0.0656 - accuracy: 0.9808 - val_loss: 0.1874 - val_accuracy: 0.9682
Epoch 10/50
123/123 - 9s - loss: 0.0487 - accuracy: 0.9865 - val_loss:



              precision    recall  f1-score   support

           0       0.98      0.97      0.98       193
           1       0.96      0.89      0.92       121
           2       0.95      0.93      0.94        43
           3       0.96      0.96      0.96       135
           4       0.78      0.93      0.85        15
           5       1.00      0.99      0.99      1218
           6       0.96      0.94      0.95       126
           7       1.00      1.00      1.00         1
           8       1.00      0.99      0.99       259
           9       1.00      1.00      1.00        39
          10       0.57      0.80      0.67        10
          11       1.00      0.86      0.92         7
          12       0.67      0.50      0.57         4
          13       0.98      0.99      0.99       185
          14       0.50      0.67      0.57         3
          15       0.98      0.99      0.98       130
          16       0.45      0.94      0.61        16
          17       1.00    

In [1]:
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

NameError: name 'plt' is not defined