In [9]:
# This notebook is a simple version of what can be a more advanced product classification system
# Goal (Y): predict product segment
# Prior (X): item description

from google.colab import drive
drive.mount('/content/gdrive')
import pandas as pd
import numpy as np
df = pd.read_csv("/content/gdrive/My Drive/Neilsen/ItemsForModels.csv")

# Keep only the three modelling columns and give them identifier-friendly names.
# rename() maps by name, so a change in column order cannot mislabel a column.
data = df[['SEGMENT', 'LOCAL BRAND', 'Item Description']].rename(
    columns={'LOCAL BRAND': 'LOCAL_BRAND', 'Item Description': 'DESCRIPTION'}
)
print("original data description")
display(data.describe())

# Drop rows whose segment or brand occurs fewer than n times.
# The filters are applied one after the other: brand frequencies are counted
# on the rows that survived the segment filter.
# NOTE(review): the describe() counts before/after suggest rows with a missing
# SEGMENT / LOCAL_BRAND are removed by this step as well - confirm that is intended.
n = 50
for col in ['SEGMENT', 'LOCAL_BRAND']:
    data = data[data.groupby(col)[col].transform('count').ge(n)]

# Detach the filtered frame from its parent so that adding columns to `data`
# later does not raise pandas' SettingWithCopyWarning.
data = data.copy()
print("data description after dropping low frequency items")
display(data.describe())



Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
original data description


Unnamed: 0,SEGMENT,LOCAL_BRAND,DESCRIPTION
count,36615,36615,39621
unique,11,375,39518
top,JUISFLDR,AO,SEVEN UP DIET SUGAR FREE CANS 354-355ML REG/B (#)
freq,12067,14052,6


data description after dropping low frequency items


Unnamed: 0,SEGMENT,LOCAL_BRAND,DESCRIPTION
count,32946,32946,32946
unique,10,88,32871
top,JUISFLDR,AO,SEVEN UP DIET SUGAR FREE CANS 354-355ML REG/B (#)
freq,10762,14052,6


In [10]:
# Install the NLP stack with the interpreter that backs this kernel
# (sys.executable), so the packages land in the environment the notebook imports from.
import sys
!{sys.executable} -m pip install spacy-transformers torch
# NOTE(review): no versions are pinned and -U pulls the newest spaCy available;
# the captured output shows spaCy 2.2.3 - confirm that newer releases still work
# with spacy-transformers and the model downloaded below before re-running.
!{sys.executable} -m pip install -U spacy
# Download the pretrained XLNet pipeline that is later loaded by name with spacy.load().
!{sys.executable} -m spacy download en_trf_xlnetbasecased_lg

Requirement already up-to-date: spacy in /usr/local/lib/python3.6/dist-packages (2.2.3)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_trf_xlnetbasecased_lg')


In [0]:
from tqdm import tqdm_notebook as tqdm
import spacy
import torch
from keras.layers import Dense, Dropout, Activation, LSTM, Embedding
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn import preprocessing

# Map every segment name to an integer class id. `le` stays fitted so that
# predicted ids can be turned back into names with le.inverse_transform().
le = preprocessing.LabelEncoder()

# assign() builds a new frame instead of writing a column into a filtered slice,
# which avoids pandas' SettingWithCopyWarning and keeps the cell safe to re-run.
data = data.assign(SEGMENT_CODE=le.fit_transform(data['SEGMENT']))

In [42]:
# Fraction of the rows to embed; lower it (e.g. 0.50) for a quicker experiment.
SAMPLE_FRAC = 1.00
# Fixed seed: sample() also shuffles the rows, and the training cell uses the
# tail of the data as its validation set, so an unseeded shuffle would give a
# different train/validation split on every run.
SEED = 42
dataFrac = data.sample(frac=SAMPLE_FRAC, random_state=SEED)

# Run spaCy on the GPU when one is available and make torch allocate there too.
is_using_gpu = spacy.prefer_gpu()
if is_using_gpu:
    torch.set_default_tensor_type("torch.cuda.FloatTensor")

nlp = spacy.load("en_trf_xlnetbasecased_lg")

# x: one last-hidden-state matrix per description, as exposed by the pipeline.
# y: the matching integer segment code.
x = []
y = []
# Iterate over the two needed columns directly instead of iterrows(), which
# builds a full Series object for every row.
pairs = zip(dataFrac["DESCRIPTION"], dataFrac["SEGMENT_CODE"])
for txt, code in tqdm(pairs, total=len(dataFrac)):
    x.append(nlp(txt)._.trf_last_hidden_state)
    y.append(code)


HBox(children=(IntProgress(value=0, max=32946), HTML(value='')))




In [43]:
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# cupy is only needed when the hidden states were produced on the GPU; on a
# CPU-only runtime it may not be installed, so fall back to numpy's converter.
try:
    import cupy
    to_numpy = cupy.asnumpy
except ImportError:
    to_numpy = np.asarray

# Bring every per-document matrix to host memory as a numpy array.
x2 = [to_numpy(a) for a in x]

# Pad all documents to the length of the longest one.
# dtype must be given explicitly: pad_sequences defaults to 'int32', which
# would silently truncate the floating-point embeddings to whole numbers.
X = pad_sequences(x2, dtype='float32')

# One-hot encode the integer segment codes for categorical cross-entropy.
Y = to_categorical(np.array(y))
X.shape, Y.shape

((32946, 37, 768), (32946, 10))

In [44]:
from keras.models import Sequential
from keras.layers import LSTM, Dense
import numpy as np
from keras.callbacks import ModelCheckpoint

# Save a snapshot whenever the validation loss improves. The file name uses
# the 'acc' / 'val_acc' log entries, so the metric is requested under exactly
# that name in compile() below.
checkpoint = ModelCheckpoint(
    'model-{epoch:03d}-{acc:03f}-{val_acc:03f}.h5',
    verbose=1,
    monitor='val_loss',
    save_best_only=True,
    mode='auto',
)

# Derive the dimensions from the prepared arrays instead of hard-coding them,
# so the model follows the data when the encoder or the set of segments changes.
timesteps, data_dim = X.shape[1], X.shape[2]
num_classes = Y.shape[1]

# expected input data shape: (batch_size, timesteps, data_dim)
model = Sequential()
model.add(LSTM(256, return_sequences=True, input_shape=(timesteps, data_dim)))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(32))  # only the final state is passed on to the classifier
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['acc'])

# The checkpoint callback has to be passed to fit(); otherwise nothing is saved.
# validation_split holds out the last 10% of the rows of X / Y.
model.fit(X, Y, batch_size=64, epochs=10, validation_split=0.1, callbacks=[checkpoint])

Train on 29651 samples, validate on 3295 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f556ee1e550>