In [46]:
import pandas as pd
import nltk
import numpy as np

#load inthe NTLK stopwords to remove articles, preposition and other words that are not actionable
from nltk.corpus import stopwords
# This allows to create individual objects from a bog of words
from nltk.tokenize import word_tokenize, sent_tokenize
# Lemmatizer helps to reduce words to the base form
from nltk.stem import WordNetLemmatizer

In [47]:
df=pd.read_csv('summer-products-with-rating-and-performance_2020-08.csv')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/jasmineli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jasmineli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jasmineli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [48]:
def process_sentence(sentence):
    new_tokens = word_tokenize(sentence)
    new_tokens = [t.lower() for t in new_tokens]
    new_tokens =[t for t in new_tokens if t not in stopwords.words('english')]
    new_tokens = [t for t in new_tokens if t.isalpha()]
    lemmatizer = WordNetLemmatizer()
    new_tokens =[lemmatizer.lemmatize(t) for t in new_tokens]
    return " ".join(new_tokens)

In [49]:
titles = df['title_orig'].tolist()
tokens = [process_sentence(t) for t in titles]
df['title_pre'] = tokens
df.head()

Unnamed: 0,title,title_orig,price,retail_price,currency_buyer,units_sold,uses_ad_boosts,rating,rating_count,rating_five_count,...,merchant_rating,merchant_id,merchant_has_profile_picture,merchant_profile_picture,product_url,product_picture,product_id,theme,crawl_month,title_pre
0,2020 Summer Vintage Flamingo Print Pajamas Se...,2020 Summer Vintage Flamingo Print Pajamas Se...,16.0,14,EUR,100,0,3.76,54,26.0,...,4.128521,595097d6a26f6e070cb878d1,0,,https://www.wish.com/c/5e9ae51d43d6a96e303acdb0,https://contestimg.wish.com/api/webimage/5e9ae...,5e9ae51d43d6a96e303acdb0,summer,2020-08,summer vintage flamingo print pajama set casua...
1,SSHOUSE Summer Casual Sleeveless Soirée Party ...,Women's Casual Summer Sleeveless Sexy Mini Dress,8.0,22,EUR,20000,1,3.45,6135,2269.0,...,3.899673,56458aa03a698c35c9050988,0,,https://www.wish.com/c/58940d436a0d3d5da4e95a38,https://contestimg.wish.com/api/webimage/58940...,58940d436a0d3d5da4e95a38,summer,2020-08,woman casual summer sleeveless sexy mini dress
2,2020 Nouvelle Arrivée Femmes Printemps et Été ...,2020 New Arrival Women Spring and Summer Beach...,8.0,43,EUR,100,0,3.57,14,5.0,...,3.989831,5d464a1ffdf7bc44ee933c65,0,,https://www.wish.com/c/5ea10e2c617580260d55310a,https://contestimg.wish.com/api/webimage/5ea10...,5ea10e2c617580260d55310a,summer,2020-08,new arrival woman spring summer beach wear lon...
3,Hot Summer Cool T-shirt pour les femmes Mode T...,Hot Summer Cool T Shirt for Women Fashion Tops...,8.0,8,EUR,5000,1,4.03,579,295.0,...,4.020435,58cfdefdacb37b556efdff7c,0,,https://www.wish.com/c/5cedf17ad1d44c52c59e4aca,https://contestimg.wish.com/api/webimage/5cedf...,5cedf17ad1d44c52c59e4aca,summer,2020-08,hot summer cool shirt woman fashion top bee pr...
4,Femmes Shorts d'été à lacets taille élastique ...,Women Summer Shorts Lace Up Elastic Waistband ...,2.72,3,EUR,100,1,3.1,20,6.0,...,4.001588,5ab3b592c3911a095ad5dadb,0,,https://www.wish.com/c/5ebf5819ebac372b070b0e70,https://contestimg.wish.com/api/webimage/5ebf5...,5ebf5819ebac372b070b0e70,summer,2020-08,woman summer short lace elastic waistband loos...


### Sentence Transformers: https://huggingface.co/sentence-transformers

In [50]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L12-v2')

You try to use a model that was created with version 1.2.0, however, your version is 1.1.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.





In [51]:
# df['bert_encode'] = [model.encode(sen) for sen in df['title_orig']]
df['bert_encode'] = [model.encode(sen) for sen in df['title_pre']]

In [52]:
# product color
def main_color(s):
    main_color = {"red":"red", "white":"white", "pink":"pink", "yellow":"yellow", "green":"green", "blue":"blue", "wine":"red", "burgundy":"red", "black":"black", "navy":"navy", "orange":"orange", 
    "rose":"pink", "gray":"gray", "grey":"gray", "purple":"purple", "violet":"purple", "army":"green", "leopard":"orange", "ivory":"white", 
    "brown":"brown", "coffee":"brown", "camel":"beige", "tan":"brown", "nude":"beige", "khaki":"khaki", "apricot":"yellow", "camouflage":"green", "jasper":"red"}  # ordered by importance
    for key, value in main_color.items():
        if key in s:
            return value
    return "others"
product_color = df["product_color"]
product_color = [s.lower() if type(s) is str else 'nan' for s in product_color]
product_color = [main_color(s) for s in product_color]
from matplotlib import colors
product_color = [(-0.1,-0.1,-0.1,-0.1) if s == "others" else colors.to_rgba(s) for s in product_color]

df['product_color_rgb'] = [np.array(t) for t in product_color]

# log prices
df['log_price'] = [np.log(p) for p in df["price"]]
df['log_retail_price'] = [np.log(p) for p in df["retail_price"]]

# log merchant rating count
df['log_merchant_rating_count'] = np.log(df['merchant_rating_count'])

# urgent text
df['urgent'] = [1 if s == "Quantité limitée !" else 0 for s in df["urgency_text"]]

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [53]:
rgb = df["product_color_rgb"]
rgb = np.stack(rgb.values, axis=0)
for i in range(4):
    df["product_color_rgb"+str(i)] = rgb[:,i]

In [54]:
data = df[["log_price", "log_retail_price", "uses_ad_boosts", "badges_count", "badge_local_product", 
           "badge_product_quality", "badge_fast_shipping", "urgent", "units_sold"]]
label = [1 if sales > 100 else 0 for sales in data["units_sold"]]
data['high_sale'] = label
bert = np.stack(df['bert_encode'].values, axis=0)
for i in range(384):
    data["title_bert"+str(i)] = bert[:, i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [55]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [56]:
X = data.loc[:, ~data.columns.isin(['high_sale', 'units_sold'])]
y = data['high_sale']

In [57]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from keras.layers import Input, Dense, Concatenate, Bidirectional
from keras.models import Model
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization, LSTM
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.14286, random_state=42)

In [64]:
print ("number of training examples = " + str(X_train.shape[0]))
print ("number of dev examples = " + str(X_dev.shape[0]))
print ("number of test examples = " + str(X_test.shape[0]))
print ("X_train shape: " + str(X_train.shape))
print ("Y_train shape: " + str(y_train.shape))
print ("X_dev shape: " + str(X_dev.shape))
print ("Y_dev shape: " + str(y_dev.shape))
print ("X_test shape: " + str(X_test.shape))
print ("Y_test shape: " + str(y_test.shape))

number of training examples = 1078
number of dev examples = 180
number of test examples = 315
X_train shape: (1078, 392)
Y_train shape: (1078,)
X_dev shape: (180, 392)
Y_dev shape: (180,)
X_test shape: (315, 392)
Y_test shape: (315,)


In [65]:
# Defining hold out data for evaluation 
X_train = tf.expand_dims(X_train, axis=-1)
input_shape = X_train.shape[1:]

In [72]:
print ("X_train shape: " + str(X_train.shape))
X_dev = tf.expand_dims(X_dev, axis=-1)
print ("X_dev shape: " + str(X_dev.shape))

X_train shape: (1078, 392, 1)
X_dev shape: (180, 392, 1)


In [73]:
# define model
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(128, activation='relu', kernel_regularizer='l2'))
model.add(Dense(32, activation='relu', kernel_regularizer='l2'))
model.add(Dense(1, activation='sigmoid'))

In [74]:
# lr_schedule = keras.optimizers.schedules.ExponentialDecay(
#     initial_learning_rate=1e-2,
#     decay_steps=10000,
#     decay_rate=0.9)
# optimizer = keras.optimizers.Adam(learning_rate=lr_schedule)

model.compile(optimizer='Adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)

In [75]:
history = model.fit(
    X_train,
    y_train,
    epochs=1000,
    verbose=2,
    validation_data=(X_dev, y_dev),
    batch_size=32,
    callbacks=[es]
)
model.summary()

Epoch 1/1000


To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

34/34 - 0s - loss: 1.7670 - accuracy: 0.4221 - val_loss: 0.9891 - val_accuracy: 0.3944
Epoch 2/1000
34/34 - 0s - loss: 0.9171 - accuracy: 0.4295 - val_loss: 0.8363 - val_accuracy: 0.4278
Epoch 3/1000
34/34 - 0s - loss: 0.8065 - accuracy: 0.4518 - val_loss: 0.7595 - val_accuracy: 0.3889
Epoch 4/1000
34/34 - 0s - loss: 0.7487 - accuracy: 0.4314 - val_loss: 0.7173 - val_accuracy: 0.3889
Epoch 5/1000
34/34 - 0s - loss: 0.7141 - accuracy: 0.4128 - val_loss: 0.6933 - val_accuracy: 0.3889
Epoch 6/1000
34/34 - 0s - loss: 0.6961 - accuracy: 0.4128 - val_loss: 0.6809 - val_accuracy: 0.3889
Epoch 7/1000
34/34 - 0s - loss: 0.6880 - accuracy: 0.4128 - val_loss: 0.6750 - val_accuracy: 0.3889

Epoch 76/1000
34/34 - 0s - loss: 0.6506 - accuracy: 0.6753 - val_loss: 0.6648 - val_accuracy: 0.6278
Epoch 77/1000
34/34 - 0s - loss: 0.6481 - accuracy: 0.6855 - val_loss: 0.6663 - val_accuracy: 0.6056
Epoch 78/1000
34/34 - 0s - loss: 0.6446 - accuracy: 0.6967 - val_loss: 0.6675 - val_accuracy: 0.5833
Epoch 79/1000
34/34 - 0s - loss: 0.6455 - accuracy: 0.6855 - val_loss: 0.6666 - val_accuracy: 0.6000
Epoch 80/1000
34/34 - 0s - loss: 0.6412 - accuracy: 0.7050 - val_loss: 0.6766 - val_accuracy: 0.5222
Epoch 81/1000
34/34 - 0s - loss: 0.6506 - accuracy: 0.6744 - val_loss: 0.6676 - val_accuracy: 0.6278
Epoch 82/1000
34/34 - 0s - loss: 0.6500 - accuracy: 0.6883 - val_loss: 0.6727 - val_accuracy: 0.5500
Epoch 83/1000
34/34 - 0s - loss: 0.6446 - accuracy: 0.6948 - val_loss: 0.6660 - val_accuracy: 0.6389
Epoch 00083: early stopping
Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_5 (