In [1]:
import pandas as pd
import nltk
import numpy as np

#load inthe NTLK stopwords to remove articles, preposition and other words that are not actionable
from nltk.corpus import stopwords
# This allows to create individual objects from a bog of words
from nltk.tokenize import word_tokenize, sent_tokenize
# Lemmatizer helps to reduce words to the base form
from nltk.stem import WordNetLemmatizer

In [2]:
df=pd.read_csv('summer-products-with-rating-and-performance_2020-08.csv')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/jasmineli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jasmineli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jasmineli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [4]:
df['bert_encode'] = [model.encode(sen) for sen in df['title_orig']]

In [5]:
# product color
def main_color(s):
    main_color = {"red":"red", "white":"white", "pink":"pink", "yellow":"yellow", "green":"green", "blue":"blue", "wine":"red", "burgundy":"red", "black":"black", "navy":"navy", "orange":"orange", 
    "rose":"pink", "gray":"gray", "grey":"gray", "purple":"purple", "violet":"purple", "army":"green", "leopard":"orange", "ivory":"white", 
    "brown":"brown", "coffee":"brown", "camel":"beige", "tan":"brown", "nude":"beige", "khaki":"khaki", "apricot":"yellow", "camouflage":"green", "jasper":"red"}  # ordered by importance
    for key, value in main_color.items():
        if key in s:
            return value
    return "others"
product_color = df["product_color"]
product_color = [s.lower() if type(s) is str else 'nan' for s in product_color]
product_color = [main_color(s) for s in product_color]
from matplotlib import colors
product_color = [(-0.1,-0.1,-0.1,-0.1) if s == "others" else colors.to_rgba(s) for s in product_color]

df['product_color_rgb'] = [np.array(t) for t in product_color]

# log prices
df['log_price'] = [np.log(p) for p in df["price"]]
df['log_retail_price'] = [np.log(p) for p in df["retail_price"]]

# log merchant rating count
df['log_merchant_rating_count'] = np.log(df['merchant_rating_count'])

# urgent text
df['urgent'] = [1 if s == "Quantité limitée !" else 0 for s in df["urgency_text"]]

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [6]:
rgb = df["product_color_rgb"]
rgb = np.stack(rgb.values, axis=0)
for i in range(4):
    df["product_color_rgb"+str(i)] = rgb[:,i]

In [7]:
data = df[["log_price", "log_retail_price", "uses_ad_boosts", "badges_count", "badge_local_product", 
           "badge_product_quality", "badge_fast_shipping", "urgent", "units_sold"]]
label = [1 if sales > 200 else 0 for sales in data["units_sold"]]
data['high_sale'] = label
bert = np.stack(df['bert_encode'].values, axis=0)
for i in range(384):
    data["title_bert"+str(i)] = bert[:, i]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
  import sys


In [8]:
data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [9]:
X = data.loc[:, ~data.columns.isin(['high_sale', 'units_sold'])]
y = data['high_sale']

In [10]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from keras.layers import Input, Dense, Concatenate
from keras.models import Model
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.125, random_state=42)
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.14286, random_state=42)

In [11]:
print ("number of training examples = " + str(X_train.shape[0]))
print ("number of dev examples = " + str(X_dev.shape[0]))
print ("number of test examples = " + str(X_test.shape[0]))
print ("X_train shape: " + str(X_train.shape))
print ("Y_train shape: " + str(y_train.shape))
print ("X_dev shape: " + str(X_dev.shape))
print ("Y_dev shape: " + str(y_dev.shape))
print ("X_test shape: " + str(X_test.shape))
print ("Y_test shape: " + str(y_test.shape))

number of training examples = 1179
number of dev examples = 197
number of test examples = 197
X_train shape: (1179, 392)
Y_train shape: (1179,)
X_dev shape: (197, 392)
Y_dev shape: (197,)
X_test shape: (197, 392)
Y_test shape: (197,)


In [12]:
# Defining hold out data for evaluation 
evals_X = X[-100:]
evals_y = y[-100:]
print(evals_y.shape)
print(evals_X.shape)
X_train = tf.expand_dims(X_train, axis=-1)
input_shape = X_train.shape[1:]

(100,)
(100, 392)


2021-11-06 17:51:49.544309: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
# define model
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.1))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [14]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)

In [15]:
history = model.fit(
    X_train,
    y_train,
    epochs=1000,
    verbose=2,
    validation_data=(evals_X, evals_y),
    batch_size=32,
    callbacks=[es]
)
model.summary()

Epoch 1/1000


  return dispatch_target(*args, **kwargs)


37/37 - 1s - loss: 0.6930 - accuracy: 0.5768 - val_loss: 0.6373 - val_accuracy: 0.6500 - 894ms/epoch - 24ms/step


  return dispatch_target(*args, **kwargs)


Epoch 2/1000
37/37 - 0s - loss: 0.6684 - accuracy: 0.6022 - val_loss: 0.6641 - val_accuracy: 0.6500 - 324ms/epoch - 9ms/step
Epoch 3/1000
37/37 - 0s - loss: 0.6544 - accuracy: 0.6209 - val_loss: 0.6078 - val_accuracy: 0.6600 - 322ms/epoch - 9ms/step
Epoch 4/1000
37/37 - 0s - loss: 0.6410 - accuracy: 0.6226 - val_loss: 0.6250 - val_accuracy: 0.7000 - 297ms/epoch - 8ms/step
Epoch 5/1000
37/37 - 0s - loss: 0.6211 - accuracy: 0.6616 - val_loss: 0.5877 - val_accuracy: 0.6700 - 302ms/epoch - 8ms/step
Epoch 6/1000
37/37 - 0s - loss: 0.6088 - accuracy: 0.6751 - val_loss: 0.5757 - val_accuracy: 0.6900 - 314ms/epoch - 8ms/step
Epoch 7/1000
37/37 - 0s - loss: 0.5886 - accuracy: 0.7006 - val_loss: 0.5596 - val_accuracy: 0.6900 - 312ms/epoch - 8ms/step
Epoch 8/1000
37/37 - 0s - loss: 0.5754 - accuracy: 0.6938 - val_loss: 0.5452 - val_accuracy: 0.7400 - 305ms/epoch - 8ms/step
Epoch 9/1000
37/37 - 0s - loss: 0.5438 - accuracy: 0.7379 - val_loss: 0.5336 - val_accuracy: 0.7800 - 307ms/epoch - 8ms/step


37/37 - 0s - loss: 0.1560 - accuracy: 0.9321 - val_loss: 0.2187 - val_accuracy: 0.9100 - 315ms/epoch - 9ms/step
Epoch 68/1000
37/37 - 0s - loss: 0.2065 - accuracy: 0.9186 - val_loss: 0.2587 - val_accuracy: 0.8900 - 311ms/epoch - 8ms/step
Epoch 69/1000
37/37 - 0s - loss: 0.1547 - accuracy: 0.9389 - val_loss: 0.2014 - val_accuracy: 0.9400 - 323ms/epoch - 9ms/step
Epoch 70/1000
37/37 - 0s - loss: 0.1313 - accuracy: 0.9534 - val_loss: 0.2105 - val_accuracy: 0.9300 - 323ms/epoch - 9ms/step
Epoch 71/1000
37/37 - 0s - loss: 0.1338 - accuracy: 0.9534 - val_loss: 0.2215 - val_accuracy: 0.9200 - 331ms/epoch - 9ms/step
Epoch 72/1000
37/37 - 0s - loss: 0.1421 - accuracy: 0.9491 - val_loss: 0.2080 - val_accuracy: 0.9400 - 320ms/epoch - 9ms/step
Epoch 73/1000
37/37 - 0s - loss: 0.1380 - accuracy: 0.9567 - val_loss: 0.2051 - val_accuracy: 0.9400 - 367ms/epoch - 10ms/step
Epoch 74/1000
37/37 - 0s - loss: 0.1383 - accuracy: 0.9491 - val_loss: 0.2201 - val_accuracy: 0.9300 - 349ms/epoch - 9ms/step
Epoch

Epoch 132/1000
37/37 - 0s - loss: 0.0992 - accuracy: 0.9593 - val_loss: 0.1557 - val_accuracy: 0.9400 - 353ms/epoch - 10ms/step
Epoch 133/1000
37/37 - 0s - loss: 0.1243 - accuracy: 0.9525 - val_loss: 0.1604 - val_accuracy: 0.9500 - 334ms/epoch - 9ms/step
Epoch 134/1000
37/37 - 0s - loss: 0.1176 - accuracy: 0.9491 - val_loss: 0.1922 - val_accuracy: 0.9200 - 377ms/epoch - 10ms/step
Epoch 135/1000
37/37 - 0s - loss: 0.1093 - accuracy: 0.9559 - val_loss: 0.2050 - val_accuracy: 0.9200 - 354ms/epoch - 10ms/step
Epoch 136/1000
37/37 - 0s - loss: 0.0971 - accuracy: 0.9618 - val_loss: 0.2491 - val_accuracy: 0.8800 - 339ms/epoch - 9ms/step
Epoch 137/1000
37/37 - 0s - loss: 0.0960 - accuracy: 0.9635 - val_loss: 0.1681 - val_accuracy: 0.9400 - 367ms/epoch - 10ms/step
Epoch 138/1000
37/37 - 0s - loss: 0.0868 - accuracy: 0.9661 - val_loss: 0.2661 - val_accuracy: 0.9000 - 341ms/epoch - 9ms/step
Epoch 139/1000
37/37 - 0s - loss: 0.0925 - accuracy: 0.9644 - val_loss: 0.2490 - val_accuracy: 0.9100 - 326

Epoch 197/1000
37/37 - 0s - loss: 0.0951 - accuracy: 0.9635 - val_loss: 0.1131 - val_accuracy: 0.9600 - 379ms/epoch - 10ms/step
Epoch 198/1000
37/37 - 0s - loss: 0.0788 - accuracy: 0.9678 - val_loss: 0.1903 - val_accuracy: 0.9300 - 369ms/epoch - 10ms/step
Epoch 199/1000
37/37 - 0s - loss: 0.0968 - accuracy: 0.9610 - val_loss: 0.1736 - val_accuracy: 0.9400 - 360ms/epoch - 10ms/step
Epoch 200/1000
37/37 - 0s - loss: 0.0773 - accuracy: 0.9695 - val_loss: 0.2126 - val_accuracy: 0.9100 - 394ms/epoch - 11ms/step
Epoch 201/1000
37/37 - 0s - loss: 0.0869 - accuracy: 0.9644 - val_loss: 0.1785 - val_accuracy: 0.9400 - 379ms/epoch - 10ms/step
Epoch 202/1000
37/37 - 0s - loss: 0.0799 - accuracy: 0.9661 - val_loss: 0.1751 - val_accuracy: 0.9400 - 365ms/epoch - 10ms/step
Epoch 203/1000
37/37 - 0s - loss: 0.0834 - accuracy: 0.9652 - val_loss: 0.1486 - val_accuracy: 0.9400 - 373ms/epoch - 10ms/step
Epoch 204/1000
37/37 - 0s - loss: 0.0817 - accuracy: 0.9627 - val_loss: 0.1885 - val_accuracy: 0.9300 - 