In [1]:
import pandas as pd
import nltk
import numpy as np

#load inthe NTLK stopwords to remove articles, preposition and other words that are not actionable
from nltk.corpus import stopwords
# This allows to create individual objects from a bog of words
from nltk.tokenize import word_tokenize, sent_tokenize
# Lemmatizer helps to reduce words to the base form
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df=pd.read_csv('summer-products-with-rating-and-performance_2020-08.csv')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/jasmineli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jasmineli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jasmineli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['title_orig'])

In [4]:
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
tfidf = pd.DataFrame(denselist, columns=feature_names)



In [5]:
print(tfidf.shape)

(1573, 1396)


In [6]:
# product color
def main_color(s):
  main_color = {"red":"red", "white":"white", "pink":"pink", "yellow":"yellow", "green":"green", "blue":"blue", "wine":"red", "burgundy":"red", "black":"black", "navy":"navy", "orange":"orange", 
  "rose":"pink", "gray":"gray", "grey":"gray", "purple":"purple", "violet":"purple", "army":"green", "leopard":"orange", "ivory":"white", 
  "brown":"brown", "coffee":"brown", "camel":"beige", "tan":"brown", "nude":"beige", "khaki":"khaki", "apricot":"yellow", "camouflage":"green", "jasper":"red"}  # ordered by importance
  for key, value in main_color.items():
    if key in s:
        return value
    return "others"
product_color = df["product_color"]
product_color = [s.lower() if type(s) is str else 'nan' for s in product_color]
product_color = [main_color(s) for s in product_color]
from matplotlib import colors
product_color = [(-0.1,-0.1,-0.1,-0.1) if s == "others" else colors.to_rgba(s) for s in product_color]

df['product_color_rgb'] = [np.array(t) for t in product_color]

# log prices
df['log_price'] = [np.log(p) for p in df["price"]]
df['log_retail_price'] = [np.log(p) for p in df["retail_price"]]

# log merchant rating count
df['log_merchant_rating_count'] = np.log(df['merchant_rating_count'])

# urgent text
df['urgent'] = [1 if s == "Quantité limitée !" else 0 for s in df["urgency_text"]]

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [7]:
data = df[["log_price", "log_retail_price", "uses_ad_boosts", "badges_count", "badge_local_product", 
           "badge_product_quality", "badge_fast_shipping", "urgent", "units_sold"]]
df2 = pd.concat([data, tfidf], axis=1)
label = [1 if sales > 200 else 0 for sales in data["units_sold"]]
df2['high_sale'] = label

In [8]:
rgb = df["product_color_rgb"]
rgb = np.stack(rgb.values, axis=0)
for i in range(4):
    df2["product_color_rgb"+str(i)] = rgb[:,i]
df2.replace([np.inf, -np.inf], np.nan, inplace=True)
print(df2.shape)

(1573, 1410)


In [9]:
X = df2.loc[:, ~df2.columns.isin(['high_sale', 'units_sold'])]
y = df2['high_sale']

In [10]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from keras.layers import Input, Dense, Concatenate
from keras.models import Model
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.125, random_state=42)
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.14286, random_state=42)

In [11]:
print ("number of training examples = " + str(X_train.shape[0]))
print ("number of dev examples = " + str(X_dev.shape[0]))
print ("number of test examples = " + str(X_test.shape[0]))
print ("X_train shape: " + str(X_train.shape))
print ("Y_train shape: " + str(y_train.shape))
print ("X_dev shape: " + str(X_dev.shape))
print ("Y_dev shape: " + str(y_dev.shape))
print ("X_test shape: " + str(X_test.shape))
print ("Y_test shape: " + str(y_test.shape))

number of training examples = 1179
number of dev examples = 197
number of test examples = 197
X_train shape: (1179, 1408)
Y_train shape: (1179,)
X_dev shape: (197, 1408)
Y_dev shape: (197,)
X_test shape: (197, 1408)
Y_test shape: (197,)


In [12]:
# Defining hold out data for evaluation 
evals_X = X[-100:]
evals_y = y[-100:]
X_train = tf.expand_dims(X_train, axis=-1)
input_shape = X_train.shape[1:]

2021-11-06 16:54:01.772208: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [13]:
print(input_shape)

(1408, 1)


In [14]:
# from sklearn.linear_model import LogisticRegression

# classifier = LogisticRegression()
# classifier.fit(X_train, y_train)

# score = classifier.score(evals_X, evals_y)

# print("Accuracy:", score)

In [15]:
# define model
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.1))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [16]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=50)

In [None]:
history = model.fit(
    X_train,
    y_train,
    epochs=1000,
    verbose=2,
    validation_data=(evals_X, evals_y),
    batch_size=32,
    callbacks=[es]
)
model.summary()

Epoch 1/1000


  return dispatch_target(*args, **kwargs)


37/37 - 1s - loss: 0.6540 - accuracy: 0.6251 - val_loss: 0.5809 - val_accuracy: 0.7400 - 1s/epoch - 38ms/step


  return dispatch_target(*args, **kwargs)


Epoch 2/1000
37/37 - 1s - loss: 0.5748 - accuracy: 0.7065 - val_loss: 0.4909 - val_accuracy: 0.8400 - 803ms/epoch - 22ms/step
Epoch 3/1000
37/37 - 1s - loss: 0.4957 - accuracy: 0.7651 - val_loss: 0.4382 - val_accuracy: 0.8200 - 832ms/epoch - 22ms/step
Epoch 4/1000
37/37 - 1s - loss: 0.4388 - accuracy: 0.7854 - val_loss: 0.4210 - val_accuracy: 0.8300 - 804ms/epoch - 22ms/step
Epoch 5/1000
37/37 - 1s - loss: 0.3865 - accuracy: 0.8151 - val_loss: 0.4202 - val_accuracy: 0.8400 - 810ms/epoch - 22ms/step
Epoch 6/1000
37/37 - 1s - loss: 0.3402 - accuracy: 0.8499 - val_loss: 0.3649 - val_accuracy: 0.8200 - 825ms/epoch - 22ms/step
Epoch 7/1000
37/37 - 1s - loss: 0.3063 - accuracy: 0.8668 - val_loss: 0.3839 - val_accuracy: 0.8700 - 816ms/epoch - 22ms/step
Epoch 8/1000
37/37 - 1s - loss: 0.2653 - accuracy: 0.8863 - val_loss: 0.3403 - val_accuracy: 0.8500 - 858ms/epoch - 23ms/step
Epoch 9/1000
37/37 - 1s - loss: 0.2386 - accuracy: 0.9033 - val_loss: 0.3460 - val_accuracy: 0.8700 - 831ms/epoch - 22

Epoch 68/1000
37/37 - 1s - loss: 0.0478 - accuracy: 0.9779 - val_loss: 0.2331 - val_accuracy: 0.9300 - 1s/epoch - 33ms/step
Epoch 69/1000
37/37 - 1s - loss: 0.0446 - accuracy: 0.9771 - val_loss: 0.2725 - val_accuracy: 0.9300 - 1s/epoch - 33ms/step
Epoch 70/1000
37/37 - 1s - loss: 0.0597 - accuracy: 0.9737 - val_loss: 0.2692 - val_accuracy: 0.9300 - 1s/epoch - 32ms/step
Epoch 71/1000
37/37 - 1s - loss: 0.0534 - accuracy: 0.9763 - val_loss: 0.2771 - val_accuracy: 0.9300 - 1s/epoch - 33ms/step
Epoch 72/1000
37/37 - 1s - loss: 0.0423 - accuracy: 0.9788 - val_loss: 0.3057 - val_accuracy: 0.9300 - 1s/epoch - 33ms/step
Epoch 73/1000
37/37 - 1s - loss: 0.0453 - accuracy: 0.9796 - val_loss: 0.3045 - val_accuracy: 0.9300 - 1s/epoch - 31ms/step
Epoch 74/1000
37/37 - 1s - loss: 0.0467 - accuracy: 0.9779 - val_loss: 0.2856 - val_accuracy: 0.9400 - 1s/epoch - 33ms/step
Epoch 75/1000
37/37 - 1s - loss: 0.0421 - accuracy: 0.9788 - val_loss: 0.3000 - val_accuracy: 0.9300 - 1s/epoch - 35ms/step
Epoch 76