In [172]:
import pandas as pd
import nltk
import numpy as np

#load inthe NTLK stopwords to remove articles, preposition and other words that are not actionable
from nltk.corpus import stopwords
# This allows to create individual objects from a bog of words
from nltk.tokenize import word_tokenize, sent_tokenize
# Lemmatizer helps to reduce words to the base form
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [173]:
df=pd.read_csv('summer-products-with-rating-and-performance_2020-08.csv')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/jasmineli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jasmineli/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jasmineli/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [174]:
def process_sentence(sentence):
    new_tokens = word_tokenize(sentence)
    new_tokens = [t.lower() for t in new_tokens]
    new_tokens =[t for t in new_tokens if t not in stopwords.words('english')]
    new_tokens = [t for t in new_tokens if t.isalpha()]
    lemmatizer = WordNetLemmatizer()
    new_tokens =[lemmatizer.lemmatize(t) for t in new_tokens]
    return "".join(new_tokens)

In [175]:
titles = df['title_orig'].tolist()
tokens = [process_sentence(t) for t in titles]
df['title_pre'] = tokens

In [176]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['title_pre'])

In [177]:
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
denselist = dense.tolist()
tfidf = pd.DataFrame(denselist, columns=feature_names)



In [178]:
print(tfidf.shape)

(1573, 1159)


In [179]:
# product color
def main_color(s):
    main_color = {"red":"red", "white":"white", "pink":"pink", "yellow":"yellow", "green":"green", "blue":"blue", "wine":"red", "burgundy":"red", "black":"black", "navy":"navy", "orange":"orange", 
    "rose":"pink", "gray":"gray", "grey":"gray", "purple":"purple", "violet":"purple", "army":"green", "leopard":"orange", "ivory":"white", 
    "brown":"brown", "coffee":"brown", "camel":"beige", "tan":"brown", "nude":"beige", "khaki":"khaki", "apricot":"yellow", "camouflage":"green", "jasper":"red"}  # ordered by importance
    for key, value in main_color.items():
        if key in s:
            return value
    return "others"
product_color = df["product_color"]
product_color = [s.lower() if type(s) is str else 'nan' for s in product_color]
product_color = [main_color(s) for s in product_color]
from matplotlib import colors
product_color = [(-0.1,-0.1,-0.1,-0.1) if s == "others" else colors.to_rgba(s) for s in product_color]

df['product_color_rgb'] = [np.array(t) for t in product_color]

# log prices
df['log_price'] = [np.log(p) for p in df["price"]]
df['log_retail_price'] = [np.log(p) for p in df["retail_price"]]

# log merchant rating count
df['log_merchant_rating_count'] = np.log(df['merchant_rating_count'])

# urgent text
df['urgent'] = [1 if s == "Quantité limitée !" else 0 for s in df["urgency_text"]]

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [180]:
data = df[["log_price", "log_retail_price", "uses_ad_boosts", "badges_count", "badge_local_product", 
           "badge_product_quality", "badge_fast_shipping", "urgent", "units_sold"]]
df2 = pd.concat([data, tfidf], axis=1)
label = [1 if sales > 200 else 0 for sales in data["units_sold"]]
df2['high_sale'] = label

In [181]:
rgb = df["product_color_rgb"]
rgb = np.stack(rgb.values, axis=0)
for i in range(4):
    df2["product_color_rgb"+str(i)] = rgb[:,i]
df2.replace([np.inf, -np.inf], np.nan, inplace=True)
print(df2.shape)

(1573, 1173)


In [182]:
# X = df2.loc[:, ~df2.columns.isin(['high_sale', 'units_sold'])]
X = tfidf
y = df2['high_sale']

In [183]:
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

from keras.layers import Input, Dense, Concatenate
from keras.models import Model
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.125, random_state=42)
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.14286, random_state=42)

In [184]:
print ("number of training examples = " + str(X_train.shape[0]))
print ("number of dev examples = " + str(X_dev.shape[0]))
print ("number of test examples = " + str(X_test.shape[0]))
print ("X_train shape: " + str(X_train.shape))
print ("Y_train shape: " + str(y_train.shape))
print ("X_dev shape: " + str(X_dev.shape))
print ("Y_dev shape: " + str(y_dev.shape))
print ("X_test shape: " + str(X_test.shape))
print ("Y_test shape: " + str(y_test.shape))

number of training examples = 1179
number of dev examples = 197
number of test examples = 197
X_train shape: (1179, 1159)
Y_train shape: (1179,)
X_dev shape: (197, 1159)
Y_dev shape: (197,)
X_test shape: (197, 1159)
Y_test shape: (197,)


In [185]:
# Defining hold out data for evaluation 
X_train = tf.expand_dims(X_train, axis=-1)
input_shape = X_train.shape[1:]

In [186]:
print(input_shape)

(1159, 1)


In [187]:
# from sklearn.linear_model import LogisticRegression

# classifier = LogisticRegression()
# classifier.fit(X_train, y_train)

# score = classifier.score(evals_X, evals_y)

# print("Accuracy:", score)

In [188]:
# define model
model = Sequential()
model.add(Conv1D(filters=24, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.1))
model.add(Flatten())
model.add(Dense(128, activation='relu', kernel_regularizer='l2'))
model.add(Dense(1, activation='sigmoid'))

In [189]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)

In [190]:
history = model.fit(
    X_train,
    y_train,
    epochs=1000,
    verbose=2,
    validation_data=(X_dev, y_dev),
    batch_size=32,
    callbacks=[es]
)
model.summary()

Epoch 1/1000
37/37 - 1s - loss: 1.1345 - accuracy: 0.5666 - val_loss: 0.7106 - val_accuracy: 0.6345 - 1s/epoch - 37ms/step


  return dispatch_target(*args, **kwargs)


Epoch 2/1000
37/37 - 1s - loss: 0.7039 - accuracy: 0.6081 - val_loss: 0.6873 - val_accuracy: 0.6345 - 732ms/epoch - 20ms/step
Epoch 3/1000
37/37 - 1s - loss: 0.6977 - accuracy: 0.5912 - val_loss: 0.6842 - val_accuracy: 0.6650 - 762ms/epoch - 21ms/step
Epoch 4/1000
37/37 - 1s - loss: 0.7061 - accuracy: 0.6073 - val_loss: 0.7069 - val_accuracy: 0.6447 - 769ms/epoch - 21ms/step
Epoch 5/1000
37/37 - 1s - loss: 0.7101 - accuracy: 0.6056 - val_loss: 0.6977 - val_accuracy: 0.6599 - 752ms/epoch - 20ms/step
Epoch 6/1000
37/37 - 1s - loss: 0.7160 - accuracy: 0.6192 - val_loss: 0.7306 - val_accuracy: 0.6548 - 790ms/epoch - 21ms/step
Epoch 7/1000
37/37 - 1s - loss: 0.7215 - accuracy: 0.6378 - val_loss: 0.7209 - val_accuracy: 0.6650 - 803ms/epoch - 22ms/step
Epoch 8/1000
37/37 - 1s - loss: 0.7210 - accuracy: 0.6446 - val_loss: 0.7251 - val_accuracy: 0.6447 - 758ms/epoch - 20ms/step
Epoch 9/1000
37/37 - 1s - loss: 0.7196 - accuracy: 0.6684 - val_loss: 0.7349 - val_accuracy: 0.6497 - 783ms/epoch - 21