# CNN with 3 categories, 2 features

In [31]:
import pandas as pd
import os

df_train_complete_modified = pd.read_csv('train_complete_v4.csv')
df_train_complete_modified.set_index('product_uid', inplace=True)
df_train_complete_modified = df_train_complete_modified.fillna('')

In [32]:
import tensorflow as tf
import tensorflow_hub as hub
from tqdm import tqdm
import numpy as np
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

words_search_term = [t.split(' ') for t in df_train_complete_modified['search_term']]
max_length_search_term = max(len(w) for w in words_search_term)
embeddings_search_term = [embed(w).numpy() for w in tqdm(words_search_term)]
missing_search_term = [max_length_search_term - e.shape[0] for e in embeddings_search_term]

padding = [0] * 512
for i, m in enumerate(tqdm(missing_search_term)):
    if m == 0:
        continue
    embeddings_search_term[i] = np.concatenate([embeddings_search_term[i], np.array([padding] * m)])

embeddings_search_term = np.array(embeddings_search_term)
print(embeddings_search_term.shape) #(74067, 10, 512)
#------------------------------------------------------------------------------------------------------------------------------
words_product_title = [t.split(' ') for t in df_train_complete_modified['product_title']]
max_length_product_title = max(len(w) for w in words_product_title)
embeddings_product_title = [embed(w).numpy() for w in tqdm(words_product_title)]
missing_product_title = [max_length_product_title - e.shape[0] for e in embeddings_product_title]

padding = [0] * 512
for i, m in enumerate(tqdm(missing_product_title)):
    if m == 0:
        continue
    embeddings_product_title[i] = np.concatenate([embeddings_product_title[i], np.array([padding] * m)])

embeddings_product_title = np.array(embeddings_product_title)
print(embeddings_product_title.shape) #(74067, 29, 512)
#------------------------------------------------------------------------------------------------------------------------------
#y = df_train_complete_modified['relevance'].round().astype(int).tolist()
y = df_train_complete_modified['relevance'].tolist()
print(len(y))

100%|██████████| 74067/74067 [02:25<00:00, 508.04it/s]
100%|██████████| 74067/74067 [00:20<00:00, 3572.53it/s]


(74067, 10, 512)


100%|██████████| 74067/74067 [02:57<00:00, 416.17it/s]
100%|██████████| 74067/74067 [00:52<00:00, 1406.30it/s]


(74067, 29, 512)
74067


In [5]:
from sklearn.model_selection import train_test_split

X_train_search_term, X_test_search_term, y_train_search_term, y_test_search_term = train_test_split(embeddings_search_term,
                                                                                                   y,
                                                                                                   test_size=0.1,
                                                                                                   random_state=42,
                                                                                                   #stratify=y
                                                                                                   )

X_train_search_term, X_val_search_term, y_train_search_term, y_val_search_term = train_test_split(X_train_search_term,
                                                                                                    y_train_search_term,
                                                                                                    test_size=0.2,
                                                                                                    random_state=42,
                                                                                                    #stratify=y_train_search_term
                                                                                                    )


X_train_product_title, X_test_product_title, y_train_product_title, y_test_product_title = train_test_split(embeddings_product_title,
                                                                                                           y,
                                                                                                           test_size=0.1,
                                                                                                           random_state=42,
                                                                                                           #stratify=y
                                                                                                           )

X_train_product_title, X_val_product_title, y_train_product_title, y_val_product_title = train_test_split(X_train_product_title,
                                                                                                           y_train_product_title,
                                                                                                           test_size=0.2,
                                                                                                           random_state=42,
                                                                                                           #stratify=y_train_product_title
                                                                                                           )

74067


In [12]:

# -------------------------------------------------------------------------------------------------------------
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras.layers import Input, Dense, Embedding, merge, MaxPooling1D, Dropout, Conv1D, concatenate, Reshape, Flatten, Dropout
from keras.utils import plot_model

epochs = 100
batch_size = 256
# Search term CNN
input_search_term = Input(shape=(max_length_search_term, 512))
conv1d_search_term = Conv1D(filters=32, kernel_size=7, activation='relu')(input_search_term)
maxpooling1d_search_term = MaxPooling1D(pool_size=2, strides=2)(conv1d_search_term)
conv1d_search_term_2 = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(maxpooling1d_search_term)
maxpooling1d_search_term_2 = MaxPooling1D(pool_size=2, strides=2)(conv1d_search_term_2)
dropout_search_term = (Dropout(0.5))(maxpooling1d_search_term_2)
flatten_search_term = Flatten()(dropout_search_term)

# Product title CNN
input_product_title = Input(shape=(max_length_product_title, 512))
conv1d_product_title = Conv1D(filters=32, kernel_size=7, activation='relu')(input_product_title)
maxpooling1d_product_title = MaxPooling1D(pool_size=2, strides=2)(conv1d_product_title)
conv1d_product_title_2 = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(maxpooling1d_product_title)
maxpooling1d_product_title_2 = MaxPooling1D(pool_size=2, strides=2)(conv1d_product_title_2)
dropout_product_title = (Dropout(0.5))(maxpooling1d_product_title_2)
flatten_product_title = Flatten()(dropout_product_title)

# concatenated model
concatenated_layers = concatenate([flatten_search_term, flatten_product_title])
model_concatenated = Dense(80, activation="relu")(concatenated_layers)
dropout = (Dropout(0.5))(concatenated_layers)
model_output = Dense(1, activation="linear")(dropout)

model = Model(inputs= [input_search_term, input_product_title], outputs=model_output)

checkpoint = ModelCheckpoint('./model_data/' + 'weights.{epoch:03d}-{val_mse:.4f}.hdf5',
                                 monitor='val_mse', verbose=1,
                                 save_best_only=True, mode='auto')

cb = EarlyStopping(monitor='val_mse',
                              min_delta=0,
                              patience=10,
                              verbose=1,
                              mode='auto')

model.compile(loss='mse', optimizer='adam', metrics=['mse'])

model.fit([X_train_search_term, X_train_product_title], y_train_search_term, epochs=epochs, batch_size=batch_size ,verbose=1, callbacks=[checkpoint, cb],
          validation_data=([X_val_search_term, X_val_product_title], y_val_search_term), class_weight="auto")

model.save_weights('./model_data/' + 'final weights')
model.save('./model_data/' + 'my_model.h5')

# -------------------------------------------------------------------------------------------------------------


Train on 53328 samples, validate on 13332 samples
Epoch 1/100

Epoch 00001: val_mse improved from inf to 0.27680, saving model to ./model_data/weights.001-0.2768.hdf5
Epoch 2/100

Epoch 00002: val_mse improved from 0.27680 to 0.27022, saving model to ./model_data/weights.002-0.2702.hdf5
Epoch 3/100

Epoch 00003: val_mse improved from 0.27022 to 0.26610, saving model to ./model_data/weights.003-0.2661.hdf5
Epoch 4/100

Epoch 00004: val_mse did not improve from 0.26610
Epoch 5/100

Epoch 00005: val_mse improved from 0.26610 to 0.25699, saving model to ./model_data/weights.005-0.2570.hdf5
Epoch 6/100

Epoch 00006: val_mse improved from 0.25699 to 0.25429, saving model to ./model_data/weights.006-0.2543.hdf5
Epoch 7/100

Epoch 00007: val_mse did not improve from 0.25429
Epoch 8/100

Epoch 00008: val_mse did not improve from 0.25429
Epoch 9/100

Epoch 00009: val_mse improved from 0.25429 to 0.25090, saving model to ./model_data/weights.009-0.2509.hdf5
Epoch 10/100

Epoch 00010: val_mse did 

In [18]:
from keras.models import load_model
import glob

# load model
#model = load_model('./model_data/' + "my_model.h5")
#model.compile(loss='mse', optimizer='adam', metrics=['mse'])
# find the file with the best weights
list_of_files = glob.glob('./model_data/' + 'weights.*.hdf5')
#youngest_file = get_youngest_file(list_of_files)
# load weights into model
model.load_weights('model_data/weights.012-0.2482.hdf5')

prediction = model.predict([X_test_search_term, X_test_product_title])
prediction = prediction.tolist()

# -------------------------------------------------------------------------------------------------------------
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix, mean_squared_error

print ("MSE:", mean_squared_error(y_test_search_term, prediction))
# print ("Precision:", precision_score(y_test, prediction, average ='micro'))
#print (classification_report(y_test_search_term, prediction))
# print (confusion_matrix(y_test, predicted))

MSE: 0.2485610057742038


In [30]:
prediction = [y[0] for y in prediction]
df_result = pd.DataFrame({'Relevance': y_test_search_term, 'Prediction':prediction})
mask = df_result.Relevance < 2
print ("MSE:", mean_squared_error(df_result[mask]['Relevance'], df_result[mask]['Prediction']))

MSE: 0.7402754686486568


# Pipeline

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.field]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.field]]

In [None]:
import nltk
def Tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    porter_stemmer=nltk.PorterStemmer()
    words = [porter_stemmer.stem(word) for word in words]
    return words

In [None]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('colext', TextSelector('Text')),
            ('tfidf', TfidfVectorizer(tokenizer=Tokenizer, stop_words=stop_words,
                     min_df=.0025, max_df=0.25, ngram_range=(1,3))),
            ('svd', TruncatedSVD(algorithm='randomized', n_components=300)), #for XGB
        ])),
        ('words', Pipeline([
            ('wordext', NumberSelector('TotalWords')),
            ('wscaler', StandardScaler()),
        ])),
    ])),
    ('clf', XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1)),
#    ('clf', RandomForestClassifier()),
    ])

In [None]:
classifier.fit(X_train, y_train)
preds = classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix

print ("Accuracy:", accuracy_score(y_test, preds))
print ("Precision:", precision_score(y_test, preds))
print (classification_report(y_test, preds))
print (confusion_matrix(y_test, preds))