# Load clean data

In [2]:
import pandas as pd

train = pd.read_csv('train_cleaned.csv')
train.fillna('',inplace=True)
display(train.head(2))
X_train, y_train = train.loc[:, train.columns != 'relevance'], train['relevance']

Unnamed: 0,id,product_uid,product_title,search_term,product_description,MFG Brand Name,Bullet02,Bullet03,Bullet04,Bullet01,...,Certifications and Listings,Bullet09,Assembled Height (in.),Assembled Width (in.),Assembled Depth (in.),Product Length (in.),Bullet10,Indoor/Outdoor,Bullet11,relevance
0,2,100001,simpson strong tie 12 gaug angl,angl bracket,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,,,,3.0
1,3,100001,simpson strong tie 12 gaug angl,bracket,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,,,,2.5


# CNN with 2 features

In [3]:
from sklearn.model_selection import train_test_split

X_train_search_term, X_val_search_term, y_train_search_term, y_val_search_term = train_test_split(X_train['search_term'].to_list(),
                                                                                                    y_train.to_list(),
                                                                                                    test_size=0.1,
                                                                                                    random_state=42,
                                                                                                    #stratify=y
                                                                                                    )


X_train_product_title, X_val_product_title, y_train_product_title, y_val_product_title = train_test_split(X_train['product_title'].to_list(),
                                                                                                            y_train.to_list(),
                                                                                                            test_size=0.1,
                                                                                                            random_state=42,
                                                                                                            #stratify=y
                                                                                                            )

In [4]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vectorizer_search_term = TextVectorization(max_tokens=20000, output_sequence_length=200)
vectorizer_search_term.adapt(X_train['search_term'].to_list())
voc_search_term = vectorizer_search_term.get_vocabulary()
word_index_search_term = dict(zip(voc_search_term, range(len(voc_search_term))))

vectorizer_product_title = TextVectorization(max_tokens=20000, output_sequence_length=200)
vectorizer_product_title.adapt(X_train['product_title'].to_list())
voc_product_title = vectorizer_product_title.get_vocabulary()
word_index_product_title = dict(zip(voc_product_title, range(len(voc_product_title))))

!wget http://nlp.stanford.edu/data/glove.6B.zip

In [5]:
from zipfile import ZipFile
import os

if "glove.6B.100d.txt" not in os.listdir("model_data/"):
    ZipFile("model_data/glove.6B.zip",'r').extractall('model_data/')

In [6]:
import numpy as np

def loadGloveModel(File):
    print("Loading Glove Model")
    f = open(File,'r',encoding='utf-8')
    gloveModel = {}
    for line in f:
        splitLines = line.split()
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    return gloveModel

glove_embeddings_index = loadGloveModel("model_data/glove.6B.100d.txt")

Loading Glove Model
400000  words loaded!


In [7]:
import numpy as np

def createEmbeddingMatrix(embeddings_index,word_index,voc):
    num_tokens = len(voc) + 2
    embedding_dim = 100
    hits = 0
    misses = 0

    # Prepare embedding matrix
    embedding_matrix = np.zeros((num_tokens, embedding_dim))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            # This includes the representation for "padding" and "OOV"
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1
    print("Converted %d words (%d misses)" % (hits, misses))
    return embedding_matrix

embedding_matrix_search_term = createEmbeddingMatrix(embeddings_index=glove_embeddings_index, word_index=word_index_search_term, voc=voc_search_term)
embedding_matrix_product_title = createEmbeddingMatrix(embeddings_index=glove_embeddings_index, word_index=word_index_product_title, voc=voc_product_title)



Converted 3460 words (650 misses)
Converted 9480 words (9034 misses)


In [8]:
X_train_search_term = vectorizer_search_term(X_train_search_term).numpy()
X_val_search_term = vectorizer_search_term(X_val_search_term).numpy()

X_train_product_title = vectorizer_product_title(X_train_product_title).numpy()
X_val_product_title = vectorizer_product_title(X_val_product_title).numpy()

y_train = np.array(y_train_search_term)
y_val = np.array(y_val_search_term)

In [9]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11263380009247252689
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 6959755424
locality {
  bus_id: 1
  links {
  }
}
incarnation: 16672806475566927587
physical_device_desc: "device: 0, name: GeForce RTX 2070 with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 7.5"
]


In [13]:
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Input, Dense, Embedding, MaxPooling1D, Dropout, Conv1D, concatenate, Reshape, Flatten, Dropout
from tensorflow.keras.initializers import Constant

num_tokens_search_term = len(voc_search_term) + 2
num_tokens_product_title = len(voc_product_title) + 2
embedding_dim = 100

epochs = 100
batch_size = 256

# Search term CNN
input_search_term = Input(shape=(200), dtype='int64')
embedding_layer = Embedding(input_dim=num_tokens_search_term, output_dim=embedding_dim, embeddings_initializer=Constant(embedding_matrix_search_term), trainable=False)(input_search_term)
conv1d_search_term = Conv1D(filters=32, kernel_size=7, activation='relu')(embedding_layer)
maxpooling1d_search_term = MaxPooling1D(pool_size=2, strides=2)(conv1d_search_term)
conv1d_search_term_2 = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(maxpooling1d_search_term)
maxpooling1d_search_term_2 = MaxPooling1D(pool_size=2, strides=2)(conv1d_search_term_2)
dropout_search_term = (Dropout(0.5))(maxpooling1d_search_term_2)
flatten_search_term = Flatten()(dropout_search_term)

# Product title CNN
input_product_title = Input(shape=(200),dtype="int64")
embedding_layer2 = Embedding(num_tokens_product_title, embedding_dim, embeddings_initializer=Constant(embedding_matrix_product_title), trainable=False)(input_product_title)
conv1d_product_title = Conv1D(filters=32, kernel_size=7, activation='relu')(embedding_layer2)
maxpooling1d_product_title = MaxPooling1D(pool_size=2, strides=2)(conv1d_product_title)
conv1d_product_title_2 = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same')(maxpooling1d_product_title)
maxpooling1d_product_title_2 = MaxPooling1D(pool_size=2, strides=2)(conv1d_product_title_2)
dropout_product_title = (Dropout(0.5))(maxpooling1d_product_title_2)
flatten_product_title = Flatten()(dropout_product_title)

# concatenated model
concatenated_layers = concatenate([flatten_search_term, flatten_product_title])
model_concatenated = Dense(80, activation="relu")(concatenated_layers)
dropout = (Dropout(0.5))(model_concatenated)
model_output = Dense(1, activation="linear")(dropout)

model = Model(inputs= [input_search_term, input_product_title], outputs=model_output)

checkpoint = ModelCheckpoint('./model_data/' + 'weights.{epoch:03d}-{val_mse:.4f}.hdf5',
                                 monitor='val_mse', verbose=1,
                                 save_best_only=True, mode='auto')

cb = EarlyStopping(monitor='val_mse',
                              min_delta=0,
                              patience=10,
                              verbose=1,
                              mode='auto')

model.compile(loss='mse', optimizer='adam', metrics=['mse'])

In [14]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 200, 100)     411200      input_3[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 200, 100)     1851600     input_4[0][0]                    
____________________________________________________________________________________________

In [15]:
import tensorflow as tf

# Run training on GPU
with tf.device('/device:GPU:0'):
    model.fit([X_train_search_term, X_train_product_title], y_train, epochs=epochs, batch_size=batch_size ,verbose=1, callbacks=[checkpoint, cb], validation_data=([X_val_search_term, X_val_product_title], y_val))

Epoch 1/100

Epoch 00001: val_mse improved from inf to 0.28406, saving model to ./model_data\weights.001-0.2841.hdf5
Epoch 2/100

Epoch 00002: val_mse did not improve from 0.28406
Epoch 3/100

Epoch 00003: val_mse improved from 0.28406 to 0.26261, saving model to ./model_data\weights.003-0.2626.hdf5
Epoch 4/100

Epoch 00004: val_mse did not improve from 0.26261
Epoch 5/100

Epoch 00005: val_mse improved from 0.26261 to 0.26146, saving model to ./model_data\weights.005-0.2615.hdf5
Epoch 6/100

Epoch 00006: val_mse improved from 0.26146 to 0.25179, saving model to ./model_data\weights.006-0.2518.hdf5
Epoch 7/100

Epoch 00007: val_mse improved from 0.25179 to 0.24975, saving model to ./model_data\weights.007-0.2498.hdf5
Epoch 8/100

Epoch 00008: val_mse did not improve from 0.24975
Epoch 9/100

Epoch 00009: val_mse did not improve from 0.24975
Epoch 10/100

Epoch 00010: val_mse improved from 0.24975 to 0.24815, saving model to ./model_data\weights.010-0.2481.hdf5
Epoch 11/100

Epoch 00011

In [16]:
model.save_weights('./model_data/' + 'final weights')
model.save('./model_data/' + 'my_model.h5')

In [5]:
import pandas as pd
from tensorflow.keras.models import load_model

test = pd.read_csv('test_cleaned.csv')
test.fillna('',inplace=True)
display(test.head(2))
X_test_search_term, X_test_product_title = test['search_term'].to_list(), test['product_title'].to_list()

X_test_search_term = vectorizer_search_term(X_test_search_term).numpy()
X_test_product_title = vectorizer_search_term(X_test_product_title).numpy()

model = load_model('./model_data/' + "my_model.h5")

predictions = model.predict([X_test_search_term, X_test_product_title], verbose=1)

Unnamed: 0,id,product_uid,product_title,search_term,product_description,MFG Brand Name,Bullet02,Bullet03,Bullet04,Bullet01,...,Bullet08,Certifications and Listings,Bullet09,Assembled Height (in.),Assembled Width (in.),Assembled Depth (in.),Product Length (in.),Bullet10,Indoor/Outdoor,Bullet11
0,1,100001,simpson strong tie 12 gaug angl,degre bracket,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,,,,
1,4,100001,simpson strong tie 12 gaug angl,metal bracket,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,,,,




In [9]:
predictions = predictions / predictions.max() * 3
print(predictions.max())
print(predictions.min())

3.0
1.2510374


In [10]:
test['relevance'] = predictions
display(test.head(5))

test[['id','relevance']].to_csv('submission.csv',index=False)

Unnamed: 0,id,product_uid,product_title,search_term,product_description,MFG Brand Name,Bullet02,Bullet03,Bullet04,Bullet01,...,Certifications and Listings,Bullet09,Assembled Height (in.),Assembled Width (in.),Assembled Depth (in.),Product Length (in.),Bullet10,Indoor/Outdoor,Bullet11,relevance
0,1,100001,simpson strong tie 12 gaug angl,degre bracket,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,,,,2.16201
1,4,100001,simpson strong tie 12 gaug angl,metal bracket,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,,,,2.158765
2,5,100001,simpson strong tie 12 gaug angl,simpson ski abl,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,,,,1.843243
3,6,100001,simpson strong tie 12 gaug angl,simpson strong tie,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,,,,2.319182
4,7,100001,simpson strong tie 12 gaug angl,simpson strong tie acc,"not only do angles make joints stronger , they...",simpson strong tie,stronger than angled nailing or screw fastenin...,help ensure joints are consistently straight a...,dimensions 3 in. x 3 in. x 1 1/2 in.,versatile connector for various 90 connections...,...,,,,,,,,,,2.344798


In [None]:
# import tensorflow as tf
# import tensorflow_hub as hub
# from tqdm import tqdm
# import numpy as np
# embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# words_search_term = [t.split(' ') for t in df_train_complete_modified['search_term']]
# max_length_search_term = max(len(w) for w in words_search_term)
# embeddings_search_term = [embed(w).numpy() for w in tqdm(words_search_term)]
# missing_search_term = [max_length_search_term - e.shape[0] for e in embeddings_search_term]

# # padding = [0] * 512
# # for i, m in enumerate(tqdm(missing_search_term)):
# #     if m == 0:
# #         continue
# #     embeddings_search_term[i] = np.concatenate([embeddings_search_term[i], np.array([padding] * m)])
# embeddings_search_term = pad_sequences(embeddings_search_term,maxlen=max_length_search_term,padding='post',value=0.0)

# embeddings_search_term = np.array(embeddings_search_term)
# print(embeddings_search_term.shape) #(74067, 10, 512)
# #------------------------------------------------------------------------------------------------------------------------------
# words_product_title = [t.split(' ') for t in df_train_complete_modified['product_title']]
# max_length_product_title = max(len(w) for w in words_product_title)
# embeddings_product_title = [embed(w).numpy() for w in tqdm(words_product_title)]
# missing_product_title = [max_length_product_title - e.shape[0] for e in embeddings_product_title]

# padding = [0] * 512
# for i, m in enumerate(tqdm(missing_product_title)):
#     if m == 0:
#         continue
#     embeddings_product_title[i] = np.concatenate([embeddings_product_title[i], np.array([padding] * m)])

# embeddings_product_title = np.array(embeddings_product_title)
# print(embeddings_product_title.shape) #(74067, 29, 512)
# #------------------------------------------------------------------------------------------------------------------------------
# #y = df_train_complete_modified['relevance'].round().astype(int).tolist()
# y = df_train_complete_modified['relevance'].tolist()
# print(len(y))