In [2]:
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Embedding, LSTM, Lambda, Dense, Dot, Dropout, Bidirectional, concatenate, Add, Subtract, Flatten, Multiply 
from keras.layers import MaxPooling1D, AveragePooling1D 
from keras.regularizers import l2 
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.metrics import MeanSquaredError, RootMeanSquaredError, MeanAbsoluteError 
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
import gensim 
import re

In [6]:
def load_data():
    product_des = pd.read_csv('./data/product_descriptions.csv/product_descriptions.csv')
    train = pd.read_csv('./data/train.csv/train.csv', encoding='ISO-8859-1')
    test = pd.read_csv('./data/test.csv.zip', encoding='ISO-8859-1')
    #test_sol = pd.read_csv('../input/test-solution-ass4/test_solution_ass4.csv')
    #test_sol = test_sol[test_sol['relevance']!=-1]
    #test_sol = test_sol.drop(['Usage'], axis=1)
    test = test.merge(product_des, on='product_uid', how='left')
    test = test.drop(['product_title'], axis=1)
    #test = test_sol.merge(test, on='id', how='left')
    train = train.merge(product_des, on='product_uid', how='left')
    train = train.drop(['product_title'], axis=1)
    return train, test

train, test = load_data()
train.head()

Unnamed: 0,id,product_uid,search_term,relevance,product_description
0,2,100001,angle bracket,3.0,"Not only do angles make joints stronger, they ..."
1,3,100001,l bracket,2.5,"Not only do angles make joints stronger, they ..."
2,9,100002,deck over,3.0,BEHR Premium Textured DECKOVER is an innovativ...
3,16,100005,rain shower head,2.33,Update your bathroom with the Delta Vero Singl...
4,17,100005,shower only faucet,2.67,Update your bathroom with the Delta Vero Singl...


In [7]:
def to_tokens(text):
    text = text.replace(' ', ',')
    text = text.replace('.', ',')
    text = text.replace('\n', ',')
    text = text.replace(';', ',')
    return text.split(',')

def split_search_description(train, test):
    train['search_term'] = [to_tokens(x) for x in train['search_term']]
    test['search_term'] = [to_tokens(x) for x in test['search_term']]
    train['product_description'] = [to_tokens(x) for x in train['product_description']]
    test['product_description'] = [to_tokens(x) for x in test['product_description']]
    return train, test

train, test = load_data()
train, test = split_search_description(train, test)
train.head()

Unnamed: 0,id,product_uid,search_term,relevance,product_description
0,2,100001,"[angle, bracket]",3.0,"[Not, only, do, angles, make, joints, stronger..."
1,3,100001,"[l, bracket]",2.5,"[Not, only, do, angles, make, joints, stronger..."
2,9,100002,"[deck, over]",3.0,"[BEHR, Premium, Textured, DECKOVER, is, an, in..."
3,16,100005,"[rain, shower, head]",2.33,"[Update, your, bathroom, with, the, Delta, Ver..."
4,17,100005,"[shower, only, faucet]",2.67,"[Update, your, bathroom, with, the, Delta, Ver..."


In [9]:
def create_texts(train, test):
    search_train = train['search_term'].to_numpy()
    search_test = test['search_term'].to_numpy()
    search_texts = np.concatenate([search_train,search_test])
    des_train = train['product_description'].to_numpy()
    des_test = test['product_description'].to_numpy()
    des_texts = np.concatenate([des_train,des_test])
    text = np.concatenate([search_texts,des_texts])
    return text

In [10]:
def create_tokonizer_word(texts):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    return tokenizer

train_word, test_word = load_data()
text = create_texts(train_word, test_word)
word_tk = create_tokonizer_word(text)
print('numeber of diffrent words in the texts: ', len(word_tk.word_index.keys()))

numeber of diffrent words in the texts:  275997


In [12]:
def train_test_from_tk(train, test, search_tk, des_tk):
    train['search_term'] = search_tk.texts_to_sequences(train['search_term'])
    train['product_description'] = des_tk.texts_to_sequences(train['product_description'])
    test['search_term'] = search_tk.texts_to_sequences(test['search_term'])
    test['product_description'] = des_tk.texts_to_sequences(test['product_description'])
    return train, test

In [13]:
def find_min_max_avg(A, B):
    s_min = -1
    s_max = -1
    s_sum = 0
    for a in A:
        size = len(a)
        s_sum += size
        if s_min == -1 or s_min > size:
            s_min = size
        if s_max == -1 or s_max < size:
            s_max = size
    for b in B:
        size = len(b)
        s_sum += size
        if s_min == -1 or s_min > size:
            s_min = size
        if s_max == -1 or s_max < size:
            s_max = size
    s_avg = int(s_sum / (len(A) + len(B)))
    return s_min, s_max, s_avg

In [14]:
def create_X_y_with_padding(train, test, maxlen_search, maxlen_des):
    X_train = train.copy().drop(['id','product_uid','relevance'], axis=1)
    X_train['search_term'] = [x for x in pad_sequences(train['search_term'], maxlen=maxlen_search, padding='post')]
    X_train['product_description'] = [x for x in pad_sequences(train['product_description'], maxlen=maxlen_des, padding='post')]
    y_train = train.copy()['relevance']
    
    X_test = test.copy().drop(['id','product_uid','relevance'], axis=1)
    X_test['search_term'] = [x for x in pad_sequences(test['search_term'], maxlen=maxlen_search, padding='post')]
    X_test['product_description'] = [x for x in pad_sequences(test['product_description'], maxlen=maxlen_des, padding='post')]
    y_test = test.copy()['relevance']
    
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)
    print('X train shape: ',X_train.shape, ' y train shape: ', y_train.shape)
    print('X val shape: ',X_val.shape, ' y val shape: ', y_val.shape)
    print('X test shape: ',X_test.shape, ' y test shape: ', y_test.shape)
    
    return X_train, y_train, X_val, y_val, X_test, y_test

In [15]:
train, test = train_test_from_tk(train, test, word_tk, word_tk)
des_min, des_max, des_avg = find_min_max_avg(train['product_description'],test['product_description'])
print('vector analyzes for product description: min=', des_min, ' ,max=',des_max, ' ,avg=', des_avg)

search_min, search_max, search_avg = find_min_max_avg(train['search_term'],test['search_term'])
print('vector analyzes for search term: min=', search_min, ' ,max=', search_max, ' ,avg=', search_avg)

maxlen_size = des_avg
if search_avg > des_avg:
    maxlen_size = search_avg 
X_train, y_train, X_val, y_val, X_test, y_test = create_X_y_with_padding(train, test, maxlen_size, maxlen_size)

vector analyzes for product description: min= 1  ,max= 869  ,avg= 128
vector analyzes for search term: min= 0  ,max= 13  ,avg= 2


KeyError: "['relevance'] not found in axis"