In [1]:
import gc
import math
import os
import re
from collections import Counter

import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.callbacks import LearningRateScheduler

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
import matplotlib.pyplot as plt

In [4]:
# code adapted from https://www.kaggle.com/marknagelberg/rmsle-function
def rmsle(y_pred, y_test):
    """Return the root mean squared logarithmic error (RMSLE) of two value sets.

    Computes ``sqrt(mean((log(1 + y_pred) - log(1 + y_test)) ** 2))``. The
    formula is symmetric, so the argument order does not affect the result.

    Both inputs are first converted to flat float arrays. As a consequence
    plain lists are accepted, values are paired by position (pandas index
    labels are ignored), and an ``(n, 1)`` column is compared element-wise
    with an ``(n,)`` vector instead of being broadcast to an ``(n, n)`` matrix.

    Parameters
    ----------
    y_pred : array-like
        Predicted values.
    y_test : array-like
        Reference values; must hold as many values as ``y_pred``.

    Returns
    -------
    numpy.float64
        The RMSLE.

    Raises
    ------
    AssertionError
        If the two inputs do not contain the same number of values.
    """
    predicted = np.asarray(y_pred, dtype=float).ravel()
    actual = np.asarray(y_test, dtype=float).ravel()
    assert len(actual) == len(predicted)
    # log1p(x) == log(1 + x) but keeps precision when x is close to zero.
    return np.sqrt(np.mean((np.log1p(predicted) - np.log1p(actual)) ** 2))

In [5]:
# Counts, for every distinct token, how many rows of a dataframe column contain it
def word_freq(df, col):
    """Return the per-row occurrence count of every token in ``df[col]``.

    Each cell is converted with ``str()`` and split on single spaces. A token
    is counted at most once per row, so ``frequency`` is the number of rows
    that contain the token, not the total number of occurrences. Consecutive
    spaces produce an empty-string token, which is counted like any other.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    col : str
        Name of the column to tokenize.

    Returns
    -------
    pandas.DataFrame
        Columns ``unique_word`` and ``frequency``, sorted by descending
        frequency; ties are ordered alphabetically by token so that the row
        order is reproducible between runs.
    """
    row_counts = Counter()
    # Iterating the column directly avoids DataFrame.iterrows(), which builds
    # one Series per row and upcasts values to a common dtype across columns.
    for cell in df[col]:
        row_counts.update(set(str(cell).split(' ')))

    word_freq_df = pd.DataFrame(list(row_counts.items()),
                                columns=["unique_word", 'frequency'])
    return word_freq_df.sort_values(['frequency', 'unique_word'],
                                    ascending=[False, True])

# Read Data

In [6]:
# The data directory defaults to the original local download folder; set the
# MERCARI_DATA_DIR environment variable to load the CSV from another location.
DATA_DIR = os.environ.get(
    'MERCARI_DATA_DIR',
    '/Users/joashc/Downloads/mercari-price-suggestion-challenge')
clean_data = pd.read_csv(os.path.join(DATA_DIR, 'partially_clean_train_data.csv'))
clean_data.shape

(1482486, 13)

## Modeling Text
- stemmed_item_description
    - TF-IDF matrix
- clean_brand_name
    - One-hot-encode
- clean_category_name
    - One-hot-encode unique values if possible
- clean_item_name
    - TF-IDF matrix

### TF-IDF item_description

In [7]:
# Select the stemmed description text column; it is the input to the TF-IDF step below.
item_description_df = clean_data['stemmed_item_description']
item_description_df.shape

(1482486,)

In [8]:
# Value passed as TfidfVectorizer(max_features=...) for the item description text.
max_item_desc_features = 1500

In [9]:
# Fit TF-IDF on the description text and expand the result into a dense
# DataFrame whose columns are labelled 'item_desc_<position>'.
tfidf = TfidfVectorizer(max_features=max_item_desc_features)
x_tfidf = pd.DataFrame(
    tfidf.fit_transform(item_description_df).toarray()
)
x_tfidf.columns = [f'item_desc_{col}' for col in x_tfidf.columns]
print(x_tfidf.shape)
x_tfidf.head(2)

(1482486, 1500)


Unnamed: 0,item_desc_0,item_desc_1,item_desc_2,item_desc_3,item_desc_4,item_desc_5,item_desc_6,item_desc_7,item_desc_8,item_desc_9,...,item_desc_1490,item_desc_1491,item_desc_1492,item_desc_1493,item_desc_1494,item_desc_1495,item_desc_1496,item_desc_1497,item_desc_1498,item_desc_1499
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.714017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Attach the TF-IDF columns next to the existing ones, then discard the raw
# and stemmed description text they were derived from.
print(clean_data.shape)
clean_data_v2 = (
    pd.concat([clean_data, x_tfidf], axis=1)
    .drop(columns=['item_description', 'stemmed_item_description'])
)
print(clean_data_v2.shape)

(1482486, 13)
(1482486, 1511)
CPU times: user 1min 15s, sys: 1min 26s, total: 2min 41s
Wall time: 3min 23s


In [11]:
# Drop the references to the large intermediates and trigger a collection,
# then rebind the three names to empty frames so they stay defined.
del x_tfidf, item_description_df, clean_data
gc.collect()
x_tfidf = pd.DataFrame()
item_description_df = pd.DataFrame()
clean_data = pd.DataFrame()

### TF-IDF clean_item_name

In [12]:
# item_name_df = clean_data_v2['clean_item_name']
# item_name_df.shape

(1482486,)

In [36]:
# max_item_name_features = 100

In [37]:
# tfidf = TfidfVectorizer(max_features=max_item_name_features)
# item_name_tfidf = pd.DataFrame(tfidf.fit_transform(item_name_df).toarray())
# item_name_tfidf.columns = ['item_name_' + str(col) for col in item_name_tfidf.columns]
# print(item_name_tfidf.shape)
# item_name_tfidf.head(2)

(1482486, 100)


Unnamed: 0,item_name_0,item_name_1,item_name_2,item_name_3,item_name_4,item_name_5,item_name_6,item_name_7,item_name_8,item_name_9,...,item_name_90,item_name_91,item_name_92,item_name_93,item_name_94,item_name_95,item_name_96,item_name_97,item_name_98,item_name_99
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.554055,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# %%time
# print(clean_data_v4.shape)
# clean_data_v5 = pd.concat([clean_data_v4, item_name_tfidf], axis=1).drop(columns=['clean_item_name', 
#                                                                                          'name'])
# Disabled with the rest of this cell: clean_data_v5 is only created by the
# commented-out concat above, so a live print raises NameError.
# print(clean_data_v5.shape)

(1482486, 1837)
(1482486, 1935)
CPU times: user 41.3 s, sys: 1min 17s, total: 1min 58s
Wall time: 2min 24s


In [39]:
# # Delete dataframes from memory
# del [[item_name_tfidf,item_name_df, clean_data_v4]]
# gc.collect()
# clean_data_v4 = pd.DataFrame()
# item_name_df=pd.DataFrame()
# item_name_tfidf=pd.DataFrame()

### Drop Unwanted Columns

In [13]:
# Discard every column whose dtype is object.
clean_data_v2 = clean_data_v2.drop(
    columns=clean_data_v2.select_dtypes(include=object).columns
)
clean_data_v2.shape

(1482486, 1503)

In [16]:
# Remove item_condition_id and shipping from the frame.
clean_data_v2 = clean_data_v2.drop(['item_condition_id', 'shipping'], axis=1)
clean_data_v2.shape

(1482486, 1501)

# Model Implementation

In [17]:
# Hold out 15% of the rows as the test set. Features are every column except
# 'price'; the target is kept as a one-column DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    clean_data_v2.drop(columns=['price']).reset_index(drop=True),
    clean_data_v2[['price']].reset_index(drop=True),
    test_size=0.15,
    random_state=42,
)
print('Number of rows in train and validation data:', len(X_train), y_train.shape)
print('Number of rows in test data:', len(X_test), y_test.shape)

Number of rows in train and validation data: 1260113 (1260113, 1)
Number of rows in test data: 222373 (222373, 1)


In [18]:
# Renumber the shuffled training rows from zero, discarding the old index.
X_train, y_train = (
    part.reset_index(drop=True) for part in (X_train, y_train)
)

In [None]:
# Feature scaling is disabled here: the three scaler lines below are commented out,
# so no `scaler` object is created by this cell.
# scaler = MinMaxScaler(feature_range=(0, 1))
# X_train = scaler.fit_transform(X_train)
# print('MinMaxScaler Complete')

#### Model architecture parameters
# NOTE(review): none of these constants is referenced by the model cells visible in
# this notebook (layer widths are written inline there) — confirm before relying on them.
n_stocks = 500
n_neurons_1 = 1024
n_neurons_2 = 512
n_neurons_3 = 256
n_neurons_4 = 128
n_target = 1

In [19]:
# Carve a 15% validation set out of the training rows.
# Note: this rebinds X_train / y_train, so re-running the cell shrinks the
# training set again; re-run the earlier train/test split cell first.
X_train, X_val, y_train, y_val = train_test_split(X_train.reset_index(drop=True), 
                                                    y_train.reset_index(drop=True), 
                                                                  test_size=0.15, random_state=42)
# After this split X_train holds training rows only, so the label says "train".
print('Number of rows in train data:', X_train.shape[0], y_train.shape)
print('Number of rows in validation data:', X_val.shape[0], y_val.shape)

Number of rows in train and validation data: 1071096 (1071096, 1)
Number of rows in validation data: 189017 (189017, 1)


In [20]:
def step_decay(epoch):
    """Step-wise learning-rate schedule: start at 0.002 and halve periodically.

    The rate is ``0.002 * 0.5 ** floor((1 + epoch) / 20)``, i.e. it is halved
    every time ``1 + epoch`` reaches a multiple of 20.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index.

    Returns
    -------
    float
        Learning rate to use for that epoch.
    """
    initial_lrate = 0.002
    drop = 0.5
    epochs_drop = 20
    # Built-in floor division and ** are used so the function needs no
    # `math` import to run.
    completed_drops = (1 + epoch) // epochs_drop
    return initial_lrate * drop ** completed_drops

In [21]:
# Training hyper-parameters for this run.
num_epochs = 15
batch_size = 2213

all_val_predictions = pd.DataFrame()

train_val_rmsle = []

# Fully connected regressor: four ReLU hidden layers, each followed by batch
# normalisation, and a single linear output unit.
model = Sequential()
model.add(Dense(512, activation='relu', input_dim=X_train.shape[1]))
model.add(BatchNormalization())
# model.add(Dropout(0.8))

# NOTE(review): 264 may be a typo for 256 — confirm before changing, since it
# alters the architecture.
model.add(Dense(264, activation='relu'))
model.add(BatchNormalization())
# model.add(Dropout(0.5))

model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
# model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
# model.add(Dropout(0.5))

model.add(Dense(1))
#     model.add(Activation('sigmoid'))
model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error'])

# The scheduler only takes effect when it is handed to fit() through
# `callbacks`; without that, step_decay is never called and the optimizer
# keeps its default learning rate.
lrate = LearningRateScheduler(step_decay)

model_hist = model.fit(X_train, y_train, validation_data=(X_val,y_val), 
                       batch_size=batch_size, epochs=num_epochs, verbose=1,
                       callbacks=[lrate])



Train on 1071096 samples, validate on 189017 samples
Epoch 1/15

KeyboardInterrupt: 

In [None]:
# Deeper variant: a 1024-unit layer in front of the 512/264/128/64 stack, with
# batch normalisation after every hidden layer and no dropout.
# NOTE(review): this cell only builds the network; it must still be compiled
# and fitted before the evaluation cells reflect this architecture.
model = Sequential()
model.add(Dense(1024, activation='relu', input_dim=X_train.shape[1]))
model.add(BatchNormalization())

# Only the first layer of a Sequential model needs the input size; later
# layers take it from the previous layer's output.
model.add(Dense(512, activation='relu'))
model.add(BatchNormalization())
# model.add(Dropout(0.8))

model.add(Dense(264, activation='relu'))
model.add(BatchNormalization())
# model.add(Dropout(0.5))

model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
# model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
# model.add(Dropout(0.5))

model.add(Dense(1))

Epoch 1/15
2019-06-01 21:52:00.440682: I tensorflow/core/common_runtime/process_util.cc:69] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
1071096/1071096 [==============================] - 62s 58us/step - loss: 1575.1133 - mean_squared_logarithmic_error: 3.5816 - val_loss: 1127.1075 - val_mean_squared_logarithmic_error: 0.5566
Epoch 2/15
1071096/1071096 [==============================] - 58s 54us/step - loss: 1043.6532 - mean_squared_logarithmic_error: 0.4336 - val_loss: 1087.2679 - val_mean_squared_logarithmic_error: 0.4298
Epoch 3/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 901.0541 - mean_squared_logarithmic_error: 0.4288 - val_loss: 1053.5793 - val_mean_squared_logarithmic_error: 0.4423
Epoch 4/15
1071096/1071096 [==============================] - 58s 54us/step - loss: 782.7322 - mean_squared_logarithmic_error: 0.4164 - val_loss: 1047.5116 - val_mean_squared_logarithmic_error: 0.4374
Epoch 5/15
1071096/1071096 [==============================] - 58s 54us/step - loss: 682.4031 - mean_squared_logarithmic_error: 0.4012 - val_loss: 1040.5188 - val_mean_squared_logarithmic_error: 0.4202
Epoch 6/15
1071096/1071096 [==============================] - 58s 54us/step - loss: 610.8303 - mean_squared_logarithmic_error: 0.3871 - val_loss: 1057.9986 - val_mean_squared_logarithmic_error: 0.4210
Epoch 7/15
1071096/1071096 [==============================] - 58s 54us/step - loss: 549.4215 - mean_squared_logarithmic_error: 0.3753 - val_loss: 1055.0224 - val_mean_squared_logarithmic_error: 0.4625
Epoch 8/15
1071096/1071096 [==============================] - 58s 54us/step - loss: 520.1334 - mean_squared_logarithmic_error: 0.3707 - val_loss: 1070.6939 - val_mean_squared_logarithmic_error: 0.4277
Epoch 9/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 488.1673 - mean_squared_logarithmic_error: 0.3607 - val_loss: 1063.8773 - val_mean_squared_logarithmic_error: 0.4367
Epoch 10/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 460.5009 - mean_squared_logarithmic_error: 0.3528 - val_loss: 1089.4883 - val_mean_squared_logarithmic_error: 0.4295
Epoch 11/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 433.2903 - mean_squared_logarithmic_error: 0.3439 - val_loss: 1088.7680 - val_mean_squared_logarithmic_error: 0.4340
Epoch 12/15
1071096/1071096 [==============================] - 58s 54us/step - loss: 426.6830 - mean_squared_logarithmic_error: 0.3411 - val_loss: 1092.9512 - val_mean_squared_logarithmic_error: 0.4301
Epoch 13/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 403.5846 - mean_squared_logarithmic_error: 0.3323 - val_loss: 1106.4169 - val_mean_squared_logarithmic_error: 0.4234
Epoch 14/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 401.3351 - mean_squared_logarithmic_error: 0.3337 - val_loss: 1082.3636 - val_mean_squared_logarithmic_error: 0.4376
Epoch 15/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 387.6622 - mean_squared_logarithmic_error: 0.3279 - val_loss: 1104.6679 - val_mean_squared_logarithmic_error: 0.4299
{'val_loss': [1127.1074626043912, 1087.2678627735013, 1053.5792987709883, 1047.5116281083513, 1040.518787082659, 1057.9986405238083, 1055.022360304423, 1070.6938961554606, 1063.8773043713986, 1089.4883044629082, 1088.7680027868587, 1092.9512373824361, 1106.4168555640338, 1082.3635730870856, 1104.6678808366034], 'val_mean_squared_logarithmic_error': [0.5565659740499078, 0.4297962513817454, 0.4422843877662052, 0.43737004754706865, 0.420194410348369, 0.42095913532351426, 0.46250864383542617, 0.4277115894045907, 0.4366671346470959, 0.42945249548830833, 0.4339731699247617, 0.4301039080199287, 0.4233793237558755, 0.4375614984162814, 0.42992322702482433], 'loss': [1575.1133286284676, 1043.653200820877, 901.0541341271995, 782.7321535718904, 682.4030638488076, 610.8303089746302, 549.4214598600078, 520.1334156088888, 488.1673113337513, 460.5008624314031, 433.29030865372914, 426.68298521459775, 403.5845953510768, 401.33514786087136, 387.6621897323485], 'mean_squared_logarithmic_error': [3.5815836171978437, 0.4336171417652295, 0.4288192078297505, 0.41643600321520013, 0.40117916568333656, 0.3871199580829268, 0.3752621106127402, 0.370657310023544, 0.3606964625948292, 0.35277935667126714, 0.3438604436602765, 0.34114952042988345, 0.33234044059753115, 0.3336569687144336, 0.32790030767299455]}
Train error is: 21.96011656311792
Validation error is: 33.236544404140375

    
# num_epochs = 15
# batch_size = 2213
# Brand frequency: 100
# lr: 0.002
# test/train split: 0.15
# train/val split: 0.15

In [None]:
# Same 1024/512/264/128/64 stack, with batch normalisation and 50% dropout
# after every hidden layer.
# NOTE(review): this cell only builds the network; it must still be compiled
# and fitted before the evaluation cells reflect this architecture.
model = Sequential()
model.add(Dense(1024, activation='relu', input_dim=X_train.shape[1]))
model.add(BatchNormalization())
model.add(Dropout(0.5))

# Only the first layer of a Sequential model needs the input size; later
# layers take it from the previous layer's output.
model.add(Dense(512, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Dense(264, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Dense(1))

Epoch 1/15
2019-06-01 22:28:11.231741: I tensorflow/core/common_runtime/process_util.cc:69] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.
1071096/1071096 [==============================] - 61s 57us/step - loss: 1572.8784 - mean_squared_logarithmic_error: 3.4729 - val_loss: 1217.6416 - val_mean_squared_logarithmic_error: 0.9015
Epoch 2/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 1041.6045 - mean_squared_logarithmic_error: 0.4348 - val_loss: 1047.5824 - val_mean_squared_logarithmic_error: 0.4541
Epoch 3/15
1071096/1071096 [==============================] - 58s 54us/step - loss: 895.4890 - mean_squared_logarithmic_error: 0.4270 - val_loss: 1048.9667 - val_mean_squared_logarithmic_error: 0.4354
Epoch 4/15
1071096/1071096 [==============================] - 58s 54us/step - loss: 783.4882 - mean_squared_logarithmic_error: 0.4162 - val_loss: 1076.8767 - val_mean_squared_logarithmic_error: 0.4711
Epoch 5/15
1071096/1071096 [==============================] - 58s 54us/step - loss: 687.1998 - mean_squared_logarithmic_error: 0.4011 - val_loss: 1059.8252 - val_mean_squared_logarithmic_error: 0.4338
Epoch 6/15
1071096/1071096 [==============================] - 58s 54us/step - loss: 616.5953 - mean_squared_logarithmic_error: 0.3865 - val_loss: 1102.7695 - val_mean_squared_logarithmic_error: 0.4481
Epoch 7/15
1071096/1071096 [==============================] - 58s 54us/step - loss: 568.2803 - mean_squared_logarithmic_error: 0.3784 - val_loss: 1067.3536 - val_mean_squared_logarithmic_error: 0.4238
Epoch 8/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 520.4558 - mean_squared_logarithmic_error: 0.3723 - val_loss: 1083.0141 - val_mean_squared_logarithmic_error: 0.4252
Epoch 9/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 490.2047 - mean_squared_logarithmic_error: 0.3583 - val_loss: 1090.4227 - val_mean_squared_logarithmic_error: 0.4484
Epoch 10/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 475.4367 - mean_squared_logarithmic_error: 0.3576 - val_loss: 1084.0384 - val_mean_squared_logarithmic_error: 0.4213
Epoch 11/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 450.5099 - mean_squared_logarithmic_error: 0.3493 - val_loss: 1090.1185 - val_mean_squared_logarithmic_error: 0.4410
Epoch 12/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 420.6035 - mean_squared_logarithmic_error: 0.3392 - val_loss: 1088.9401 - val_mean_squared_logarithmic_error: 0.4247
Epoch 13/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 402.3568 - mean_squared_logarithmic_error: 0.3319 - val_loss: 1093.5605 - val_mean_squared_logarithmic_error: 0.4199
Epoch 14/15
1071096/1071096 [==============================] - 57s 53us/step - loss: 395.7446 - mean_squared_logarithmic_error: 0.3300 - val_loss: 1075.7294 - val_mean_squared_logarithmic_error: 0.4278
Epoch 15/15
1071096/1071096 [==============================] - 57s 54us/step - loss: 380.5325 - mean_squared_logarithmic_error: 0.3242 - val_loss: 1089.9974 - val_mean_squared_logarithmic_error: 0.4582
{'val_loss': [1217.6416338833594, 1047.5823814230514, 1048.9667078853502, 1076.8766861056768, 1059.8251634824612, 1102.7695363280557, 1067.353624158997, 1083.0141257887979, 1090.4227442875224, 1084.03840107145, 1090.1185013321362, 1088.940090125215, 1093.5605033395357, 1075.7294467749698, 1089.9973991865206], 'val_mean_squared_logarithmic_error': [0.9015497393156829, 0.4540567453312572, 0.4353959632427432, 0.47106237034087345, 0.43382145470214695, 0.4481305697343656, 0.423842325748674, 0.4252363223204157, 0.44844919863147775, 0.42133685137869387, 0.44104321300297566, 0.4246970767350278, 0.4199395750548852, 0.4278126349446862, 0.4582459891594587], 'loss': [1572.8783934025064, 1041.604502517444, 895.4889809983421, 783.4881842577598, 687.1998442333631, 616.595347463793, 568.280334914395, 520.4557500252855, 490.2046975656187, 475.43665048506546, 450.50989299543585, 420.6035281428809, 402.35682913674157, 395.7445517955127, 380.5324726923594], 'mean_squared_logarithmic_error': [3.4728754542720313, 0.4348138111515022, 0.4269675411510403, 0.41624336676188084, 0.4010561436988285, 0.3865401439532164, 0.3783819410077019, 0.37229918515822924, 0.3583430062584886, 0.3575915522988093, 0.34932087194219463, 0.33919731242183077, 0.3319113465872, 0.3299941059741716, 0.3241569667666879]}

Train error is: 23.154237108683564
Validation error is: 33.01510854257875

In [None]:
# Print the `history` attribute of the object returned by the most recent model.fit() call.
print(model_hist.history)

In [None]:
# Root mean squared error of the current model on each split.
print('Train error is:', np.sqrt(mean_squared_error(y_train,model.predict(X_train))))

print('Validation error is:', np.sqrt(mean_squared_error(y_val,model.predict(X_val))))

# The MinMaxScaler cell is commented out, so no `scaler` exists and the model
# is trained on unscaled features; the test features are therefore passed to
# the model unscaled, exactly like the train and validation features above.
print('Test error is:', np.sqrt(mean_squared_error(y_test,model.predict(X_test))))