In [1]:
import pandas as pd
import numpy as np
import re
import gc

In [2]:
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.callbacks import LearningRateScheduler

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
import matplotlib.pyplot as plt

In [4]:
# code copied from https://www.kaggle.com/marknagelberg/rmsle-function
def rmsle(y_pred, y_test) : 
    assert len(y_test) == len(y_pred)
    return np.sqrt(np.mean((np.log(1+y_pred) - np.log(1+y_test))**2))

In [5]:
# Gets the count of most frequent words give a dataframe
def word_freq(df, col):
    word_frequency = {}
    word_frequency_lst = []
    for index,row in df.iterrows(): 
        for w in list(set(str(row[col]).split(' '))):
            if w not in word_frequency:
                word_frequency[w] = 1
            else:
                word_frequency[w] += 1

    for key, value in word_frequency.items():
        temp = [key, value]
        word_frequency_lst.append(temp)
    word_freq_df = pd.DataFrame(word_frequency_lst, columns=["unique_word", 'frequency'])
    word_freq_df = word_freq_df.sort_values(['frequency'], ascending=False)
    return word_freq_df

# Read Data

In [6]:
clean_data = pd.read_csv(
    '/Users/joashc/Downloads/mercari-price-suggestion-challenge/partially_clean_train_data.csv')
clean_data.shape

(1482486, 13)

In [7]:
clean_data.head()

Unnamed: 0,name,item_condition_id,category_name,brand_name,price,shipping,item_description,stemmed_item_description,clean_brand_name,clean_category_name,clean_item_name,assigned_category,assigned_sub_category
0,MLB Cincinnati Reds T Shirt Size XL,3,Men Tops T-shirts,nobrandname,10.0,1,No description yet,descript yet,nobrandname,men top,mlb cincinnati red shirt size xl,Men,Tops
1,Razer BlackWidow Chroma Keyboard,3,Electronics Computers & Tablets Components & P...,Razer,52.0,0,This keyboard is in great condition and works ...,keyboard great condit work like came box port ...,razer,electron comput tablet compon part,razer blackwidow chroma keyboard,Electronics,Other
2,AVA-VIV Blouse,1,Women Tops & Blouses Blouse,Target,10.0,1,Adorable top with a hint of lace and a key hol...,ador top hint lace key hole back pale pink als...,target,women top blous blous,blous,Women,Tops & blouses
3,Leather Horse Statues,1,Home Home Décor Home Décor Accents,nobrandname,35.0,1,New with tags. Leather horses. Retail for [rm]...,new tag leather hors retail rm stand foot high...,nobrandname,home home d cor home d cor accent,leather hors statu,Home,Other
4,24K GOLD plated rose,1,Women Jewelry Necklaces,nobrandname,44.0,0,Complete with certificate of authenticity,complet certif authent,nobrandname,women jewelri necklac,gold plate rose,Women,Jewelry


### One-hot-encoding Brand Name

In [8]:
unique_brand_names = pd.DataFrame(clean_data.clean_brand_name.value_counts())
min_brand_freq = 100
print('There are', unique_brand_names[unique_brand_names['clean_brand_name']>=min_brand_freq].shape[0],
      'brand names that occur >=',min_brand_freq, 'times in the dataset.','Will one-hot-encode these brands only.')

There are 569 brand names that occur >= 100 times in the dataset. Will one-hot-encode these brands only.


In [9]:
clean_brand_name_df = clean_data['clean_brand_name']
clean_brand_name_df.shape

(1482486,)

In [10]:
clean_brand_name_df = pd.get_dummies(clean_brand_name_df)
clean_brand_name_df.shape

(1482486, 4782)

In [11]:
drop_brand_col_lst = []
keep_brand_col_lst = list(unique_brand_names[unique_brand_names['clean_brand_name']>=min_brand_freq].index.values)
keep_brand_col_lst.remove('nobrandname')

for col_name in clean_brand_name_df.columns:
    if col_name not in keep_brand_col_lst:
        drop_brand_col_lst.append(col_name)

clean_brand_name_df_v2 =clean_brand_name_df.drop(columns=drop_brand_col_lst)
clean_brand_name_df_v2.columns = ['brand_' + str(col) for col in clean_brand_name_df_v2.columns]
clean_brand_name_df_v2.shape

(1482486, 568)

In [12]:
clean_data_v2 = pd.concat([clean_data.reset_index(drop=True)
                                   , clean_brand_name_df_v2.reset_index(drop=True)],
                                  axis=1).drop(columns=['brand_name','clean_brand_name'])
clean_data_v2.shape

(1482486, 579)

Delete the variables from memory

In [13]:
del [[clean_brand_name_df,clean_brand_name_df_v2, clean_data]]
gc.collect()
clean_data = pd.DataFrame()
clean_brand_name_df=pd.DataFrame()
clean_brand_name_df_v2=pd.DataFrame()

### One-hot-encoding Assigned Category and Assigned Sub Category

In [14]:
clean_category_name_df = clean_data_v2[['assigned_category', 'assigned_sub_category']]
clean_category_name_df.shape

(1482486, 2)

In [15]:
clean_category_name_df.head()

Unnamed: 0,assigned_category,assigned_sub_category
0,Men,Tops
1,Electronics,Other
2,Women,Tops & blouses
3,Home,Other
4,Women,Jewelry


In [16]:
clean_category_name_df_v2 = pd.get_dummies(clean_category_name_df)
clean_category_name_df_v2.shape

(1482486, 119)

In [17]:
# concat with main dataset on clean_category_name
print(clean_data_v2.shape)
clean_data_v3 = pd.concat([clean_data_v2, clean_category_name_df_v2], axis=1)
print(clean_data_v3.shape)

(1482486, 579)
(1482486, 698)


In [18]:
# Delete dataframes from memory
del [[clean_category_name_df,clean_category_name_df_v2, clean_data_v2]]
gc.collect()
clean_data_v2 = pd.DataFrame()
clean_category_name_df=pd.DataFrame()
clean_category_name_df_v2=pd.DataFrame()

### One hot encode item condition

In [19]:
item_condition_df = clean_data_v3[['item_condition_id']]
item_condition_df.shape

(1482486, 1)

In [20]:
item_condition_df['item_condition_id'] = item_condition_df['item_condition_id'].astype(object)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
item_condition_df_v2 = pd.get_dummies(item_condition_df)
item_condition_df_v2.shape

(1482486, 5)

In [22]:
%%time
print(clean_data_v3.shape)
clean_data_v4 = pd.concat([clean_data_v3, item_condition_df_v2], axis=1)
print(clean_data_v4.shape)

(1482486, 698)
(1482486, 703)
CPU times: user 463 ms, sys: 298 ms, total: 761 ms
Wall time: 788 ms


In [23]:
# Delete dataframes from memory
del [[item_condition_df,item_condition_df_v2, clean_data_v3]]
gc.collect()
clean_data_v3 = pd.DataFrame()
item_condition_df_v2=pd.DataFrame()
item_condition_df=pd.DataFrame()

### Shipping
- Change value of 0 to -1

In [24]:
clean_data_v4['shipping'] = clean_data_v4['shipping'].replace([0], [-1])

### Drop Unwanted Columns

In [28]:
clean_data_v4 = clean_data_v4.drop(columns=['name', 'category_name', 'item_description', 'stemmed_item_description',
       'clean_category_name', 'clean_item_name', 'assigned_category',
       'assigned_sub_category'])
clean_data_v4.shape

(1482486, 695)

# Model Implementation

In [35]:
X_train, X_test, y_train, y_test = train_test_split(clean_data_v4.drop(columns=['price']).reset_index(drop=True), 
                                                    clean_data_v4[['price']].reset_index(drop=True), 
                                                                  test_size=0.1, random_state=42)
print('Number of rows in train and validation data:', X_train.shape[0], y_train.shape)
print('Number of rows in test data:', X_test.shape[0], y_test.shape)

Number of rows in train and validation data: 1185988 (1185988, 1)
Number of rows in test data: 296498 (296498, 1)


In [36]:
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)

In [37]:
scaler = MinMaxScaler(feature_range=(0, 1))
columns = X_train.columns
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns=columns)
print('MinMaxScaler Complete')

  return self.partial_fit(X, y)


MinMaxScaler Complete


#### Model architecture parameters
n_stocks = 500
n_neurons_1 = 1024
n_neurons_2 = 512
n_neurons_3 = 256
n_neurons_4 = 128
n_target = 1

In [38]:
X_train, X_val, y_train, y_val = train_test_split(X_train.reset_index(drop=True), 
                                                    y_train.reset_index(drop=True), 
                                                                  test_size=0.1, random_state=42)
print('Number of rows in train and validation data:', X_train.shape[0], y_train.shape)
print('Number of rows in validation data:', X_val.shape[0], y_val.shape)

Number of rows in train and validation data: 948790 (948790, 1)
Number of rows in validation data: 237198 (237198, 1)


In [39]:
def step_decay(epoch):
    initial_lrate = 0.002
    drop = 0.5
    epochs_drop = 20
    lrate = initial_lrate * math.pow(drop,
    math.floor((1+epoch)/epochs_drop))
    return lrate

In [40]:
num_epochs = 1
batch_size = 2213

all_val_predictions = pd.DataFrame()

train_val_rmsle = []

model = Sequential()
model.add(Dense(512, activation='relu', input_dim=X_train.shape[1]))
model.add(BatchNormalization())
# model.add(Dropout(0.8))

model.add(Dense(264, activation='relu'))
model.add(BatchNormalization())
# model.add(Dropout(0.5))

model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
# model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
# model.add(Dropout(0.5))

model.add(Dense(1))
#     model.add(Activation('sigmoid'))
model.compile(loss='mean_squared_error', optimizer='adam')

lrate = LearningRateScheduler(step_decay)

model_hist = model.fit(X_train, y_train, validation_data=(X_val,y_val), 
                       batch_size=batch_size, epochs=num_epochs, verbose=1)

# 1200813/1200813 [==============================] - 62s 52us/step - loss: 1482.9186 - val_loss: 1086.2720
# Epoch 2/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1094.8557 - val_loss: 1063.3914
# Epoch 3/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1088.3559 - val_loss: 1058.8141
# Epoch 4/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1084.5598 - val_loss: 1064.0263
# Epoch 5/15
# 1200813/1200813 [==============================] - 60s 50us/step - loss: 1083.5474 - val_loss: 1057.3300
# Epoch 6/15
# 1200813/1200813 [==============================] - 60s 50us/step - loss: 1081.2210 - val_loss: 1053.7492
# Epoch 7/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1080.9597 - val_loss: 1051.4364
# Epoch 8/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1079.8223 - val_loss: 1055.7563
# Epoch 9/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1078.7325 - val_loss: 1055.9923
# Epoch 10/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1076.4778 - val_loss: 1058.3496
# Epoch 11/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1076.6074 - val_loss: 1055.3251

Train on 948790 samples, validate on 237198 samples
Epoch 1/1


In [55]:
print('Train error is:', np.sqrt(mean_squared_error(y_train,model.predict(X_train))))

print('Validation error is:', np.sqrt(mean_squared_error(y_val,model.predict(X_val))))

print('Test error is:', np.sqrt(mean_squared_error(y_test,model.predict(scaler.transform(X_test)))))

Train error is: 26.90506422754862
Validation error is: 29.538023054786148


NameError: name 'scaler' is not defined

In [None]:
num_epochs = 1
batch_size = 2213

all_val_predictions = pd.DataFrame()

train_val_rmsle = []

model = Sequential()
model.add(Dense(512, activation='relu', input_dim=X_train.shape[1]))
model.add(BatchNormalization())
# model.add(Dropout(0.8))

model.add(Dense(264, activation='relu'))
model.add(BatchNormalization())
# model.add(Dropout(0.5))

model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
# model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
# model.add(Dropout(0.5))

model.add(Dense(1))
#     model.add(Activation('sigmoid'))
model.compile(loss='mean_squared_error', optimizer='adam')

# num_epochs = 15
# batch_size = 2213
# Brand frequency: 100
# lr: 0.002
# test/train split: 0.1
# test/val split: 0.1

# 1200813/1200813 [==============================] - 62s 52us/step - loss: 1482.9186 - val_loss: 1086.2720
# Epoch 2/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1094.8557 - val_loss: 1063.3914
# Epoch 3/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1088.3559 - val_loss: 1058.8141
# Epoch 4/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1084.5598 - val_loss: 1064.0263
# Epoch 5/15
# 1200813/1200813 [==============================] - 60s 50us/step - loss: 1083.5474 - val_loss: 1057.3300
# Epoch 6/15
# 1200813/1200813 [==============================] - 60s 50us/step - loss: 1081.2210 - val_loss: 1053.7492
# Epoch 7/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1080.9597 - val_loss: 1051.4364
# Epoch 8/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1079.8223 - val_loss: 1055.7563
# Epoch 9/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1078.7325 - val_loss: 1055.9923
# Epoch 10/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1076.4778 - val_loss: 1058.3496
# Epoch 11/15
# 1200813/1200813 [==============================] - 59s 49us/step - loss: 1076.6074 - val_loss: 1055.3251

- deleted it early because there was no improvement