In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
from scipy import stats
from scipy.stats import norm, skew
import time
import random
import logging
import typing as tp
from pathlib import Path
from contextlib import contextmanager
sns.set()
%matplotlib inline
warnings.filterwarnings("ignore")

import lightgbm as lgb
import xgboost as xgb
import catboost as cat

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error, mean_squared_log_error

In [2]:
def rmse(predicted, actual):
    """Return the root-mean-squared error between ``predicted`` and ``actual``.

    The two inputs are subtracted element-wise, squared, averaged with the
    result's own ``.mean()`` method, and the square root of that mean is
    returned.
    """
    squared_error = (predicted - actual) ** 2
    return np.sqrt(squared_error.mean())

def split_cat(text):
    """Split a ``"a/b/c"`` category path into its levels.

    Parameters
    ----------
    text : str
        Slash-separated category string.

    Returns
    -------
    list of str
        ``text.split("/")`` when ``text`` supports it. An empty string
        therefore yields ``[""]``.
    tuple of str
        ``("No label", "No label", "No label")`` when ``text`` cannot be
        split (for example ``None`` or a float NaN).
    """
    try:
        return text.split("/")
    # Only swallow the errors a non-string value produces; a bare ``except``
    # would also hide KeyboardInterrupt and unrelated bugs.
    except (AttributeError, TypeError):
        return ("No label", "No label", "No label")

In [3]:
class TargetEncoder:
    """Smoothed target (mean) encoding for categorical columns.

    Each category is replaced by a blend of the category's mean target value
    and the global target mean (the prior). The blend weight is a sigmoid of
    the category count, controlled by ``min_samples_leaf`` and ``smoothing``.

    Parameters
    ----------
    cols : iterable of str
        Column names to encode.
    smoothing : float, default 1
        Divisor inside the sigmoid; larger values flatten the transition
        between prior and category mean.
    min_samples_leaf : int, default 1
        Category count at which prior and category mean are weighted equally.
    noise_level : float, default 0
        Standard deviation of the multiplicative Gaussian noise applied to
        the encoded values (``0`` disables noise).
    keep_original : bool, default False
        If true, write encodings to ``<col>_te`` and keep the original
        column; otherwise overwrite the original column.
    """

    def __repr__(self):
        return "TargetEncoder"

    def __init__(self, cols, smoothing=1, min_samples_leaf=1, noise_level=0, keep_original=False):
        self.cols = cols
        self.smoothing = smoothing
        self.min_samples_leaf = min_samples_leaf
        self.noise_level = noise_level
        self.keep_original = keep_original

    @staticmethod
    def add_nosie(series, noise_level):
        """Multiply ``series`` by ``1 + noise_level * N(0, 1)`` element-wise.

        The misspelled name is kept so existing callers continue to work;
        ``add_noise`` is provided as a correctly spelled alias.
        """
        return series * (1 + noise_level * np.random.randn(len(series)))

    # Correctly spelled alias for ``add_nosie``.
    add_noise = add_nosie

    def encode(self, train, test, target):
        """Target-encode every column in ``self.cols``.

        ``train`` and ``test`` are modified in place (columns are added or
        overwritten) and also returned as a ``(train, test)`` tuple.
        ``target`` is the Series of target values for the rows of ``train``.
        """
        for col in self.cols:
            out_col = col + "_te" if self.keep_original else col
            train[out_col], test[out_col] = self.encode_column(train[col], test[col], target)
        return train, test

    def encode_column(self, train_series, test_series, target):
        """Return the encoded ``(train, test)`` Series for a single column.

        Statistics are computed from ``train_series`` paired with ``target``
        (the two are aligned on their index by ``pd.concat``). Categories
        that are not present in the training data are encoded as the prior,
        i.e. the global target mean.
        """
        col = train_series.name
        # groupby needs a usable, non-clashing column name for the target.
        if target.name is None or target.name == col:
            target = target.rename("__target__")

        # Pair each training category with its target value. (Concatenating
        # the test column here would leave no target column to aggregate.)
        temp = pd.concat([train_series, target], axis=1)
        averages = temp.groupby(by=col)[target.name].agg(["mean", "count"])

        # Sigmoid weight in (0, 1): the larger the category count, the more
        # the category mean is trusted over the prior.
        smoothing = 1 / (1 + np.exp(-(averages["count"] - self.min_samples_leaf) / self.smoothing))
        prior = target.mean()

        averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
        averages.drop(["mean", "count"], axis=1, inplace=True)

        # Two-column lookup table: category -> smoothed average.
        lookup = averages.reset_index().rename(columns={target.name: "average"})

        def _apply(series):
            # Left-join keeps the row order of ``series``; categories without
            # a match fall back to the prior.
            encoded = pd.merge(series.to_frame(col), lookup, on=col, how="left")["average"]
            encoded = encoded.rename(col + "_mean").fillna(prior)
            # pd.merge returns a fresh index, so restore the caller's index.
            encoded.index = series.index
            return self.add_nosie(encoded, self.noise_level)

        return _apply(train_series), _apply(test_series)

In [4]:
def to_number(x):
    """Convert a digit-only token to an int clamped to at most 100.

    Returns ``0`` when ``x`` is not made up solely of digits, when it has no
    ``isdigit`` method (e.g. ``None`` or a float), or when ``int`` rejects it
    (some characters such as superscripts pass ``isdigit`` but are not valid
    integer literals).
    """
    try:
        if not x.isdigit():
            return 0
        return min(int(x), 100)
    # AttributeError: x is not string-like; ValueError: int() refused the text.
    except (AttributeError, ValueError):
        return 0
    
def sum_number(desc):
    """Sum the numeric tokens of a whitespace-separated description.

    Each token is converted with ``to_number``. Returns ``0`` when ``desc``
    is not a ``str``.
    """
    if not isinstance(desc, str):
        return 0
    # After the isinstance guard ``str.split`` cannot fail and ``to_number``
    # handles its own conversion errors, so no blanket ``except`` is needed.
    return sum(to_number(token) for token in desc.split())

In [5]:
# Import the corpus under an alias: the name ``stopwords`` is rebound to a
# dict below, and without the alias re-running this cell would call
# ``.words`` on that dict instead of on the nltk corpus.
from nltk.corpus import stopwords as nltk_stopwords
import re
import string
import time
# Reference point for the elapsed-time messages printed by later cells.
start_time = time.time()

# English stop words stored as dict keys for constant-time membership tests.
stopwords = {x: 1 for x in nltk_stopwords.words("english")}
# Runs of characters that are not ASCII letters or digits. The upper-case
# range must end at ``Z``: a single ``A-z`` range would also admit the six
# ASCII characters between ``Z`` and ``a`` ([ \ ] ^ _ `).
non_alphanums = re.compile(r'[^A-Za-z0-9]+')
# Runs of characters outside letters, digits, space and the listed
# punctuation. Raw string so the backslashes reach the regex engine as-is
# instead of being parsed as (invalid) string escapes.
non_alphanumpunct = re.compile(r"[^A-Za-z0-9\.?!, ; \(\)\[\]'\"\$]+")
# Alternation matching any single character of ``string.punctuation``.
RE_PUNCTUATION = '|'.join([re.escape(x) for x in string.punctuation])

def normalize_text(text):
    """Lower-case ``text`` and return its informative tokens joined by spaces.

    Runs matched by ``non_alphanums`` are replaced by a single space, the
    result is lower-cased, stripped and split on spaces. Tokens of length
    one or less and tokens found in ``stopwords`` are discarded.
    """
    tokens = non_alphanums.sub(' ', text).lower().strip().split(" ")
    kept = [token for token in tokens if not (len(token) <= 1 or token in stopwords)]
    return u" ".join(kept)

def clean_name(x):
    """Return the first word of ``x`` in lower case, or ``''`` if there is none.

    Runs matched by ``non_alphanums`` are treated as separators before the
    string is split on whitespace.
    """
    if not len(x):
        return ''
    words = non_alphanums.sub(' ', x).split()
    if not len(words):
        return ''
    return words[0].lower()

In [6]:
# Seconds elapsed since ``start_time`` was recorded.
print(time.time() - start_time)

0.0748295783996582


In [7]:
# Directory holding train.tsv / test.tsv. Set the MERCARI_INPUT_DIR
# environment variable to point elsewhere; the default is the location the
# notebook was originally written against.
DATA_DIR = Path(os.environ.get("MERCARI_INPUT_DIR", "/work/kaggle_practice/Mercari/input/"))
# The working directory is still switched so that any relative path used
# after this cell resolves exactly as before.
os.chdir(DATA_DIR)

# One set of reader options so train and test are parsed identically:
# two columns are read as categoricals and category_name is split into its
# "/"-separated levels by split_cat while parsing.
READ_KWARGS = dict(
    engine="c",
    dtype={"item_condition_id": "category", "shipping": "category"},
    converters={"category_name": split_cat},
)

train = pd.read_table("train.tsv", **READ_KWARGS)
test = pd.read_table("test.tsv", **READ_KWARGS)
print("Finished load data:{}".format(time.time() - start_time))

Finished load data:35.7831609249115


In [8]:
# Tag every row with its origin (1 = train, 0 = test) so the two frames can
# be told apart again after they are combined.
train["is_train"], test["is_train"] = 1, 0
print("Compiled train / test:{}".format(time.time() - start_time))
print("Train shape:", train.shape, "Test shape:", test.shape)

Compiled train / test:37.371657848358154
Train shape: (1482535, 9) Test shape: (693359, 8)


In [9]:
# Keep only listings whose price is not exactly 0, then renumber the rows
# from 0 so the index is contiguous again.
train = train[train["price"] != 0].reset_index(drop=True)
# The rows removed are the zero-priced ones, so the message says so.
print("Removed zero price:{}".format(time.time() - start_time))
print("Train shape:", train.shape, "Test shape:", test.shape)

Removed nonzero price:41.97385311126709
Train shape: (1481661, 9) Test shape: (693359, 8)


In [10]:
# Regression target: log(1 + price) of the training rows.
y = np.log1p(train.price)
nrow_train = len(train)

# Stack train on top of test. Each frame keeps its own index labels because
# ignore_index is not requested.
merge = pd.concat((train, test), axis=0)
# One-column frame holding the test ids.
submission = test.loc[:, ["test_id"]]
print("Compiled merge:{}".format(time.time() - start_time))
print("Merge shape:", merge.shape)

Compiled merge:45.75414729118347
Merge shape: (2175020, 10)


In [11]:
import gc
# The separate train/test frames are no longer referenced after the concat,
# so release them before continuing.
del train, test
# Identifier columns and the raw price are removed from the combined frame.
merge.drop(columns=["train_id", "test_id", "price"], inplace=True)
gc.collect()
print("Garbage collection:{}".format(time.time() - start_time))

Garbage collection:52.63427662849426


In [12]:
# category_name holds the sequence produced by split_cat; pull out its first
# three levels as separate categorical columns.
# Level 0: an empty string is relabelled "missing".
merge["gencat_name"] = merge["category_name"].str[0].replace("", "missing").astype("category")
# Levels 1 and 2: rows with fewer levels yield NaN, which becomes "missing".
merge["subcat1_name"] = merge["category_name"].str[1].fillna("missing").astype("category")
merge["subcat2_name"] = merge["category_name"].str[2].fillna("missing").astype("category")
merge.drop(columns="category_name", inplace=True)
print("Split categories completed:{}".format(time.time() - start_time))

Split categories completed:62.509881258010864


In [13]:
# Categorical columns need "missing" registered as a category before NaN can
# be filled with it.
merge["item_condition_id"] = merge["item_condition_id"].cat.add_categories(["missing"]).fillna("missing")
merge["shipping"] = merge["shipping"].cat.add_categories(["missing"]).fillna("missing")
# Assign the filled column back explicitly: calling fillna(inplace=True) on
# the object returned by merge[...] is a chained operation that is not
# guaranteed to update ``merge`` itself.
merge["item_description"] = merge["item_description"].fillna("missing")
merge["brand_name"] = merge["brand_name"].fillna("missing").astype("category")
print("Handle missing completed.:{}".format(time.time() - start_time))

Handle missing completed.:77.75830817222595


In [14]:
# First word of each listing name, lower-cased (see clean_name).
merge["name_first"] = merge["name"].map(clean_name)
print("FE 1/37:{}".format(time.time() - start_time))

FE 1/37:88.37726712226868


In [15]:
# For every row, how many rows share the same name_first value.
merge["name_first_count"] = merge["name_first"].groupby(merge["name_first"]).transform("count")
print("FE 2/37:{}".format(time.time() - start_time))

FE 2/37:90.57625770568848


In [16]:
# Frequency features: for each categorical column, the number of rows that
# share the row's value.
merge["gencat_name_count"] = merge["gencat_name"].groupby(merge["gencat_name"]).transform("count")
print("FE 3/37:{}".format(time.time() - start_time))
merge["subcat1_name_count"] = merge["subcat1_name"].groupby(merge["subcat1_name"]).transform("count")
print("FE 4/37:{}".format(time.time() - start_time))
merge["subcat2_name_count"] = merge["subcat2_name"].groupby(merge["subcat2_name"]).transform("count")
print("FE 5/37:{}".format(time.time() - start_time))
merge["brand_name_count"] = merge["brand_name"].groupby(merge["brand_name"]).transform("count")
print("FE 6/37:{}".format(time.time() - start_time))

FE 3/37:92.3014121055603
FE 4/37:92.43728756904602
FE 5/37:92.47120976448059
FE 6/37:92.49770474433899


In [17]:
# Number of ASCII lower-case / upper-case letters in the name and in the
# description of each listing.
merge["NameLower"] = merge["name"].str.count("[a-z]")
print("FE 7/37:{}".format(time.time() - start_time))
merge["DescriptionLower"] = merge["item_description"].str.count("[a-z]")
print("FE 8/37:{}".format(time.time() - start_time))
merge["NameUpper"] = merge["name"].str.count("[A-Z]")
print("FE 9/37:{}".format(time.time() - start_time))
merge["DescriptionUpper"] = merge["item_description"].str.count("[A-Z]")
print("FE 10/37:{}".format(time.time() - start_time))

FE 7/37:106.43829584121704
FE 8/37:165.35912609100342
FE 9/37:171.03732633590698
FE 10/37:187.1764430999756


In [18]:
# Character lengths of the name and the description.
merge["name_len"] = merge["name"].str.len()
print("FE 11/37:{}".format(time.time() - start_time))
merge["des_len"] = merge["item_description"].str.len()
print("FE 12/37:{}".format(time.time() - start_time))
# A ratio, as the column name says: name length divided by description
# length (a subtraction here would produce a difference instead).
merge["name_desc_len_ratio"] = merge["name_len"] / merge["des_len"]
print("FE 13/37:{}".format(time.time() - start_time))
# Whitespace-separated word counts, and words per 10 characters.
merge["desc_word_count"] = merge["item_description"].apply(lambda x: len(x.split()))
print("FE 14/37:{}".format(time.time() - start_time))
merge["mean_des"] = merge["item_description"].apply(lambda x: 0 if len(x) == 0 else float(len(x.split())) / len(x) * 10)
print("FE 15/37:{}".format(time.time() - start_time))
merge["name_word_count"] = merge["name"].apply(lambda x: len(x.split()))
print("FE 16/37:{}".format(time.time() - start_time))
merge["mean_name"] = merge["name"].apply(lambda x: 0 if len(x) == 0 else float(len(x.split())) / len(x) * 10)
print("FE 17/37:{}".format(time.time() - start_time))
# Average characters per word.
# NOTE(review): a text with zero words makes these divisions produce inf/NaN;
# confirm downstream steps tolerate that.
merge["desc_letters_per_word"] = merge["des_len"] / merge["desc_word_count"]
print("FE 18/37:{}".format(time.time() - start_time))
merge["name_letters_per_word"] = merge["name_len"] / merge["name_word_count"]
print("FE 19/37:{}".format(time.time() - start_time))
# Share of lower-/upper-case letters among all characters.
merge["NameLowerRatio"] = merge["NameLower"] / merge["name_len"]
print("FE 20/37:{}".format(time.time() - start_time))
merge["DescriptionLowerRatio"] = merge["DescriptionLower"] / merge["des_len"]
print("FE 21/37:{}".format(time.time() - start_time))
merge["NameUpperRatio"] = merge["NameUpper"] / merge["name_len"]
print("FE 22/37:{}".format(time.time() - start_time))
merge["DescriptionUpperRatio"] = merge["DescriptionUpper"] / merge["des_len"]
print("FE 23/37:{}".format(time.time() - start_time))

FE 11/37:192.06892848014832
FE 12/37:193.76055669784546
FE 13/37:193.78000116348267
FE 14/37:199.76943516731262
FE 15/37:206.85459876060486
FE 16/37:209.19546604156494
FE 17/37:212.16957783699036
FE 18/37:212.28700733184814
FE 19/37:212.2971212863922
FE 20/37:212.30767107009888
FE 21/37:212.33090829849243
FE 22/37:212.3795886039734
FE 23/37:212.48866724967957


In [19]:
# Punctuation characters (as matched by RE_PUNCTUATION) in the name and in
# the description, plus the same counts divided by the word count.
merge["NamePunctCount"] = merge["name"].str.count(RE_PUNCTUATION)
print("FE 24/37:{}".format(time.time() - start_time))
merge["DescriptionPunctCount"] = merge["item_description"].str.count(RE_PUNCTUATION)
print("FE 25/37:{}".format(time.time() - start_time))
merge["NamePunctCountRatio"] = merge["NamePunctCount"].div(merge["name_word_count"])
print("FE 26/37:{}".format(time.time() - start_time))
merge["DescriptionPunctCountRatio"] = merge["DescriptionPunctCount"].div(merge["desc_word_count"])
print("FE 27/37:{}".format(time.time() - start_time))

FE 24/37:222.44420886039734
FE 25/37:234.08341526985168
FE 26/37:234.12060236930847
FE 27/37:234.13272047042847


In [20]:
# ASCII digits in the name and in the description, plus the same counts
# divided by the word count.
merge["NameDigitCount"] = merge["name"].str.count("[0-9]")
print("FE 28/37:{}".format(time.time() - start_time))
merge["DescriptionDigitCount"] = merge["item_description"].str.count("[0-9]")
print("FE 29/37:{}".format(time.time() - start_time))
merge["NameDigitCountRatio"] = merge["NameDigitCount"].div(merge["name_word_count"])
print("FE 30/37:{}".format(time.time() - start_time))
merge["DescriptionDigitCountRatio"] = merge["DescriptionDigitCount"].div(merge["desc_word_count"])
print("FE 31/37:{}".format(time.time() - start_time))

FE 28/37:244.15404868125916
FE 29/37:253.99641227722168
FE 30/37:254.00394558906555
FE 31/37:254.01152348518372


In [21]:
# Share of description words that are stop words (compared case-sensitively,
# exactly as the words appear in the text).
merge["stopword_ratio_desc"] = merge["item_description"].apply(lambda x: len([w for w in x.split() if w in stopwords])) / merge["desc_word_count"]
print("FE 32/37:{}".format(time.time() - start_time))
# Sum of the numeric tokens in the description (see sum_number).
merge["num_sum"] = merge["item_description"].apply(sum_number)
print("FE 33/37:{}".format(time.time() - start_time))
# Number of runs of characters matched by non_alphanumpunct.
merge["weird_characters_desc"] = merge["item_description"].str.count(non_alphanumpunct)
print("FE 34/37:{}".format(time.time() - start_time))
merge["weird_characters_name"] = merge["name"].str.count(non_alphanumpunct)
print("FE 35/37:{}".format(time.time() - start_time))
# Count occurrences of the literal token "[rm]". str.count treats its
# argument as a regular expression, so the brackets must be escaped;
# unescaped, "[rm]" is a character class matching every single 'r' or 'm'.
merge["prices_count"] = merge["item_description"].str.count(r"\[rm\]")
print("FE 36/37:{}".format(time.time() - start_time))
# Whether the literal "[rm]" occurs in the *name*, as the column name
# states; the description is already covered by prices_count above.
merge["prices_in_name"] = merge["name"].str.contains("[rm]", regex=False).astype("int")
print("FE 37/37:{}".format(time.time() - start_time))

FE 32/37:269.2812809944153
FE 33/37:288.58040499687195
FE 34/37:301.6157257556915
FE 35/37:305.76182794570923
FE 36/37:319.7226526737213
FE 37/37:320.6909189224243


In [24]:
cols = set(merge.columns.values)
# Raw / categorical / bookkeeping columns that must not be rescaled.
basic_cols = {"name", "item_condition_id", "brand_name", "shipping", "item_description", "gencat_name",
             "subcat1_name", "subcat2_name", "name_first", "is_train"}

# Every remaining engineered feature is rescaled, except the 0/1 flag.
cols_to_normalize = cols - basic_cols - {"prices_in_name"}
other_cols = basic_cols | {"prices_in_name"}

# Select the columns in the frame's own order. Iterating a set of strings
# directly gives an order that can change between kernel sessions, which
# would make the resulting column layout irreproducible.
merge_to_normalize = merge[[c for c in merge.columns if c in cols_to_normalize]]
# Mean-centre each column and divide by its range (max - min).
merge_to_normalize = (merge_to_normalize - merge_to_normalize.mean()) / (merge_to_normalize.max() - merge_to_normalize.min())

print("FE Normalized:{}".format(time.time() - start_time))

FE Normalized:444.6609630584717


In [25]:
# Keep the non-normalized columns in the frame's own column order (a set of
# strings has no stable iteration order across kernel sessions), then attach
# the normalized features side by side.
merge = merge[[c for c in merge.columns if c in other_cols]]
merge = pd.concat([merge, merge_to_normalize], axis=1)

print("FE Merged:{}".format(time.time() - start_time))

FE Merged:481.3497984409332


In [26]:
# The normalized features now live in ``merge``; release the temporary frame.
del merge_to_normalize
gc.collect()
print("Garbage collection:{}".format(time.time() - start_time))

Garbage collection:552.5701448917389


In [28]:
# True: train on every training row and predict the real test set.
# False: hold out 20% of the training rows and use them in place of the
# test set.
SUBMIT_MODE = True

# Split the combined frame back into its two origins via the is_train flag.
df_test = merge[merge["is_train"] == 0]
df_train = merge[merge["is_train"] == 1]
del merge
gc.collect()
# The flag has served its purpose and is removed from both frames.
df_test = df_test.drop(columns=["is_train"])
df_train = df_train.drop(columns=["is_train"])

if not SUBMIT_MODE:
    df_train, df_test, y_train, y_test = train_test_split(df_train, y, test_size=0.2, random_state=0)
else:
    y_train = y
    del y
    gc.collect()

print("Splittin completed:{}".format(time.time() - start_time))

Splittin completed:881.2715878486633
