In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pickle
import time
import os.path
import gc
import matplotlib.pyplot as plt
import string
from scipy.sparse import csr_matrix, hstack
import sys
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb

#Add https://www.kaggle.com/anttip/wordbatch to your kernel Data Sources, 
#until Kaggle admins fix the wordbatch pip package installation
sys.path.insert(0, '../input/wordbatch/wordbatch/')
import wordbatch
from wordbatch.extractors import WordBag
from nltk.corpus import stopwords
import re

NAME_MIN_DF = 10
MAX_FEATURES_ITEM_DESCRIPTION = 1200000

# from subprocess import check_output
# print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.
def split_cat(text):
    """Split a ``"general/sub1/sub2"`` category path into exactly three levels.

    Parameters
    ----------
    text : str or any non-string placeholder (e.g. NaN / None)
        Category path using ``/`` as the level separator.

    Returns
    -------
    list of str
        The first three levels of ``text``; paths with fewer than three levels
        are padded with ``"No Label"``.
    tuple of str
        ``("No Label", "No Label", "No Label")`` when ``text`` is not a string.
    """
    placeholder = "No Label"
    try:
        levels = text.split("/")
    except AttributeError:
        # Non-string input (NaN, None, ...) has no .split(): treat it as missing.
        # Only AttributeError is caught so unrelated failures are not hidden.
        return (placeholder, placeholder, placeholder)
    # Always hand back three levels so the result can be unpacked into three
    # columns even when a path is shorter or deeper than three levels.
    return (levels + [placeholder] * 3)[:3]
    
def handle_missing_inplace(dataset):
    """Replace missing values in the category, brand and description columns.

    ``dataset`` is modified in place; nothing is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns ``general_cat``, ``subcat_1``, ``subcat_2``,
        ``brand_name`` and ``item_description``.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    fill_values = (
        ('general_cat', 'No Label'),
        ('subcat_1', 'No Label'),
        ('subcat_2', 'No Label'),
        ('brand_name', 'missing'),
        ('item_description', 'No description yet'),
    )
    for column, placeholder in fill_values:
        # Assign the filled column back on the frame itself instead of calling
        # fillna(inplace=True) on `dataset[column]`: the latter mutates an
        # intermediate object and is not guaranteed to reach `dataset`.
        dataset[column] = dataset[column].fillna(placeholder)


def cutting(dataset):
    """Collapse brand/category values that occur exactly once into the placeholder.

    For each of ``brand_name``, ``general_cat``, ``subcat_1`` and ``subcat_2``
    the value counts are taken, the column's placeholder is excluded, and every
    value whose count equals 1 is dropped from the "keep" set. Rows whose value
    is not in the keep set are overwritten with the placeholder
    (``'missing'`` for brands, ``'No Label'`` for categories).

    ``dataset`` is modified in place; nothing is returned.
    """
    column_placeholders = (
        ('brand_name', 'missing'),
        ('general_cat', 'No Label'),
        ('subcat_1', 'No Label'),
        ('subcat_2', 'No Label'),
    )

    # First pass: work out which values survive, before any column is rewritten.
    retained = {}
    for column, placeholder in column_placeholders:
        frequencies = dataset[column].value_counts()
        frequencies = frequencies[frequencies.index != placeholder]
        retained[column] = frequencies[frequencies != 1].index

    # Second pass: anything outside the surviving set becomes the placeholder.
    for column, placeholder in column_placeholders:
        outside_keep_set = ~dataset[column].isin(retained[column])
        dataset.loc[outside_keep_set, column] = placeholder
    
def to_categorical(dataset):
    """Cast the three category-level columns and ``item_condition_id`` to
    pandas ``category`` dtype.

    ``dataset`` is modified in place; nothing is returned.
    """
    for column in ('general_cat', 'subcat_1', 'subcat_2', 'item_condition_id'):
        dataset[column] = dataset[column].astype('category')
    
def rmsle(y, y_pred):
    """Root mean squared logarithmic error between ``y`` and ``y_pred``.

    Both inputs are converted to arrays and flattened to one dimension.

    Raises
    ------
    AssertionError
        If the flattened inputs differ in length.
    """
    actual = np.array(y).reshape(-1)
    predicted = np.array(y_pred).reshape(-1)
    assert len(actual) == len(predicted)
    # Difference of log(1 + value) between prediction and truth, element-wise.
    log_gap = np.log(predicted + 1) - np.log(actual + 1)
    squared_gap = log_gap ** 2
    return squared_gap.mean() ** 0.5
#Source: https://www.kaggle.com/marknagelberg/rmsle-function

# Define helpers for text normalization
# `stopwords` enters this cell as the nltk corpus reader imported above and is
# rebound to a word -> 1 lookup table. The guard keeps the cell re-runnable:
# once rebound, the dict has no `.words()` and a second run would otherwise fail.
if not isinstance(stopwords, dict):
    stopwords = {x: 1 for x in stopwords.words('english')}
non_alphanums = re.compile(u'[^A-Za-z0-9]+')
def normalize_text(text):
    """Lower-case ``text`` and strip it down to informative alphanumeric tokens.

    Every run of characters outside ``[A-Za-z0-9]`` becomes a single space; the
    result is lower-cased, trimmed and split on spaces. Tokens of length 1 (or
    empty) and tokens present in ``stopwords`` are dropped.

    Returns the remaining tokens joined by single spaces.
    """
    tokens = non_alphanums.sub(' ', text).lower().strip().split(" ")
    return u" ".join(
        [token for token in tokens if len(token) > 1 and token not in stopwords])

In [None]:
# sorted(merge['subcat_2'].unique())'description' in 
# merge[merge.item_description.str.contains('No')]['item_description'].unique()

In [None]:
# plt.figure(figsize=(20, 15))
# bins=50
# # plt.hist(train2['price'], bins, normed=True, range=[0,250],
# #          alpha=0.6, label='price when shipping==1')
# plt.hist(train2[(train2.price < 1.0)]['price'], bins, normed=True, range=[0,1],
#          alpha=0.6, label='price when shipping==0')
# plt.title('Train price over shipping type distribution', fontsize=15)
# plt.xlabel('Price', fontsize=15)
# plt.ylabel('Normalized Samples', fontsize=15)
# plt.legend(fontsize=15)
# plt.xticks(fontsize=15)
# plt.yticks(fontsize=15)
# plt.show()

In [None]:
# print(merge[merge.subcat_1 == 'Artwork']['general_cat'].unique())
# print(merge[merge.subcat_1 == 'Art']['general_cat'].unique())
# print(merge[merge.subcat_1 == 'Artwork']['subcat_2'].unique())
# print(merge[merge.subcat_1 == 'Art']['subcat_2'].unique())

In [None]:
# Build the sparse feature matrices X / X_test and the log-price target y, or
# reload them from the on-disk cache when an earlier run already produced it.
# The cache is read from and written to the SAME path so that it is actually hit.
CACHE_PATH = "data.pkl"

# Started on both paths: later cells print elapsed time relative to it.
start_time = time.time()

if os.path.exists(CACHE_PATH):
    print("Loading saved dataset, data.pkl.")
    # NOTE(review): pickle.load can execute arbitrary code; only load a cache
    # file that this notebook wrote itself.
    with open(CACHE_PATH, "rb") as cache_file:
        X,X_test,y = pickle.load(cache_file)
    # The cache holds only (X, X_test, y); the submission cell still needs the
    # test ids, so read just that column.
    submission: pd.DataFrame = pd.read_table('../input/test.tsv', engine='c', usecols=['test_id'])
else:
    train = pd.read_table('../input/train.tsv', engine='c')
    test = pd.read_table('../input/test.tsv', engine='c')
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)

    # `merge` below is laid out as [kept train rows, low-price train rows, test rows],
    # so the test rows start at the ORIGINAL train row count, taken here before
    # the low-price rows are dropped from `train`.
    nrow_test = train.shape[0]
    # Rows priced below 0.1 are excluded from the training target; they are still
    # appended to `merge` (without their price), so they take part in fitting the
    # vectorizers but end up in neither X nor X_test.
    dftt = train[(train.price < 0.1)]
    train = train.drop(train[(train.price < 0.1)].index)
    del dftt['price']
    nrow_train = train.shape[0]

    y_original = train["price"] # save down original value
    # price_scaler is only referenced by the commented-out MinMax scaling experiment.
    price_scaler = MinMaxScaler(feature_range=(-1, 1))
    train_price_log = np.log1p(train["price"]) # same as np.log(train['price'] + 1)

    y = train_price_log
    # y = price_scaler.fit_transform(train_price_log.values.reshape(-1, 1))

    merge: pd.DataFrame = pd.concat([train, dftt, test])
    # .copy() so that adding the `price` column later writes to an independent
    # frame rather than to a slice of `test`.
    submission: pd.DataFrame = test[['test_id']].copy()

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
    zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    # Hashed bag of words over the normalized item name.
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0],
                                                                  "hash_size": 2 ** 29, "norm": None, "tf": 'binary',
                                                                  "idf": None,
                                                                  }), procs=8)
    wb.dictionary_freeze= True
    X_name = wb.fit_transform(merge['name'])
    del(wb)
    print('[{}] Count vectorize `name` completed.'.format(time.time() - start_time))

    # The same CountVectorizer instance is re-fitted for each category level;
    # only the returned matrices are kept.
    cv = CountVectorizer()
    X_category1 = cv.fit_transform(merge['general_cat'])
    X_category2 = cv.fit_transform(merge['subcat_1'])
    X_category3 = cv.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    tv = TfidfVectorizer(max_features=MAX_FEATURES_ITEM_DESCRIPTION,
                         ngram_range=(1, 2),
                         stop_words='english')
    X_description = tv.fit_transform(merge['item_description'])
    print('[{}] TFIDF vectorize `item_description` completed.'.format(time.time() - start_time))

    # The TF-IDF matrix itself is NOT stacked into the final features; only the
    # output of the k-means fit_transform on it (one column per cluster) is.
    num_clusters = 30 # need to be selected wisely
    kmeans_model = MiniBatchKMeans(n_clusters=num_clusters,
                                   init='k-means++',
                                   n_init=1,
                                   init_size=1000, batch_size=1000, verbose=0, max_iter=1000)
    X_description_kmeans = kmeans_model.fit_transform(X_description)
    print('[{}] K means clustering  `X_description_kmeans` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))

    print (X_dummies.shape, X_description_kmeans.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape, X_name.shape)
    sparse_merge = hstack((X_dummies, X_description_kmeans, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))
    del X_dummies, merge, X_description, X_brand, X_category1, X_category2, X_category3, X_name, X_description_kmeans
    gc.collect()

    # Kept train rows come first; test rows start after ALL original train rows.
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]

    # Write the cache to the path checked at the top of this cell.
    # HIGHEST_PROTOCOL is requested because the matrices may be large.
    with open(CACHE_PATH, "wb") as cache_file:
        pickle.dump((X,X_test,y), cache_file, protocol=pickle.HIGHEST_PROTOCOL)
    
# train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.1, random_state = 144)

In [None]:
#  create a dictionary mapping the tokens to their tfidf values

# tfidf = dict(zip(tv.get_feature_names(), tv.idf_))
# tfidf = pd.DataFrame(columns=['tfidf']).from_dict(
#                     dict(tfidf), orient='index')
# tfidf.columns = ['tfidf']
# tfidf.sort_values(by=['tfidf'], ascending=True).tail(10)

In [None]:
# # Ridge 1
# model = Ridge(alpha=.6, copy_X=True, fit_intercept=True, max_iter=100,
# normalize=False, random_state=101, solver='auto', tol=0.01)
# model.fit(X, y)
# print('[{}] Train ridge completed'.format(time.time() - start_time))
# predsR_train = model.predict(X=X)
# # predsR_valid = model.predict(X=valid_X)
# predsR = model.predict(X=X_test)
# print('[{}] Predict ridge completed'.format(time.time() - start_time))

In [None]:
# # Ridge 2
# model = Ridge(solver='sag', fit_intercept=True)
# model.fit(X, y)
# print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
# predsR2_train = model.predict(X=X)
# # predsR2_valid = model.predict(X=valid_X)
# predsR2 = model.predict(X=X_test)
# print('[{}] Predict ridge v2 completed'.format(time.time() - start_time))

In [None]:
# # LGBM 1
# # train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size = 0.1, random_state = 144) 
# d_train = lgb.Dataset(train_X, label=train_y.reshape(-1))
# d_valid = lgb.Dataset(valid_X, label=valid_y.reshape(-1))
# watchlist = [d_train, d_valid]

# params = {
#     'learning_rate': 0.7,
#     'application': 'regression',
#     'max_depth': 3,
#     'num_leaves': 60,
#     'verbosity': -1,
#     'metric': 'RMSE',
#     'data_random_seed': 1,
#     'reg_alpha': 1,
#     'min_split_gain': 0.5, 
#     'min_child_weight': 1, 
#     'min_child_samples': 10, 
#     'scale_pos_weight': 1,
#     'reg_lambda': 0.001,
#     'bagging_fraction': 0.8,
#     'nthread': 4
# }
# model = lgb.train(params, train_set=d_train, num_boost_round=7500, valid_sets=watchlist, \
# early_stopping_rounds=1000, verbose_eval=1000)
# predsL_train = model.predict(X)
# # predsL_valid = model.predict(valid_X)
# predsL = model.predict(X_test)

# print('[{}] Predict lgb 1 completed.'.format(time.time() - start_time))

In [None]:
# LGBM 2
# Hold out 10% of the training rows to monitor the boosting rounds.
train_X2, valid_X2, train_y2, valid_y2 = train_test_split(X, y, test_size = 0.1, random_state = 101)
# np.asarray(...) flattens the target whether it is a pandas Series or an ndarray
# (a Series has no usable .reshape of its own).
d_train2 = lgb.Dataset(train_X2, label=np.asarray(train_y2).reshape(-1))
# The validation split gets its own Dataset; binding it to `d_train2` would
# make the model train on the 10% hold-out and validate against itself.
d_valid2 = lgb.Dataset(valid_X2, label=np.asarray(valid_y2).reshape(-1))
watchlist2 = [d_train2, d_valid2]

params2 = {
    'learning_rate': 0.85,
    'application': 'regression',
    # NOTE(review): a depth-3 tree cannot reach 130 leaves, so `num_leaves`
    # looks ineffective next to `max_depth` — confirm the intended setting.
    'max_depth': 3,
    'num_leaves': 130,
    'verbosity': -1,
    'metric': 'RMSE',
    'data_random_seed': 2,
    'reg_alpha': 1, 
    'reg_lambda': 0.001,
    'save_binary': True,
    'bagging_fraction': 0.8,
    'bagging_freq': 2,
    'min_split_gain': 0.05, 
#     'min_child_weight': 1, 
    'min_child_samples': 10, 
    'nthread': 4
}

# NOTE(review): newer LightGBM releases take early stopping and log frequency
# as callbacks rather than these keyword arguments — confirm against the
# installed version.
model = lgb.train(params2, train_set=d_train2, num_boost_round=5000, valid_sets=watchlist2, \
early_stopping_rounds=500, verbose_eval=500)
# In-sample predictions over the full training matrix; they feed the
# random-forest cell that follows.
predsL2_train = model.predict(X)
# predsL2_valid = model.predict(valid_X)
predsL2 = model.predict(X_test)

print('[{}] Predict lgb 2 completed.'.format(time.time() - start_time))

In [None]:
# Second-stage model: a shallow random forest fitted on the LightGBM
# predictions (single feature) against the same log-price target.
regr = RandomForestRegressor(max_depth=4, random_state=0, n_estimators = 20, min_samples_leaf = 50)

# regr_X_train = pd.DataFrame(data={'predsR_train':predsR_train.reshape(-1), 'predsR2_train':predsR2_train.reshape(-1),\
#                                   'predsL2_train':predsL2_train.reshape(-1)})
regr_X_train = pd.DataFrame(data={'predsL2_train':predsL2_train.reshape(-1)})

# The train and test frames use different column names ('predsL2_train' vs
# 'predsL2'). Plain arrays are passed to fit/predict so the estimator matches
# features by position, not by name.
# np.asarray(y) flattens the target whether it is a pandas Series or an ndarray.
regr.fit(regr_X_train.values, np.asarray(y).reshape(-1))
# regr_X_valid = pd.DataFrame(data={'predsR_valid':predsR_valid.reshape(-1), 'predsR2_valid':predsR2_valid.reshape(-1),\
#                                   'predsL_valid':predsL_valid.reshape(-1), 'predsL2_valid':predsL2_valid.reshape(-1)})

# regr_X_test = pd.DataFrame(data={'predsR':predsR.reshape(-1), 'predsR2':predsR2.reshape(-1),\
#                                  'predsL2':predsL2.reshape(-1)})
regr_X_test = pd.DataFrame(data={'predsL2':predsL2.reshape(-1)})

# predRegr_valid = regr.predict(regr_X_valid)
predRegr_test = regr.predict(regr_X_test.values)
print('[{}] Random Forest Regressor completed'.format(time.time() - start_time))

In [None]:
# preds = (predsR.reshape(-1)*0.2 + predsL.reshape(-1)*0.3 + predsL2.reshape(-1)*0.3 + predsR2.reshape(-1)*0.2)
# preds_valid = (predsR_valid.reshape(-1)*0.2 + predsL_valid.reshape(-1)*0.3 + predsL2_valid.reshape(-1)*0.3 + predsR2_valid.reshape(-1)*0.2)

# Scale back price
# pred_scale_back = price_scaler.inverse_transform(preds.reshape(-1, 1))
# pred_scale_back = np.expm1(pred_scale_back)
# pred_scale_back = pred_scale_back.clip(0, None)

# preds_valid_scaled = price_scaler.inverse_transform(preds_valid.reshape(-1, 1))
# preds_valid_scaled = np.expm1(preds_valid_scaled)
# preds_valid_scaled = preds_valid_scaled.clip(0, None)

# predRegr_valid_scaled = price_scaler.inverse_transform(predRegr_valid.reshape(-1, 1))
# predRegr_valid_scaled = np.expm1(predRegr_valid_scaled)
# predRegr_valid_scaled = predRegr_valid_scaled.clip(0, None)

# predRegr_test_scaled = price_scaler.inverse_transform(predRegr_test.reshape(-1, 1))
# Undo the log1p applied to the training target, then floor at zero so that no
# negative price is written out.
predRegr_test_scaled = np.expm1(predRegr_test)
predRegr_test_scaled = predRegr_test_scaled.clip(0, None)

# One prediction per test id, otherwise the submission would be misaligned.
assert len(predRegr_test_scaled) == len(submission)
# assign() returns a new frame instead of writing a column into a frame that
# may be a slice of the test data; re-running the cell stays safe.
submission = submission.assign(price=predRegr_test_scaled)
submission.to_csv("submission_ridge_2xlgbm_rfRegr.csv", index=False)
print('writting is done.')