In [2]:
%matplotlib inline

import time
notebookstart= time.time()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
#print("Data:\n",os.listdir("../input"))

# Models Packages
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb

# Tf-Idf
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 

# Viz
import seaborn as sns
import matplotlib.pyplot as plt

import kaggle_util
from profiler import profile
from calcImgAtt import load
import parse_att
from tqdm import tqdm
from datetime import datetime
import string

from sklearn.linear_model import Ridge
# NOTE(review): sklearn.cross_validation is the legacy module (reportedly removed in
# scikit-learn 0.20 -- confirm against the pinned version). The fold loop at the end of
# this notebook relies on its StratifiedKFold(y, n_folds=...) call signature, so the
# import and that loop have to be migrated together.
from sklearn.cross_validation import KFold, StratifiedKFold
from sklearn.decomposition import LatentDirichletAllocation as LDA

# Notebook-level constants.
# dict_att_path: location of a pickle file; presumably the image-attribute dictionary
# consumed via calcImgAtt.load -- TODO confirm, it is not read in the cells shown here.
dict_att_path = '../input/dict_imgatt.pkl'
# Fold count and random seed for the modelling cells.
NFOLDS = 5
SEED = 5

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Truthy -> quick run on a small slice of each CSV; falsy -> full row range.
debug = 1

# [frm, to) is the row window read from train.csv / test.csv by the loading cell.
frm, to = (0, 10000) if debug else (0, 1503424)

In [4]:
# Both CSVs are read with the same options: rows [frm, to) only (skiprows starts at 1 so
# the header line is always kept), item_id as the index, activation_date parsed as dates.
read_opts = dict(skiprows=range(1, frm), nrows=to - frm,
                 index_col="item_id", parse_dates=["activation_date"])
test = pd.read_csv('../input/test.csv', **read_opts)
train = pd.read_csv('../input/train.csv', **read_opts)

# The recorded output of this cell shows pandas' "non-concatenation axis is not aligned"
# FutureWarning for this concat. sort=True pins the column-sorting behaviour the cell
# has been running with and silences the warning; later cells select columns by name,
# so column order itself is not relied upon.
df = pd.concat([train, test], sort=True)

# Re-split by target availability rather than by source file: rows that have an
# image_top_1 value become the training set, rows without one are to be predicted.
testdex = df[pd.isnull(df['image_top_1'])].index
traindex = df[pd.notnull(df['image_top_1'])].index

train = df.loc[traindex]
test = df.loc[testdex]

# Rebuild df with all labelled rows first, so positions [0, len_train) are the training
# rows and positions [len_train, ...) are the rows to predict.
len_train = len(train)
df = pd.concat([train, test])

# Target: image_top_1 as an unsigned integer class id; removed from the feature frame.
y = train.image_top_1.copy().astype(np.uint16)
df.drop("image_top_1", axis=1, inplace=True)

del train, test

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [5]:
# Columns that get label-encoded and are later declared categorical to LightGBM.
categorical = [
    "user_id", "region", "city",
    "parent_category_name", "category_name", "user_type",
    "param_1", "param_2", "param_3",
]

# Running list of model input columns; later cells append to it.
predictors = categorical + ['price']

# Keep only the categorical columns, the two raw text columns and price.
kept_columns = categorical + ['title', 'description', 'price']
df = df[kept_columns]

In [6]:
# Per-user aggregate columns taken from aggregated_features.csv.
user_agg_cols = ['avg_days_up_user', 'avg_times_up_user', 'n_user_items']

train_features = pd.read_csv('../input/aggregated_features.csv')

# Left-join on user_id. item_id is moved out of the index for the merge and restored
# afterwards, because merge does not carry the left index through.
df = df.reset_index().merge(train_features, on=['user_id'], how='left').set_index('item_id')

# Users without an aggregate row get 0. Assign the result back explicitly instead of
# calling fillna(inplace=True) on a column selection, which depends on that selection
# being a view of df.
df[user_agg_cols] = df[user_agg_cols].fillna(0)

predictors += user_agg_cols

In [7]:
# Replace every categorical column with integer codes from a LabelEncoder that is
# re-fitted per column.
lbl = preprocessing.LabelEncoder()
for col in tqdm(categorical):
    # fillna returns a new Series; the result has to be assigned back, otherwise the
    # missing values survive and are encoded as the literal string 'nan' below.
    df[col] = df[col].fillna('Unknown')
    codes = lbl.fit_transform(df[col].astype(str))
    # user_id is stored as uint32, every other categorical as uint16.
    code_dtype = np.uint32 if col == 'user_id' else np.uint16
    df[col] = codes.astype(code_dtype)

100%|██████████| 9/9 [00:00<00:00, 59.64it/s]


In [8]:
# Number of LDA components; one 'lda_cat_<i>' column is added per component.
n_components = 20

# The transformed matrix is cached on disk, keyed by the row window [frm, to).
# An existing cache file is loaded as-is; delete it to force a recompute.
lda_path = '../input/lda_{}_{}.npy'.format(frm, to)
if os.path.exists(lda_path):
    lda_categorical = np.load(lda_path)
else:
    lda = LDA(n_components=n_components, max_iter=5,
              learning_method='online',
              learning_offset=50.,
              random_state=0)
    # 'image_top_1' must not be listed here: it was dropped from df when the target y
    # was extracted (selecting it raises KeyError), and it is the value being predicted.
    lda_source_cols = ["parent_category_name",
                       "category_name", "user_type",
                       "param_1", "param_2", "param_3"]
    lda_categorical = lda.fit_transform(df[lda_source_cols])
    np.save(lda_path, lda_categorical)

# Attach each component as its own column and register it as a predictor.
for i in range(n_components):
    name = 'lda_cat_{}'.format(i)
    df[name] = lda_categorical[:, i]
    predictors.append(name)

del lda_categorical
gc.collect()

182

In [9]:
def count(l1, l2):
    """Return how many items of `l1` are contained in `l2`."""
    return sum(1 for x in l1 if x in l2)


def count_digit(s):
    """Return the number of digit characters in the string `s`."""
    return sum(c.isdigit() for c in s)


def count_num(s):
    """Return the number of whitespace-separated tokens of `s` that are numeric."""
    return sum(c.isnumeric() for c in s.split())


def add_text_feature(att_name, compute):
    """Register `att_name` as a predictor and fill it in if df lacks the column.

    `compute` is a zero-argument callable returning the new column; it is only
    called when the column does not exist yet.
    """
    predictors.append(att_name)
    if att_name not in df.columns:
        df[att_name] = compute()


# Meta Text Features
# Punctuation count on the raw description (taken before filling / lower-casing).
df['desc_punc'] = df['description'].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))

textfeats = ["description", "title"]
for cols in tqdm(textfeats):
    # Fill missing text BEFORE casting to str: astype(str) turns NaN into the string
    # 'nan', after which fillna has nothing left to fill.
    df[cols] = df[cols].fillna('nicapotato').astype(str)
    # Lowercase all text, so that capitalized words dont get treated differently
    df[cols] = df[cols].str.lower()

    # Count number of Characters
    add_text_feature(cols + '_num_chars',
                     lambda: df[cols].apply(len).astype(np.uint16))
    # Count number of Words
    add_text_feature(cols + '_num_words',
                     lambda: df[cols].apply(lambda comment: len(comment.split())).astype(np.uint16))
    # Count number of distinct Words
    add_text_feature(cols + '_num_unique_words',
                     lambda: df[cols].apply(lambda comment: len(set(comment.split()))).astype(np.uint16))
    # Distinct words as a percentage of all words (uses the two columns above)
    add_text_feature(cols + '_words_vs_unique',
                     lambda: (df[cols + '_num_unique_words'] / df[cols + '_num_words'] * 100).astype(np.float32))
    # Characters that are in string.punctuation
    add_text_feature(cols + '_punctuation',
                     lambda: df[cols].apply(count, args=(string.punctuation,)).astype(np.uint16))
    # Digit characters
    add_text_feature(cols + '_digit',
                     lambda: df[cols].apply(count_digit).astype(np.uint16))
    # Numeric tokens
    add_text_feature(cols + '_num',
                     lambda: df[cols].apply(count_num).astype(np.uint16))
    # Same value as '_num_chars'; kept because the ratio below reads it by this name
    add_text_feature(cols + '_num_letters',
                     lambda: df[cols].apply(len).astype(np.uint16))

# Cross-column ratios. A zero denominator (e.g. a whitespace-only description has
# zero words) yields NaN or inf in these columns rather than raising.
df['title_desc_len_ratio'] = df['title_num_letters']/df['description_num_letters']
df['desc_num_ratio'] = df['description_num']/df['description_num_words']
predictors += ['title_desc_len_ratio', 'desc_num_ratio']

100%|██████████| 2/2 [00:01<00:00,  1.66it/s]


In [10]:
# Each entry is (grouping/selection columns, list of aggregation names) and is passed
# to parse_att.calcGroupFeatureBulk one at a time.
category_keys = ['parent_category_name', 'category_name']
text_stats = ['num_chars', 'num_words', 'num_unique_words',
              'words_vs_unique', 'punctuation', 'digit', 'num']

feature_list = [
    (['region'] + category_keys, ['count', 'cumcount', 'nunique']),
    (category_keys + ['price'], ['count', 'zscore']),
    (['user_id', 'price'], ['count', 'cumcount']),
    (['user_id'] + category_keys + ['price'], ['count', 'cumcount']),
    (['region', 'city'] + category_keys + ['price'], ['count', 'zscore']),
]

# z-score of every description statistic, then every title statistic, per category.
feature_list += [
    (category_keys + ['{}_{}'.format(text_col, stat)], ['zscore'])
    for text_col in ('description', 'title')
    for stat in text_stats
]

# z-score of the two ratio columns per category.
feature_list += [
    (category_keys + [ratio_col], ['zscore'])
    for ratio_col in ('title_desc_len_ratio', 'desc_num_ratio')
]

for selcol, how in tqdm(feature_list):
    print(selcol, how)
    df, sub_changed = parse_att.calcGroupFeatureBulk(df, selcol, how, frm, to, predictors)

  0%|          | 0/21 [00:00<?, ?it/s]

['region', 'parent_category_name', 'category_name'] ['count', 'cumcount', 'nunique']
group feature: region_parent_category_name_category_name_count
calculate from scratch: region_parent_category_name_category_name_count
group feature: region_parent_category_name_category_name_cumcount
load from file


  5%|▍         | 1/21 [00:00<00:05,  3.37it/s]

group feature: region_parent_category_name_category_name_nunique
load from file
['parent_category_name', 'category_name', 'price'] ['count', 'zscore']
group feature: parent_category_name_category_name_price_count
calculate from scratch: parent_category_name_category_name_price_count
group feature: parent_category_name_category_name_price_zscore


 10%|▉         | 2/21 [00:00<00:04,  3.98it/s]

load from file
['user_id', 'price'] ['count', 'cumcount']
group feature: user_id_price_count
calculate from scratch: user_id_price_count


 14%|█▍        | 3/21 [00:00<00:04,  4.37it/s]

group feature: user_id_price_cumcount
load from file
['user_id', 'parent_category_name', 'category_name', 'price'] ['count', 'cumcount']
group feature: user_id_parent_category_name_category_name_price_count
calculate from scratch: user_id_parent_category_name_category_name_price_count
group feature: user_id_parent_category_name_category_name_price_cumcount
load from file


 19%|█▉        | 4/21 [00:00<00:03,  4.55it/s]

['region', 'city', 'parent_category_name', 'category_name', 'price'] ['count', 'zscore']
group feature: region_city_parent_category_name_category_name_price_count
calculate from scratch: region_city_parent_category_name_category_name_price_count
group feature: region_city_parent_category_name_category_name_price_zscore
load from file


 29%|██▊       | 6/21 [00:01<00:02,  5.01it/s]

['parent_category_name', 'category_name', 'description_num_chars'] ['zscore']
group feature: parent_category_name_category_name_description_num_chars_zscore
load from file
['parent_category_name', 'category_name', 'description_num_words'] ['zscore']
group feature: parent_category_name_category_name_description_num_words_zscore
load from file


 38%|███▊      | 8/21 [00:01<00:02,  5.65it/s]

['parent_category_name', 'category_name', 'description_num_unique_words'] ['zscore']
group feature: parent_category_name_category_name_description_num_unique_words_zscore
load from file
['parent_category_name', 'category_name', 'description_words_vs_unique'] ['zscore']
group feature: parent_category_name_category_name_description_words_vs_unique_zscore
load from file


 48%|████▊     | 10/21 [00:01<00:01,  6.13it/s]

['parent_category_name', 'category_name', 'description_punctuation'] ['zscore']
group feature: parent_category_name_category_name_description_punctuation_zscore
load from file
['parent_category_name', 'category_name', 'description_digit'] ['zscore']
group feature: parent_category_name_category_name_description_digit_zscore
load from file


 57%|█████▋    | 12/21 [00:01<00:01,  6.48it/s]

['parent_category_name', 'category_name', 'description_num'] ['zscore']
group feature: parent_category_name_category_name_description_num_zscore
load from file
['parent_category_name', 'category_name', 'title_num_chars'] ['zscore']
group feature: parent_category_name_category_name_title_num_chars_zscore
load from file


 67%|██████▋   | 14/21 [00:02<00:01,  6.73it/s]

['parent_category_name', 'category_name', 'title_num_words'] ['zscore']
group feature: parent_category_name_category_name_title_num_words_zscore
load from file
['parent_category_name', 'category_name', 'title_num_unique_words'] ['zscore']
group feature: parent_category_name_category_name_title_num_unique_words_zscore
load from file


 76%|███████▌  | 16/21 [00:02<00:00,  6.95it/s]

['parent_category_name', 'category_name', 'title_words_vs_unique'] ['zscore']
group feature: parent_category_name_category_name_title_words_vs_unique_zscore
load from file
['parent_category_name', 'category_name', 'title_punctuation'] ['zscore']
group feature: parent_category_name_category_name_title_punctuation_zscore
load from file


 86%|████████▌ | 18/21 [00:02<00:00,  7.12it/s]

['parent_category_name', 'category_name', 'title_digit'] ['zscore']
group feature: parent_category_name_category_name_title_digit_zscore
load from file
['parent_category_name', 'category_name', 'title_num'] ['zscore']
group feature: parent_category_name_category_name_title_num_zscore
load from file


 95%|█████████▌| 20/21 [00:02<00:00,  7.27it/s]

['parent_category_name', 'category_name', 'title_desc_len_ratio'] ['zscore']
group feature: parent_category_name_category_name_title_desc_len_ratio_zscore
load from file
['parent_category_name', 'category_name', 'desc_num_ratio'] ['zscore']
group feature: parent_category_name_category_name_desc_num_ratio_zscore
load from file


100%|██████████| 21/21 [00:02<00:00,  7.33it/s]


In [11]:
# Pass the frame through the project helper kaggle_util.reduce_mem_usage and rebind df
# to whatever it returns. Per the recorded output of this cell, the helper reports the
# frame's memory usage before and after (9.16 MB -> 3.34 MB on the debug slice).
df = kaggle_util.reduce_mem_usage(df)

100%|██████████| 81/81 [00:00<00:00, 957.55it/s]

Memory usage of dataframe is 9.16 MB
Memory usage after optimization is: 3.34 MB
Decreased by 63.5%





In [12]:
print("\n[TF-IDF] Term Frequency Inverse Document Frequency Stage")
russian_stop = set(stopwords.words('russian'))

# Keyword arguments shared by the description and the title vectorizer.
# (min_df=5 and max_df=.9 were tried at some point and are currently not set.)
tfidf_para = dict(
    stop_words=russian_stop,
    analyzer='word',
    token_pattern=r'\w{1,}',
    sublinear_tf=True,
    dtype=np.float32,
    norm='l2',
    smooth_idf=False,
)


def get_col(col_name):
    """Build a preprocessor that extracts `col_name` from a record (dict-like)."""
    def pick(record):
        return record[col_name]
    return pick


# The vectorizers are fed whole row records; each one's preprocessor picks its own
# text column. Only the description vocabulary is capped (17000 features).
description_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=17000,
    preprocessor=get_col('description'),
    **tfidf_para)
title_tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    preprocessor=get_col('title'),
    **tfidf_para)
vectorizer = FeatureUnion([
    ('description', description_tfidf),
    ('title', title_tfidf),
])

start_vect = time.time()
# Vocabulary / idf are fitted on the training rows only, then applied to every row.
vectorizer.fit(df.loc[traindex].to_dict('records'))
ready_df = vectorizer.transform(df.to_dict('records'))
tfvocab = vectorizer.get_feature_names()
elapsed_minutes = (time.time() - start_vect) / 60
print("Vectorization Runtime: %0.2f Minutes" % elapsed_minutes)

# The raw text columns are no longer needed once vectorized.
df.drop(textfeats, axis=1, inplace=True)


[TF-IDF] Term Frequency Inverse Document Frequency Stage
Vectorization Runtime: 0.10 Minutes


In [13]:
# Restrict the dense frame to the registered predictor columns.
df = df[predictors]

# Feature names in the column order of the stacked matrices: dense columns, then tf-idf.
tfvocab = list(df.columns) + tfvocab

# Rows from position len_train onward are the rows to predict; stack their dense
# features next to their tf-idf features.
test_dense = csr_matrix(df.iloc[len_train:].values)
testing = hstack([test_dense, ready_df[len_train:]])

# From here on df holds the training rows only.
df = df.loc[traindex]

In [23]:
# Parameter dict handed to lgb.train in the cross-validation cell: gradient-boosted
# trees on a multiclass objective, evaluated with multi-class log loss.
lgbm_params =  {
        'task': 'train',
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric' : 'multi_logloss',
        # NOTE(review): hard-coded class count; presumably max(image_top_1) + 1 on the
        # full data -- confirm, and consider deriving it from y.
        'num_class' : 3067,
        'num_leaves': 270,# 37,
        'feature_fraction': 0.5,
        # NOTE(review): no 'bagging_freq' is set here; per the LightGBM parameter docs
        # bagging_fraction only takes effect with a non-zero bagging_freq -- confirm.
        'bagging_fraction': 0.75,
        'learning_rate': 0.016,
        'nthread': 6,
        'verbose': 0,
        # NOTE(review): drop_rate is documented as a DART option; with boosting_type
        # 'gbdt' it is presumably ignored -- confirm.
        'drop_rate': 0.02
        }


In [24]:
# Two folds for a quick debug run, otherwise the notebook-wide fold count.
nfold = 2 if debug else NFOLDS

# Legacy sklearn.cross_validation API: the splitter is built from the labels and is
# itself iterable, yielding (train positions, validation positions) per fold.
skf = StratifiedKFold(y, n_folds=nfold)
for i, (train_split, val_split) in enumerate(skf):

    print(len(train_split),len(val_split))

    # Dense predictors and tf-idf features side by side, selected by row position.
    X_train = hstack([csr_matrix(df.iloc[train_split].values),ready_df[train_split]])
    X_valid = hstack([csr_matrix(df.iloc[val_split].values),ready_df[val_split]]) # Sparse Matrix

    # The fold indices are positions, while y is indexed by item_id: select with
    # .iloc instead of relying on y[...] falling back to positional lookup.
    y_train = y.iloc[train_split]
    y_valid = y.iloc[val_split]

    lgtrain = lgb.Dataset(X_train, y_train,
                    feature_name=tfvocab,
                    categorical_feature = categorical)
    lgvalid = lgb.Dataset(X_valid, y_valid,
                    feature_name=tfvocab,
                    categorical_feature = categorical)

    modelstart = time.time()
    # Up to 26000 rounds, stopping once the validation metric has not improved for
    # 100 rounds; progress is logged every 100 rounds. lgb_clf is rebound on every
    # fold, so only the last fold's booster remains after the loop.
    lgb_clf = lgb.train(
        lgbm_params,
        lgtrain,
        num_boost_round=26000,
        valid_sets=[lgtrain, lgvalid],
        valid_names=['train','valid'],
        early_stopping_rounds=100,
        verbose_eval=100
    )



8551 9924




Training until validation scores don't improve for 100 rounds.


KeyboardInterrupt: 