In [1]:
# Enable IPython's autoreload extension so that edited modules are re-imported
# automatically before a cell runs (no kernel restart needed).
%load_ext autoreload
# Mode 1: only modules registered through %aimport are reloaded.
%autoreload 1
# Import the helper module `myutils` and register it for autoreload.
%aimport myutils

In [2]:
import sys

import fasttext
import numpy as np
import pandas as pd

# Report the interpreter version; only the first line of sys.version is shown.
version_line = sys.version.partition('\n')[0]
print(format('python:', '<16'), version_line)

python:          3.8.9 (default, Apr 13 2022, 08:48:06) 


## Define constants

In [3]:
# Seed intended for stochastic steps (equals 2147483647).
RANDOM_SEED = 2**31 - 1

# Directory with the raw CSV datasets, and directory for working files
# (stopword list, tokenized text, saved models and vectors).
DATA_PATH = './data/'
WORKING_PATH = './working/'

# Column names: the target to predict and the boolean flag that separates
# training rows from validation rows.
TARGET, TRAIN = 'is_bad', 'TRAIN'

# fastText settings used by the vectorizer and the manually configured classifier.
DIM = 100  # size of word vectors ('dim')
MIN_CHAR_NGRAM, MAX_CHAR_NGRAM = 2, 6  # char n-gram length bounds ('minn', 'maxn')
WORD_NGRAM = 2  # max length of word n-grams ('wordNgrams')

# Classifier hyper-parameter search: on/off switch and the time budget that is
# passed to fastText as 'autotuneDuration' (seconds).
AUTOTUNE = True
AUTOTUNE_TIME = 2 * 60 * 60  # 7200

## Load data

In [4]:
# Read the train and validation CSVs, parsing the submission timestamp as datetime.
df_train, df_valid = (
    pd.read_csv(DATA_PATH + csv_name, parse_dates=['datetime_submitted'])
    for csv_name in ('train.csv', 'val.csv')
)

# Mark the origin of every row so the combined frame can be split again later.
df_train[TRAIN], df_valid[TRAIN] = True, False

# Stack both splits into a single frame with a fresh 0..n-1 index.
df = pd.concat((df_train, df_valid), ignore_index=True)

# The per-split frames are not needed once they are combined.
del df_train, df_valid

## Prepare dataframe
### Preprocess

In [5]:
# Binary indicator of a missing price (1 = price was NaN). It is computed before
# the gaps are filled so the "price absent" information is not lost.
price_missing = df['price'].isna()
df['no_price'] = price_missing.astype('int64')

# Fill missing prices with the MEDIAN price (.median() skips NaNs).
# The median is taken from the training rows only: computing it over the
# combined train + valid frame would leak a statistic of the validation split
# into the features used for training.
median_price = df.loc[df[TRAIN], 'price'].median()
df['price'] = df['price'].fillna(median_price)

### Feature engineering

In [6]:
# TODO: add a "round price" indicator feature
# TODO: add features derived from 'datetime_submitted'
# TODO: drop the features that will not be used

### View df

In [7]:
# Preview the first two rows of the combined (train + valid) frame after preprocessing.
df.head(2)

Unnamed: 0,title,description,subcategory,category,price,region,city,datetime_submitted,is_bad,TRAIN,no_price
0,Диван-кровать,Продаем диван-кровать. Удобный механизм - евро...,Мебель и интерьер,Для дома и дачи,7000.0,Россия,Москва,2019-06-01 00:00:15.180656,0,True,0
1,Кожух рулевой колонки Даф хф 91 4509834,Кожух рулевой колонки DAF XF 94 (60066004)/\n ...,Запчасти и аксессуары,Транспорт,2290.0,Россия,Москва,2019-06-01 00:00:44.317933,0,True,0


### Split df into train and valid

In [8]:
# Boolean mask of the training rows; its negation selects the validation rows.
train_mask = df[TRAIN]
service_columns = [TRAIN, TARGET]

# Feature frames: remove the split flag and the target, renumber rows from zero.
X_train = df.loc[train_mask].drop(columns=service_columns).reset_index(drop=True)
X_valid = df.loc[~train_mask].drop(columns=service_columns).reset_index(drop=True)

# Targets as NumPy arrays, in the same row order as the feature frames.
y_train = df.loc[train_mask, TARGET].values
y_valid = df.loc[~train_mask, TARGET].values

### Tokenize text data

In [9]:
%%time
# Tokenize "title + description" for each split and store the result in
# 'description'. A trailing ' __label__<target>' token is appended to every text
# before tokenizing; the same file names are later used as fastText training and
# validation inputs, so the helper presumably writes the labelled text there
# (verify in myutils.get_tokenized_x).
splits = (
    (X_train, y_train, 'tokenized_x_train_w1_at.csv'),
    (X_valid, y_valid, 'tokenized_x_valid_w1_at.csv'),
)
for frame, labels, csv_name in splits:
    labelled_text = (
        frame['title'] + ' ' + frame['description']
        + ' __label__' + pd.Series(labels).astype(str)
    )
    frame['description'] = myutils.get_tokenized_x(
        labelled_text,
        fname=WORKING_PATH + csv_name,
        stopwords_fname=WORKING_PATH + 'stopwords-ru.txt',
        regexp=r'(?u)[\w@]+',
        saving=True,
    )

# Remove the label again: split once from the right on whitespace and keep the
# left part, i.e. everything except the last token.
for frame in (X_train, X_valid):
    frame['description'] = frame['description'].str.rsplit(n=1, expand=True).loc[:, 0]

# The title text now lives inside 'description', so the column is dropped.
for frame in (X_train, X_valid):
    frame.drop(columns='title', inplace=True)

CPU times: user 9.08 s, sys: 1.22 s, total: 10.3 s
Wall time: 10.5 s


In [10]:
# Preview the training features: 'title' has been dropped and 'description' now holds the tokenized text.
X_train.head(2)

Unnamed: 0,description,subcategory,category,price,region,city,datetime_submitted,no_price
0,диван кровать продавать диван кровать удобный ...,Мебель и интерьер,Для дома и дачи,7000.0,Россия,Москва,2019-06-01 00:00:15.180656,0
1,кожух рулевой колонка даф хф 91 4509834 кожух ...,Запчасти и аксессуары,Транспорт,2290.0,Россия,Москва,2019-06-01 00:00:44.317933,0


In [11]:
# Tabulate the shapes of features and targets per split to confirm the row counts match.
shape_table = {
    name: {'train': train_part.shape, 'valid': valid_part.shape}
    for name, train_part, valid_part in (('X', X_train, X_valid), ('y', y_train, y_valid))
}
pd.DataFrame(shape_table)

Unnamed: 0,X,y
train,"(984487, 8)","(984487,)"
valid,"(16237, 8)","(16237,)"


### Vectorize text data

In [12]:
%%time
# Settings for the unsupervised fastText embedding model. Values in square
# brackets are the library defaults as noted by the author.
params = dict(
    input=WORKING_PATH + 'tokenized_x_train_w1_at.csv',  # training file path (required)
    model='skipgram',  # unsupervised model type: cbow or skipgram [skipgram]
    lr=0.05,  # learning rate [0.05]
    dim=DIM,  # size of word vectors [100]
    ws=5,  # size of the context window [5]
    epoch=5,  # number of epochs [5]
    minCount=5,  # minimal number of word occurrences [5]
    minn=MIN_CHAR_NGRAM,  # min length of char n-gram [3]
    maxn=MAX_CHAR_NGRAM,  # max length of char n-gram [6]
    neg=5,  # number of negatives sampled [5]
    wordNgrams=WORD_NGRAM,  # max length of word n-gram [1]
    # Options deliberately left at their defaults:
    #   loss='ns'           loss function {ns, hs, softmax, ova} [ns]
    #   bucket=2000000      number of buckets [2000000]
    #   thread=8            number of threads [number of cpus]
    #   lrUpdateRate=100    rate of updates for the learning rate [100]
    #   t=0.0001            sampling threshold [0.0001]
    verbose=2,  # verbosity level [2]
)

# The model file name embeds the settings: dim, minn, maxn and wordNgrams are
# concatenated without separators.
settings_tag = ''.join(
    str(value) for value in (DIM, MIN_CHAR_NGRAM, MAX_CHAR_NGRAM, WORD_NGRAM)
)
vectorizer_path = WORKING_PATH + 'vectorizer_train_w1_at_' + settings_tag + '.bin'

# Obtain the vectorizer through the project helper (presumably it trains or
# loads the model stored under `fname` - verify in myutils.get_vectorizer).
vectorizer = myutils.get_vectorizer(params=params, fname=vectorizer_path, saving=True)

CPU times: user 268 ms, sys: 334 ms, total: 603 ms
Wall time: 892 ms




In [13]:
%%time
# Suffix shared by both output files: dim, minn, maxn and wordNgrams concatenated.
settings_tag = ''.join(
    str(value) for value in (DIM, MIN_CHAR_NGRAM, MAX_CHAR_NGRAM, WORD_NGRAM)
)

# Vectorize the tokenized descriptions of each split with the fitted vectorizer.
train_text_vectors = myutils.get_vectorized_x(
    X_train['description'],
    fname=WORKING_PATH + 'vectorized_x_train_w1_at_' + settings_tag + '.npy',
    vectorizer=vectorizer,
    saving=True,
)
valid_text_vectors = myutils.get_vectorized_x(
    X_valid['description'],
    fname=WORKING_PATH + 'vectorized_x_valid_w1_at_' + settings_tag + '.npy',
    vectorizer=vectorizer,
    saving=True,
)

# Append the numeric columns to the right of the text vectors.
columns = ['price', 'no_price']
X_train_vect = np.hstack((train_text_vectors, X_train[columns].values))
X_valid_vect = np.hstack((valid_text_vectors, X_valid[columns].values))

# Show the resulting shapes per split as the cell output.
shape_table = {
    'X': {'train': X_train_vect.shape, 'valid': X_valid_vect.shape},
    'y': {'train': y_train.shape, 'valid': y_valid.shape},
}
pd.DataFrame(shape_table)

CPU times: user 50.1 ms, sys: 320 ms, total: 371 ms
Wall time: 675 ms


Unnamed: 0,X,y
train,"(984487, 102)","(984487,)"
valid,"(16237, 102)","(16237,)"


## Define classifier and fit

In [14]:
%%time
# Both configurations train on the same tokenized training file.
train_file = WORKING_PATH + 'tokenized_x_train_w1_at.csv'

if not AUTOTUNE:
    # Manually chosen hyper-parameters. Values in square brackets are the
    # library defaults as noted by the author.
    params = dict(
        input=train_file,  # training file path (required)
        lr=0.1,  # learning rate [0.1]
        dim=DIM,  # size of word vectors [100]
        ws=5,  # size of the context window [5]
        epoch=5,  # number of epochs [5]
        minCount=1,  # minimal number of word occurrences [1]
        minCountLabel=1,  # minimal number of label occurrences [1]
        minn=MIN_CHAR_NGRAM,  # min length of char n-gram [0]
        maxn=MAX_CHAR_NGRAM,  # max length of char n-gram [0]
        neg=5,  # number of negatives sampled [5]
        wordNgrams=WORD_NGRAM,  # max length of word n-gram [1]
        # Options deliberately left at their defaults:
        #   loss='ns'                loss function {ns, hs, softmax, ova} [softmax]
        #   bucket=2000000           number of buckets [2000000]
        #   thread=8                 number of threads [number of cpus]
        #   lrUpdateRate=100         rate of updates for the learning rate [100]
        #   t=0.0001                 sampling threshold [0.0001]
        #   label='__label__'        label prefix ['__label__']
        #   pretrainedVectors=None   pretrained word vectors (.vec file) ['']
        verbose=2,  # verbosity level [2]
    )
    # The model file name embeds dim, minn, maxn and wordNgrams, concatenated
    # without separators.
    settings_tag = ''.join(
        str(value) for value in (DIM, MIN_CHAR_NGRAM, MAX_CHAR_NGRAM, WORD_NGRAM)
    )
    fname = WORKING_PATH + 'classifier_train_w1_at_' + settings_tag + '.bin'
else:
    # Let fastText search the hyper-parameters within the time budget.
    # NOTE(review): the autotune validation file is the same validation split
    # that is scored at the end of the notebook, so the reported metrics may be
    # optimistic - confirm whether a separate hold-out set is available.
    params = dict(
        input=train_file,
        # dim=DIM                        size of word vectors [100]
        autotuneValidationFile=WORKING_PATH + 'tokenized_x_valid_w1_at.csv',  # ['']
        # autotuneMetric='f1:__label__0'   ['f1']
        # autotunePredictions=1            [1]
        autotuneDuration=AUTOTUNE_TIME,  # [60 * 5]
        # autotuneModelSize='100M'         ['']
    )
    # The model file name embeds the autotune time budget.
    fname = WORKING_PATH + 'classifier_train_auto_' + str(AUTOTUNE_TIME) + '.bin'

# Obtain the classifier through the project helper (presumably it trains or
# loads the model stored under `fname` - verify in myutils.get_classifier).
classifier = myutils.get_classifier(params=params, fname=fname, saving=True)

Progress: 100.0% Trials:   43 Best score:  0.909097 ETA:   0h 0m 0s
Training again with best arguments
Read 55M words
Number of words:  1760190
Number of labels: 2
Progress: 100.0% words/sec/thread:  426672 lr:  0.000000 avg.loss:  0.106920 ETA:   0h 0m 0s


CPU times: user 13h 55min 49s, sys: 9min 58s, total: 14h 5min 48s
Wall time: 2h 9min 11s


## Predict

In [15]:
%%time
# Predict for every validation text; pred[0] holds the labels and pred[1] the
# matching probabilities, one sequence per input text.
pred = classifier.predict(X_valid['description'].tolist())

# Class id = last character of the first predicted label of each text.
pred_labels = np.array([int(row[0][-1:]) for row in pred[0]])
# Probability attached to that first predicted label.
winner_probas = np.array([row[0] for row in pred[1]])

# Express every score as the probability of class 1: where class 0 was
# predicted, take the complement of its probability.
pred_probas = np.where(pred_labels == 0, 1 - winner_probas, winner_probas)

CPU times: user 1.08 s, sys: 68 ms, total: 1.14 s
Wall time: 1.17 s


## Compute metric

In [16]:
# Score the predicted probabilities against the validation targets. The ad
# category is passed as well; the third result is keyed by category
# (presumably a per-category ROC AUC - verify in myutils.get_score).
macro_score, micro_score, roc_auc = myutils.get_score(y_valid, pred_probas, X_valid['category'])

# Print the two aggregate scores followed by an empty line.
print(macro_score, micro_score, end='\n\n')

# Display the per-category values as the cell result, highest first.
pd.Series(roc_auc).sort_values(ascending=False)

0.9017756703353609 0.9300779788070932



Транспорт              0.975117
Недвижимость           0.950268
Бытовая электроника    0.927029
Животные               0.915918
Для дома и дачи        0.915255
Хобби и отдых          0.881212
Личные вещи            0.872822
Услуги                 0.869428
Для бизнеса            0.855917
Работа                 0.854792
dtype: float64