In [1]:
%load_ext autoreload
%autoreload 1
%aimport myutils

In [2]:
import joblib
import sys

import hiclass
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import scipy
import sklearn
from sklearn.ensemble import HistGradientBoostingClassifier

print('python:'.ljust(16), sys.version.split('\n')[0])
print('hiclass:'.ljust(16), hiclass.__version__)
print('scikit-learn:'.ljust(16), sklearn.__version__)

python:          3.9.12 (main, Apr  5 2022, 01:53:17) 
hiclass:         3.0.3
scikit-learn:    1.0.2


## Define constants

In [3]:
RANDOM_SEED = 2147483647
GRAPH_ROOT = 1

DATA_PATH = './data/'
WORKING_PATH = myutils.WORKING_PATH

IDENT = 'id'  # ID feature name
TARGET = myutils.TARGET  # Target feature name
TRAIN = myutils.TRAIN  # Binary feature name to separate train and test data
PATH_COLS = myutils.PATH_COLS
RANDOM = 'RANDOM'  # Feature of random numbers

MODEL_TYPE = 'lcpn'  # {'lcpn', 'lcn'}
N_ESTIMATORS = 20
N_WORD_SYMBOLS = 2
VALID_SIZE = 0.2

## Load data

In [4]:
# Load datasets
tree = pd.read_csv(DATA_PATH + 'categories_tree.csv')
df_train = pd.read_parquet(DATA_PATH + 'train.parquet', 'pyarrow')
df_test = pd.read_parquet(DATA_PATH + 'test.parquet', 'pyarrow')

# Join train and test datasets
df_test[TARGET] = pd.NA  # Fill target variable of the test dataset with zeros
df_train[TRAIN] = True  # Train/test flag
df_test[TRAIN] = False  # Train/test flag
df = pd.concat([df_train, df_test], ignore_index=True)

## Prepare graph

In [5]:
# Define graph
edgelist = [tuple(r) for r in tree[['parent_id', 'id']].to_numpy()]
G = nx.from_edgelist(edgelist, create_using=nx.DiGraph)

# Remove nodes that not in df
G.remove_node(0)
df_leaf_nodes = set(df.loc[df[TRAIN], TARGET])
while True:
    graph_leaf_nodes = {x for x in G.nodes() if G.out_degree(x)==0}
    useless_nodes = graph_leaf_nodes - df_leaf_nodes
    if len(useless_nodes) == 0:
        break
    G.remove_nodes_from(useless_nodes)

## Prepare dataframe
### Preprocess

In [6]:
# Add binary column to indicate rating absence (0.0 value)
df['no_rating'] = 0
df.loc[df['rating'] == 0.0, 'no_rating'] = 1

# Replace 0.0 rating with mean value
mean_rating = df.loc[df['rating'] == 0.0, 'rating'].mean()
df.loc[df['rating'].isna(), 'rating'] = mean_rating

# Replace -1 value of feedback quantity with 0
df.loc[df['feedback_quantity'] == -1, 'feedback_quantity'] = 0

# Add feature with random numbers
rng = np.random.default_rng(RANDOM_SEED)
df[RANDOM] = rng.integers(1000000, size=len(df))

# Drop features that will not be used
df.drop(['short_description', 'name_value_characteristics'], axis=1, inplace=True)

### Add columns with path nodes

In [7]:
path_maps = myutils.get_path_map_by_depth(G, GRAPH_ROOT)

path_map_df = pd.DataFrame.from_dict(
    path_maps[-1],
    orient='index',
    dtype=pd.Int64Dtype(),
    columns=PATH_COLS,
)

df = df.join(path_map_df, on=TARGET)

### View df

In [8]:
df.head(2)

Unnamed: 0,id,title,rating,feedback_quantity,category_id,TRAIN,no_rating,RANDOM,node_1,node_2,node_3,node_4,node_5
0,1267423,Muhle Manikure Песочные колпачки для педикюра ...,0.0,0,2693,True,1,815807,10012,10113,10355,2693,-1
1,128833,"Sony Xperia L1 Защитное стекло 2,5D",4.666667,9,13408,True,0,270854,10020,10044,10398,13408,-1


### Split df on train, valid, test

In [9]:
# Split df
X, X_train, X_valid, X_test, y, y_train, y_valid, _, _ = myutils.split_scale_df(
    df, validation_size=VALID_SIZE, random_state=RANDOM_SEED
)

# Print shapes
pd.DataFrame({'X': {'not test': X.shape, 'train': X_train.shape, 'valid': X_valid.shape, 'test': X_test.shape},
              'y': {'not test': y.shape, 'train': y_train.shape, 'valid': y_valid.shape, 'test': None}})

Unnamed: 0,X,y
not test,"(283452, 6)","(283452, 5)"
train,"(226761, 6)","(226761, 5)"
valid,"(56691, 6)","(56691, 5)"
test,"(70864, 6)",


In [10]:
# title_train = myutils.get_title_vectors(
#     X_train['title'],
#     'bl_titlevect_w2_train.joblib',
#     None,
#     saving=False,
# )
# title_valid = myutils.get_title_vectors(
#     X_valid['title'],
#     'bl_titlevect_w2_valid.joblib',
#     None,
#     saving=False,
# )
# title_whole = myutils.get_title_vectors(
#     X['title'],
#     'bl_titlevect_w2_whole.joblib',
#     None,
#     saving=False,
# )
# title_test = myutils.get_title_vectors(
#     X_test['title'],
#     'bl_titlevect_w2_test.joblib',
#     None,
#     saving=False,
# )


# feature_mask = joblib.load(WORKING_PATH + 'lvl1_feature_mask.joblib')
# joblib.dump(X_train.toarray()[:, feature_mask], WORKING_PATH + 'lvl_data_train.joblib')
# joblib.dump(X_valid.toarray()[:, feature_mask], WORKING_PATH + 'lvl_data_valid.joblib')
# joblib.dump(X.toarray()[:, feature_mask], WORKING_PATH + 'lvl_data_whole.joblib')
# joblib.dump(X_test.toarray()[:, feature_mask], WORKING_PATH + 'lvl_data_test.joblib')

In [21]:
%%time
# Get data
X_train = joblib.load(WORKING_PATH + 'lvl_data_train.joblib')
X_valid = joblib.load(WORKING_PATH + 'lvl_data_valid.joblib')


# Print shapes
pd.DataFrame({'X': {'train': X_train.shape, 'valid': X_valid.shape},
              'y': {'train': y_train.shape, 'valid': y_valid.shape}})

CPU times: user 447 ms, sys: 1.02 s, total: 1.47 s
Wall time: 1.62 s


Unnamed: 0,X,y
train,"(226761, 1736)","(226761, 5)"
valid,"(56691, 1736)","(56691, 5)"


## Define classifier and fit

In [22]:
%%time
clf = HistGradientBoostingClassifier(
    max_iter=100,
    random_state=RANDOM_SEED,
)

clf.fit(X_train, y_train[:, 0])

CPU times: user 23min 54s, sys: 2min 7s, total: 26min 1s
Wall time: 3min 48s


HistGradientBoostingClassifier(random_state=2147483647)

## Predict

In [23]:
%%time
pred_train = clf.predict(X_train)
pred_valid = clf.predict(X_valid)
print()

pred_valid


CPU times: user 39.6 s, sys: 3.28 s, total: 42.9 s
Wall time: 5.74 s


array([10014, 10020, 10012, ..., 10012, 10020, 10018])

## Compute metric

In [24]:
whF_train, _ = myutils.get_score(path_maps[0], y_train[:, :1], pred_train[:, None])
whF_valid, _ = myutils.get_score(path_maps[0], y_valid[:, :1], pred_valid[:, None])
print()
print('Weighted hF on train:', whF_train)
print('Weighted hF on valid:', whF_valid)

100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 249.41it/s]
100%|███████████████████████████████████████████| 5/5 [00:00<00:00, 1253.38it/s]



Weighted hF on train: 0.9730850259862007
Weighted hF on valid: 0.9693385884299881
