# CatBoost model baseline

In [9]:
import pandas as pd
import re
import spacy
import torch
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Seed reused by the stochastic steps further down (SVD, CatBoost).
SEED = 42

# Relative paths to the pre-split parquet files.
path_to_train_dataset = "../dataset/train/train.parquet"
path_to_val_dataset = "../dataset/val/val.parquet"
path_to_test_dataset = "../dataset/test/test.parquet"

# Read the three splits in one pass; the path order matches the unpacking order.
df_train, df_val, df_test = (
    pd.read_parquet(split_path, engine="pyarrow")
    for split_path in (path_to_train_dataset, path_to_val_dataset, path_to_test_dataset)
)

df_train.head()

Unnamed: 0,is_available,gen,year,mileage,color,equipment,tax,transmission,drive,wheel_type,state,owners,price,model_name,description
0,True,III,2014,212000,синий,1 опция,39900,автоматическая,полный,Левый,Не требует ремонта,1 владелец,2100000,"Toyota Sienna, 2014","Машина привезена из США, владею уже больше 5 л..."
1,True,II (G02) Рестайлинг,2021,73750,чёрный,xDrive20d M Sport Pure,9500,автоматическая,полный,Левый,Не требует ремонта,3 или более,5599000,"BMW X4 20d, 2021","УНИКАЛЬНОЕ ПРЕДЛОЖЕНИЕ✅КРЕДИТ ОТ 4,9% ГОДОВЫХ✅..."
2,True,I Рестайлинг,2024,1,серый,Full Moon (Xingyue L),17850,автоматическая,полный,Левый,Не требует ремонта,1 владелец,3600000,"Geely Monjaro (Xingyue L), 2024",Максимальная выгода на любой автомобиль из нал...
3,True,I (X156),2017,123652,чёрный,57 опций,5250,роботизированная,передний,Левый,Не требует ремонта,3 или более,1897000,"Mercedes-Benz GLA 200, 2017",РОЛЬФ | Вешки автомобили с пробегом✅Данный авт...
4,True,IV Рестайлинг,2021,111798,серый,41 опция,8950,автоматическая,полный,Левый,Не требует ремонта,1 владелец,3045000,"Hyundai Santa Fe, 2021",Комплектации LifeStyle 2.5 АТ AWD 181л.с.Один ...


### Column preprocessing

In [10]:
# Columns whose string values are joined into the single "text" feature.
TEXT_COLUMNS = [
    "gen", "color", "transmission", "drive",
    "wheel_type", "state", "model_name", "description",
]

# Whitespace normalisation rules as (compiled pattern, replacement), applied in order.
_WHITESPACE_RULES = (
    (re.compile(r'\n\n+'), '\n'),
    (re.compile(r'\t+'), ' '),
    (re.compile(r' +'), ' '),
)

def clean_text(text: str) -> str:
    """Normalise whitespace in ``text``.

    Runs of two or more newlines collapse to one newline, runs of tabs become
    a single space, runs of spaces collapse to one space, and leading/trailing
    whitespace is stripped.

    Args:
        text: The raw string to normalise.

    Returns:
        The normalised string.
    """
    for pattern, replacement in _WHITESPACE_RULES:
        text = pattern.sub(replacement, text)
    return text.strip()

def build_text_feature(df: pd.DataFrame) -> pd.Series:
    """Join the TEXT_COLUMNS of ``df`` into one cleaned string per row.

    Missing values (NaN/None) are replaced with empty strings before joining,
    because ``" ".join`` raises ``TypeError`` on any non-string element.

    Args:
        df: Frame containing every column listed in TEXT_COLUMNS.

    Returns:
        A Series aligned with ``df.index`` holding the space-joined text after
        ``clean_text`` has been applied.
    """
    # astype(object) first so fillna("") also works on non-object string-like dtypes.
    parts = df[TEXT_COLUMNS].astype(object).fillna("").astype(str)
    return parts.agg(" ".join, axis=1).apply(clean_text)

df_train["text"] = build_text_feature(df_train)
df_val["text"] = build_text_feature(df_val)
df_test["text"] = build_text_feature(df_test)

df_train["text"].head()

0    III синий автоматическая полный Левый Не требу...
1    II (G02) Рестайлинг чёрный автоматическая полн...
2    I Рестайлинг серый автоматическая полный Левый...
3    I (X156) чёрный роботизированная передний Левы...
4    IV Рестайлинг серый автоматическая полный Левы...
Name: text, dtype: object

### Split features and target variable

In [14]:
# Model inputs: the combined text column plus two numeric columns.
feature_columns = ["text", "mileage", "tax"]

X_train = df_train[feature_columns]
X_val = df_val[feature_columns]
X_test = df_test[feature_columns]

# Train/validation targets are log-prices; the test target stays on the
# original price scale.
Y_train = np.log(df_train["price"])
Y_val = np.log(df_val["price"])
Y_test = df_test["price"]

X_train.head()

Unnamed: 0,text,mileage,tax
0,III синий автоматическая полный Левый Не требу...,212000,39900
1,II (G02) Рестайлинг чёрный автоматическая полн...,73750,9500
2,I Рестайлинг серый автоматическая полный Левый...,1,17850
3,I (X156) чёрный роботизированная передний Левы...,123652,5250
4,IV Рестайлинг серый автоматическая полный Левы...,111798,8950


### Calculate tf-idf features

In [16]:
# Stop-word list taken from the defaults of the spaCy Russian pipeline.
nlp = spacy.load("ru_core_news_sm")
russian_stopwords = list(nlp.Defaults.stop_words)

vectorizer = TfidfVectorizer(
    max_features=20000,
    stop_words=russian_stopwords,
    sublinear_tf=True,
    analyzer="word",
    token_pattern=r"\w{1,}",
    ngram_range=(1, 2),
)

# Keep the TF-IDF matrices sparse. Calling .toarray() would allocate a dense
# (n_rows x up to 20000) array per split, and TruncatedSVD accepts sparse
# input directly, so the dense copy is pure memory overhead.
# The vocabulary and IDF weights are fitted on the training split only.
X_train_tfidf = vectorizer.fit_transform(X_train["text"])
X_val_tfidf = vectorizer.transform(X_val["text"])
X_test_tfidf = vectorizer.transform(X_test["text"])

### Reduce the TF-IDF feature space with truncated SVD

In [17]:
svd = TruncatedSVD(n_components=400, random_state=SEED)

# Store the projections under new names instead of overwriting X_*_tfidf.
# Overwriting made this cell non-idempotent: a second run would fit the SVD on
# its own previous output rather than on the TF-IDF matrices.
# The decomposition is fitted on the training split only.
X_train_svd = svd.fit_transform(X_train_tfidf)
X_val_svd = svd.transform(X_val_tfidf)
X_test_svd = svd.transform(X_test_tfidf)

# One column name per SVD component: svd_0, svd_1, ...
svd_features = [f'svd_{i}' for i in range(X_train_svd.shape[1])]

# Re-attach the original row indices so the frames align with X_train/X_val/X_test.
X_train_tfidf_df = pd.DataFrame(X_train_svd, index=X_train.index, columns=svd_features)
X_val_tfidf_df = pd.DataFrame(X_val_svd, index=X_val.index, columns=svd_features)
X_test_tfidf_df = pd.DataFrame(X_test_svd, index=X_test.index, columns=svd_features)

### Combine the SVD-reduced TF-IDF features with the numerical features

In [18]:
# Numeric columns placed in front of the SVD components in the model matrix.
numeric_columns = ["mileage", "tax"]

# Column-wise concatenation; rows are aligned on the shared index.
catboost_train = pd.concat((X_train[numeric_columns], X_train_tfidf_df), axis="columns")
catboost_val = pd.concat((X_val[numeric_columns], X_val_tfidf_df), axis="columns")
catboost_test = pd.concat((X_test[numeric_columns], X_test_tfidf_df), axis="columns")

### Train the CatBoost model

In [19]:
%%time
catboost = CatBoostRegressor(
    iterations=10000, 
    learning_rate=0.1,
    depth=9, 
    verbose=100, 
    random_seed=SEED, 
    task_type="GPU" if torch.cuda.is_available() else "CPU", 
    eval_metric="R2", 
    od_wait=150, 
    use_best_model=True,
)

catboost.fit(catboost_train, Y_train, eval_set=(catboost_val, Y_val))

Default metric period is 5 because R2 is/are not implemented for GPU
Metric R2 is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	learn: 0.1330692	test: 0.1321153	best: 0.1321153 (0)	total: 58.8ms	remaining: 9m 47s
100:	learn: 0.9268570	test: 0.8655634	best: 0.8655634 (100)	total: 5.83s	remaining: 9m 31s
200:	learn: 0.9562696	test: 0.8748983	best: 0.8749011 (198)	total: 11.6s	remaining: 9m 25s
300:	learn: 0.9687377	test: 0.8786904	best: 0.8786904 (300)	total: 17.3s	remaining: 9m 16s
400:	learn: 0.9749730	test: 0.8800237	best: 0.8800237 (400)	total: 22.9s	remaining: 9m 7s
500:	learn: 0.9789013	test: 0.8805673	best: 0.8806469 (495)	total: 28.6s	remaining: 9m 1s
600:	learn: 0.9806963	test: 0.8810777	best: 0.8810938 (599)	total: 34.3s	remaining: 8m 56s
700:	learn: 0.9830261	test: 0.8820071	best: 0.8820130 (699)	total: 40.1s	remaining: 8m 51s
800:	learn: 0.9854206	test: 0.8826974	best: 0.8827363 (780)	total: 45.8s	remaining: 8m 46s
900:	learn: 0.9868396	test: 0.8829095	best: 0.8829817 (862)	total: 51.5s	remaining: 8m 40s
1000:	learn: 0.9880035	test: 0.8830850	best: 0.8830876 (987)	total: 57.3s	remaining: 8m 35s
110

<catboost.core.CatBoostRegressor at 0x795e9765ddb0>

### Calculate the final test metric for the CatBoost model

In [20]:
# The model was fitted on log-prices, so np.exp maps its predictions back to
# the price scale of Y_test before scoring.
catboost_pred_test = catboost.predict(catboost_test)
test_r2 = r2_score(Y_test, np.exp(catboost_pred_test))
print(f"r2_score: {test_r2}")

r2_score: 0.8144389323397352
