In [2]:
from const import *
from function import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import warnings
import itertools

from tqdm import tqdm
from catboost import CatBoostRegressor
from catboost import Pool

# Silence every warning category for the rest of the session. This keeps the
# cell output clean but also hides pandas/CatBoost deprecation warnings.
warnings.simplefilter('ignore')

In [3]:
# Directory that holds the input CSVs and the cached feature pickles. The
# trailing slash matters: paths are built by plain string concatenation.
data_path = '../1.data/'
# File names of the train and test tables inside data_path.
train_file_name = 'train.csv'
test_file_name  = 'test.csv'

In [4]:
# Load the raw train and test tables.
train_df = pd.read_csv(f'{data_path}{train_file_name}')
test_df = pd.read_csv(f'{data_path}{test_file_name}')
# Stack train on top of test. ignore_index gives the combined frame unique
# 0..n-1 row labels instead of repeating each file's own 0-based labels.
master_df = pd.concat([train_df, test_df], ignore_index=True)

In [5]:
# Column roles: non_cols are kept out of the feature lists, num_cols are the
# numeric inputs, and every remaining train column is treated as categorical.
non_cols = ['id', 'price']
num_cols = ['year', 'odometer']
mun_df_cols = [*num_cols, *non_cols]
cat_cols = [
    name for name in train_df.columns
    if name not in non_cols and name not in num_cols
]
cat_df_cols = [*cat_cols, *non_cols]

In [6]:
def creansing_dataset(df):
    """Clean the raw columns of ``df`` in place and return the same frame.

    - ``year``: values greater than 2023 are shifted down by 1000.
    - ``size`` and ``manufacturer``: values are remapped through the
      ``size_replace`` and ``manufactuer_replace`` lookups.
    """
    # year: anything past 2023 is pulled back by 1000
    future_year = df['year'] > 2023
    df.loc[future_year, 'year'] -= 1000

    # size / manufacturer: remap values through their replacement tables
    for column, mapping in (('size', size_replace),
                            ('manufacturer', manufactuer_replace)):
        df[column] = df[column].replace(mapping)

    return df

In [7]:
def main_create_num_features(df, reference_year=2023):
    """Replace the ``year`` column with the number of years elapsed since it.

    Args:
        df: frame with a numeric ``year`` column. It is modified in place.
        reference_year: year the elapsed time is measured from. Defaults to
            2023, the value that used to be hard-coded here.

    Returns:
        The same frame, with ``elapsed_year`` appended and ``year`` removed.
    """
    df['elapsed_year'] = reference_year - df['year']
    del df['year']

    return df

In [8]:
def conver_category(df, col_list):
    """Integer-encode the given columns and store them as pandas categoricals.

    Each column is replaced in place by its factorize codes (first-seen
    order, missing values become -1) held in a ``category`` dtype column.
    The same frame is returned.
    """
    for name in col_list:
        codes, _ = df[name].factorize()
        # Categories are inferred from the integer codes themselves.
        df[name] = pd.Categorical(codes)

    return df


def create_cat_combination(df, comb_n):
    """Add per-group price statistics for every ``comb_n``-way column combination.

    For each combination of ``cat_cols`` the rows of ``df`` are grouped by
    those columns, and the group's min / max / mean / std of ``price`` plus an
    integer group id are merged back onto every row.

    NOTE(review): the statistics are computed over the same rows that receive
    them, so each row's own ``price`` contributes to its features. When these
    are used as model inputs, that leaks the target; consider out-of-fold
    statistics.

    Returns:
        A frame holding only the generated columns (``cat_cols``, ``price``
        and ``id`` are dropped).
    """
    print(f'create cat combination feature. combination number = {comb_n}')
    for key_tuple in tqdm(itertools.combinations(cat_cols, comb_n)):
        keys = list(key_tuple)
        joined_keys = '-'.join(keys)

        # One row of price statistics per observed key combination.
        stats = df.groupby(keys)['price'].agg(['min', 'max', 'mean', 'std'])
        stats = stats.add_prefix(f'comb{comb_n}_{joined_keys}_').reset_index()
        # The second reset_index turns the row number into an 'index' column,
        # which becomes the integer id of the combination.
        stats = stats.reset_index().rename(
            {'index': f'comb{comb_n}_{joined_keys}_id'}, axis=1
        )

        df = pd.merge(left=df, right=stats, on=keys, how='left')

    print(f'created {df.shape[1]-len(cat_df_cols)} features \n')

    return df.drop(columns=cat_cols+['price', 'id'])


def main_create_cat_combination(cat_df, comb_number_list=[2]):
    """Build, or load from cache, the combination features for each size.

    Args:
        cat_df: frame handed to ``create_cat_combination`` on a cache miss.
        comb_number_list: combination sizes to build. The default list is
            only read, never mutated.

    Returns:
        ``(features, id_cols)``: the per-size feature frames concatenated
        column-wise, and the names of the generated ``*_id`` columns that
        were converted to ``category``.
    """
    combination_df_l = []
    new_cat_cols_l = []
    for comb_number in comb_number_list:
        file_name = f'comb{comb_number}_features.pickle'
        cache_path = f'{data_path}{file_name}'
        # NOTE: the cache is keyed only by combination size. Delete the pickle
        # whenever the input data or the cleansing rules change.
        # NOTE(security): read_pickle can execute code; load trusted files only.
        if os.path.isfile(cache_path):
            df = pd.read_pickle(cache_path)
        else:
            df = create_cat_combination(cat_df, comb_number)
            df.to_pickle(cache_path)

        # Only the generated '<prefix>_id' columns are categorical. A bare
        # substring test would also match statistic columns whose source
        # column name happens to contain "id".
        comb_cat_cols = [col for col in df.columns if col.endswith('_id')]
        df = conver_category(df, comb_cat_cols)

        new_cat_cols_l.extend(comb_cat_cols)
        combination_df_l.append(df)

    return pd.concat(combination_df_l, axis=1), new_cat_cols_l


def main_create_cat_features(cat_df):
    """Assemble all categorical features for ``cat_df``.

    Combines the 2-way and 3-way combination features with the original
    categorical columns (integer-encoded as ``category``). ``price`` and
    ``id`` are dropped from the latter.

    Returns:
        ``(features, new_cat_cols_l)``, where ``new_cat_cols_l`` lists the
        generated combination id columns.
    """
    comb_df, new_cat_cols_l = main_create_cat_combination(cat_df, [2, 3])

    encoded_df = cat_df.drop(columns=['price', 'id'])
    encoded_df = conver_category(encoded_df, cat_cols)

    return pd.concat([comb_df, encoded_df], axis=1), new_cat_cols_l

In [9]:
def create_features_main(df):
    """Run cleansing and feature generation and return the model-ready frame.

    Args:
        df: combined train/test frame. ``creansing_dataset`` cleans it in
            place before features are derived.

    Returns:
        ``(features, new_cat_cols_l)``: numeric features, categorical and
        combination features, and ``price`` side by side on a 0..n-1 index,
        plus the names of the generated combination id columns.
    """
    df = creansing_dataset(df)
    # The merge-built combination features come back on a fresh 0..n-1 index.
    # Put the source frame on the same index so the column-wise concats below
    # pair rows correctly even if the caller passed duplicate or shuffled labels.
    df = df.reset_index(drop=True)

    print('Create Num Features')
    num_df = df[num_cols].copy()
    num_df = main_create_num_features(num_df)

    print('Create Cat Features')
    cat_df = df[cat_df_cols].copy()
    cat_df, new_cat_cols_l = main_create_cat_features(cat_df)

    feature_df = pd.concat([num_df, cat_df, df['price']], axis=1)
    print(f'total features : {feature_df.shape[1]}')
    # reduce_mem_usage comes from the project's helper module; presumably it
    # downcasts dtypes (the recorded output reports a memory reduction).
    feature_df = reduce_mem_usage(feature_df, verbose=True)

    return feature_df.reset_index(drop=True), new_cat_cols_l

# Rebuild the combined train+test frame on a fresh 0..n-1 index, then run the
# whole feature pipeline on it and preview the result.
master_df = pd.concat([train_df, test_df], ignore_index=True)
master_df, new_cat_cols_l = create_features_main(master_df)
master_df.head()

Create Num Features
Create Cat Features
create cat combination feature. combination number = 3


220it [12:35,  3.43s/it]


created 1100 features 

total features : 1445
Mem. usage decreased to 214.18 Mb (59.1% reduction)


Unnamed: 0,odometer,elapsed_year,comb2_region-manufacturer_id,comb2_region-manufacturer_min,comb2_region-manufacturer_max,comb2_region-manufacturer_mean,comb2_region-manufacturer_std,comb2_region-condition_id,comb2_region-condition_min,comb2_region-condition_max,...,cylinders,fuel,title_status,transmission,drive,size,type,paint_color,state,price
0,115148,74,0,5021.0,45304.0,19700.544922,11096.0,0,1396.0,48692.0,...,0,0,0,0,0,0,0,0,-1,27587.0
1,172038,10,1,4724.0,8137.0,6430.5,2414.0,1,1153.0,4724.0,...,1,0,0,1,0,1,1,1,0,4724.0
2,152492,25,2,2699.0,45191.0,13260.75,10632.0,2,1409.0,39744.0,...,0,0,0,1,1,1,2,1,1,10931.0
3,104118,9,3,1815.0,78636.0,18631.482422,14632.0,3,2548.0,78636.0,...,2,0,0,0,1,0,2,2,2,16553.0
4,144554,18,4,1026.0,25444.0,9457.542969,6212.0,4,2013.0,59723.0,...,0,0,0,0,1,0,1,3,3,5158.0


In [39]:
# Rows with a price form the training set. The second reset_index turns the
# new 0..n-1 position into an 'index' column, which the split cell uses to
# select rows.
train_df = (
    master_df.loc[master_df['price'].notna()]
    .reset_index(drop=True)
    .reset_index()
    .copy()
)
# Rows without a price are the ones to predict; they keep their master_df labels.
test_df = master_df.loc[master_df['price'].isna()].copy()

In [11]:
def built_catboost_model(cat_features, Xt, yt, Xe, ye):
    """Fit a CatBoost regressor on ``(Xt, yt)`` and validate on ``(Xe, ye)``.

    Args:
        cat_features: columns CatBoost should treat as categorical.
        Xt, yt: training features and target.
        Xe, ye: evaluation features and target, used for early stopping and
            for selecting the best iteration.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    hyper_params = {
        'learning_rate': 0.1,
        'iterations': 300,
        'loss_function': 'MAPE',
        'early_stopping_rounds': 30,
        'random_seed': 42,
    }
    model = CatBoostRegressor(**hyper_params)

    model.fit(
        X=Xt,
        y=yt,
        cat_features=cat_features,
        eval_set=(Xe, ye),
        use_best_model=True,
        verbose=50,
    )

    return model

In [12]:
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

In [13]:
# Hold out 25% of the labelled rows for validation. random_state pins the
# shuffle so the split, and every score derived from it, is reproducible.
train_index, valid_index = train_test_split(
    train_df['index'], test_size=0.25, random_state=42
)

# train_df['index'] holds each row's position (it was produced by
# reset_index), so the split values can be used directly with iloc.
# NOTE(review): the helper 'index' column stays in x_train / x_valid and is
# therefore fed to the model as a feature.
x_train = train_df.iloc[train_index].drop(columns='price')
y_train = train_df.iloc[train_index]['price']
x_valid = train_df.iloc[valid_index].drop(columns='price')
y_valid = train_df.iloc[valid_index]['price']

In [14]:
# Train on the hold-out split. Both the original categorical columns and the
# generated combination ids are declared as categorical features.
# NOTE(review): in the recorded run the best iteration is the final one (299),
# so the 300-iteration cap ended training, not early stopping.
model = built_catboost_model(
    cat_features=cat_cols + new_cat_cols_l,
    Xt=x_train,
    yt=y_train,
    Xe=x_valid,
    ye=y_valid,
)

0:	learn: 0.5587511	test: 0.5677294	best: 0.5677294 (0)	total: 611ms	remaining: 3m 2s
50:	learn: 0.5069758	test: 0.5160277	best: 0.5160277 (50)	total: 30.1s	remaining: 2m 27s
100:	learn: 0.4957191	test: 0.5049919	best: 0.5049919 (100)	total: 1m 2s	remaining: 2m 2s
150:	learn: 0.4947840	test: 0.5041600	best: 0.5041600 (150)	total: 1m 34s	remaining: 1m 33s
200:	learn: 0.4944551	test: 0.5038520	best: 0.5038512 (199)	total: 2m 8s	remaining: 1m 3s
250:	learn: 0.4942344	test: 0.5037071	best: 0.5037053 (249)	total: 2m 42s	remaining: 31.6s
299:	learn: 0.4936938	test: 0.5031966	best: 0.5031966 (299)	total: 3m 15s	remaining: 0us

bestTest = 0.5031966484
bestIteration = 299



In [24]:
# Preview the training features; the helper 'index' column is among them.
x_train.head()

Unnamed: 0,index,odometer,elapsed_year,comb2_region-manufacturer_id,comb2_region-manufacturer_min,comb2_region-manufacturer_max,comb2_region-manufacturer_mean,comb2_region-manufacturer_std,comb2_region-condition_id,comb2_region-condition_min,...,condition,cylinders,fuel,title_status,transmission,drive,size,type,paint_color,state
21288,21288,137199,12,165,2745.0,73885.0,17458.861328,14928.0,62,2054.0,...,2,1,0,0,1,2,1,5,4,2
354,354,76953,9,298,2699.0,37865.0,9232.472656,7536.0,85,1982.0,...,0,2,0,0,1,1,1,1,4,-1
12108,12108,190979,14,12,1181.0,54694.0,14299.423828,10632.0,12,1164.0,...,0,2,0,0,1,1,0,2,5,8
10940,10940,129710,15,3202,5500.0,5500.0,5500.0,,869,5500.0,...,2,0,0,0,1,1,0,2,4,38
19425,19425,123061,34,354,2156.0,70568.0,16622.892578,13832.0,587,2616.0,...,3,0,0,0,1,0,1,6,3,19


In [38]:
submit_df = pd.read_csv(f'{data_path}submit_sample.csv', header=None)
input_cols = list(x_train.columns)

# Predict on exactly the columns the model was trained on, in training order.
# test_df still carries the all-NaN 'price' column, which must not reach the
# model. reset_index supplies the 'index' column that x_train also contains.
test_features = test_df.reset_index()[input_cols]
assert len(test_features) == len(submit_df), \
    'test rows do not match the submission template'

submit_df[1] = model.predict(test_features)
# The submission is written to the working directory; note it shares its
# file name with the input test set stored under data_path.
submit_df.to_csv('test.csv', index=False, header=None)

In [40]:
# Number of test rows that need a prediction.
len(test_df)

27537

In [41]:
# Check the last submission rows: column 0 comes from the sample file,
# column 1 holds the model predictions.
submit_df.tail()

Unnamed: 0,0,1
27532,55064,1515.505062
27533,55065,3150.925594
27534,55066,5646.139158
27535,55067,5536.970706
27536,55068,5328.768415
