# Setting

## Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
REF_PATH = '/content/drive/MyDrive/Github/12_도서평점예측'
os.chdir(REF_PATH)

Mounted at /content/drive


<br>

## Import Library

In [2]:
!pip install -q catboost

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [16]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import mean_squared_error, f1_score

In [4]:
def setdiff(x,y):
    return list(set(x)-set(y))

<br></br>

# Data Load

In [5]:
#-----------------------------------------------------------------------------------------------#
# > 도서정보
#-----------------------------------------------------------------------------------------------#
# (1) ID : 샘플 고유 ID
#     - 모두 unique한 값들임
#     - train/test는 서로 다 다른 id들임
#     - 삭제해도 무관함
#
# (2) User-ID : 유저 고유 ID
#     - 한번씩만 평가한 유저도 있고, 11,143번이나 평가한 유저도 있음
#     - nunique : train(83,256건), test(21,909건)
#     - train에만있는대상(70,192건), test에만있는대상(8,845건)
#
# (3) Book-ID : 도서 고유 ID
#     - 한번씩만 평가된 책이 있고, 2,502번이나 평가된 책이 있음
#     - nunique : train(243,441건), test(62,333건)
#     - train에만있는대상(207,723건), test에만있는대상(26,615건)
#
# - (참조) https://dacon.io/competitions/official/236093/talkboard/408269?page=1&dtype=recent
# > Book-ID가 다르더라도 책의 세부 정보가 동일한 경우가 있습니다. 이러한 상황은 다음과 같은 경우에서 발생할 수 있습니다.
# 1. 다양한 출판사 및 발행국가 : 동일한 책이 여러 출판사에 의해 출간되거나, 다른 국가 혹은 지역에서 출간될 경우 서로 다른 Book-ID를 가질 수 있습니다.
# 2. 다양한 에디션 및 인쇄: 책의 개정판, 갱신판, 확장판 등이 발행될 때마다 새로운 Book-ID가 부여될 수 있습니다. 또한, 책의 여러 인쇄에서도 서로 다른 Book-ID가 사용될 수 있습니다.
# 3. 다양한 포맷: 동일한 제목의 책이 하드커버, 종이책, 오디오북, 전자책 등 다양한 형태로 출간될 경우, 각 포맷마다 고유한 Book-ID를 가지게 됩니다.
#
#-----------------------------------------------------------------------------------------------#
# > 유저정보
#-----------------------------------------------------------------------------------------------#
# (4) Age : 유저의 나이
#     - (최소,최대) = train(0,244), test(0,237)으로 데이터가 이상함
#     - train에서 0살이 495건(0.06%), 100살 초과가 2,573건(0.30%)으로, 총 3,068건(0.35%) 있음.
#     - 이것들을 어떻게 처리할지? (테스트에도 있어서 obs를 제거 할 수 없음.)
#     - user_id별로 age의 nunique는 1개씩임 (unique하지 않으면 채워넣으려고 했었는데..)
#
# (5) Location : 유저의 지역
#     - 유저지역이 1개인 곳도 있고, 12,267개인 곳도 있음
#     - nunique : train(20,971건), test(8,581건)
#     - train에만있는대상(13,897건), test에만있는대상(1,507건)
#
#-----------------------------------------------------------------------------------------------#
# > 도서정보
#-----------------------------------------------------------------------------------------------#
# (6) Book-Title : 도서명
#     - 도서명이 1개인 것도 있고, 2,502개인 것도 있음
#     - nunique : train(217,829건), test(59,408건)
#     - train에만있는대상(181,636건), test에만있는대상(23,215건)
#
# (7) Book-Author : 도서 저자
#     - 도서저자가 1개인 것도 있고, 8,467개인 것도 있음
#     - nunique : train(92,635건), test(32,605건)
#     - train에만있는대상(68,980건), test에만있는대상(8,950건) 
#
# (8) Year-Of-Publication : 도서 출판 년도 (-1일 경우 결측 혹은 알 수 없음)
#     - 동일한 도서명인데도 출판년도가 다른게 있음 (개정본 등의 이유로 보임)
#     - nunique : train(110건), test(82건)
#     - (최소,최대) = train(1376,2021), test(1909,2021)
#     - -1인경우가 train에서 11,515(1.32%), test에서 2,425(1.52%)
#     - train에서 -1을 제외하고 연도별 평균 평점을 확인해봤을 때, 연도별로 딱히 관련성이 없어보임
#       -> 제거해도 괜찮을듯?
#       tmp = train_df[train_df['year_of_publication']!=-1]
#       tmp.groupby('year_of_publication')['book_rating'].mean().sort_index()
#
# (9) Publisher : 출판사
#     - 출판사가 1개인 것도 있고, 29,696개인 것도 있음
#     - nunique : train(15,505건), test(6,584건)
#     - train에만있는대상(10,123건), test에만있는대상(1,202건) 

#-----------------------------------------------------------------------------------------------#
# > 타겟정보
#-----------------------------------------------------------------------------------------------#
# (10) Book-Rating : 유저가 도서에 부여한 평점 (0점 ~ 10점)
#     - 0점은 inplict infomation(암시적 정보?)
#     - Rating이 0인 경우는 해당 유저가 특정 책에 관심이 없고, 관련이 없는 경우로 보고 0점도 예측 할 수 있도록 개발필요
#     - (참조) : https://dacon.io/competitions/official/236093/talkboard/408231?page=1&dtype=recent

In [6]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')
train_df.columns = [col.replace('-','_').lower() for col in train_df.columns]
test_df.columns  = [col.replace('-','_').lower() for col in test_df.columns]

In [7]:
print('> 건수 : Train({:,}건), Test({:,}건)'.format(len(train_df),len(test_df)))
print('> Head of Data')
train_df.head()

> 건수 : Train(871,393건), Test(159,621건)
> Head of Data


Unnamed: 0,id,user_id,book_id,book_rating,age,location,book_title,book_author,year_of_publication,publisher
0,TRAIN_000000,USER_00000,BOOK_044368,8,23.0,"sackville, new brunswick, canada",Road Taken,Rona Jaffe,2001.0,Mira
1,TRAIN_000001,USER_00000,BOOK_081205,8,23.0,"sackville, new brunswick, canada",Macbeth (New Penguin Shakespeare),William Shakespeare,1981.0,Penguin Books
2,TRAIN_000002,USER_00000,BOOK_086781,0,23.0,"sackville, new brunswick, canada",Waverley (Penguin English Library),Walter Scott,1981.0,Penguin Books
3,TRAIN_000003,USER_00000,BOOK_098622,0,23.0,"sackville, new brunswick, canada",Mother Earth Father Sky,Sue Harrison,1991.0,Avon
4,TRAIN_000004,USER_00000,BOOK_180810,8,23.0,"sackville, new brunswick, canada",She Who Remembers,Linda Lay Shuler,1989.0,Signet Book


In [8]:
# 아이디어
# (1) 책의 카테고리를 찾기

<br></br>

# Data Preprocess

## Convert string feature to numeric infomation

In [9]:
def string_to_numeric_infomation(
    data,string_columns,
    numeric_columns=None,agg=['mean','min','max'],
    delete_string_columns=False,
):
    if isinstance(string_columns,str):
        string_columns = [string_columns]
    if isinstance(numeric_columns,str):
        numeric_columns = [numeric_columns]
    
    new_data = data.copy()
    for str_col in tqdm(string_columns):
        cnt_data = new_data[str_col].value_counts().reset_index()\
            .rename(columns={'index':str_col,str_col:f'{str_col}_cnt'})
        rank_data = new_data[str_col].value_counts().rank(ascending=False).reset_index()\
            .rename(columns={'index':str_col,str_col:f'{str_col}_rank'})
        new_data = new_data\
            .merge(cnt_data,how='left',on=str_col)\
            .merge(rank_data,how='left',on=str_col)
        if numeric_columns is not None:
            for num_col in numeric_columns:
                for _agg in agg:
                    mean_data = new_data.groupby(str_col)[num_col].agg(_agg).reset_index()\
                        .rename(columns={num_col:f'{str_col}_{_agg}_{num_col}'})
                    new_data = new_data\
                        .merge(mean_data,how='left',on=str_col)
        if delete_string_columns:
            new_data.drop(str_col,axis=1,inplace=True)
        
    return new_data

In [96]:
train_df2 = train_df.copy()
test_df2  = test_df .copy()

# (1) feature들의 type 정리
unuse_features = ['id','year_of_publication']
cat_features   = ['user_id','book_id','location','book_title','book_author','publisher']
num_features   = ['age']
target_feature = 'book_rating'

# (2) 필요없는 컬럼 제거
train_df2.drop(columns=unuse_features,inplace=True)
test_df2 .drop(columns=unuse_features,inplace=True)

# (3) categorical feature들을 numeric feature로 변환 (category가 너무 많아서 그대로 사용하기 힘듦)
train_df2 = string_to_numeric_infomation(
    data=train_df2,
    string_columns=cat_features,
    numeric_columns=num_features,
    delete_string_columns=True,
)
test_df2 = string_to_numeric_infomation(
    data=test_df2,
    string_columns=cat_features,
    numeric_columns=num_features,
    delete_string_columns=True,
)

# (4) 모두 하나의 값을 가지는 컬럼 제거

100%|██████████| 6/6 [00:16<00:00,  2.75s/it]
100%|██████████| 6/6 [00:02<00:00,  2.40it/s]


In [97]:
print(train_df2.shape)
train_df2.head()

(871393, 32)


Unnamed: 0,book_rating,age,user_id_cnt,user_id_rank,user_id_mean_age,user_id_min_age,user_id_max_age,book_id_cnt,book_id_rank,book_id_mean_age,...,book_author_cnt,book_author_rank,book_author_mean_age,book_author_min_age,book_author_max_age,publisher_cnt,publisher_rank,publisher_mean_age,publisher_min_age,publisher_max_age
0,8,23.0,8,11785.0,23.0,23.0,23.0,14,9016.5,40.428571,...,106,1167.5,38.933962,18.0,116.0,6510,22.0,37.939324,0.0,204.0
1,8,23.0,8,11785.0,23.0,23.0,23.0,6,24991.5,30.166667,...,1414,41.0,31.685997,0.0,148.0,14299,10.0,35.642492,0.0,201.0
2,0,23.0,8,11785.0,23.0,23.0,23.0,2,86552.0,85.5,...,24,4753.0,37.541667,21.0,148.0,14299,10.0,35.642492,0.0,201.0
3,0,23.0,8,11785.0,23.0,23.0,23.0,23,4823.0,34.782609,...,79,1586.0,36.278481,17.0,116.0,14797,9.0,37.4926,0.0,239.0
4,8,23.0,8,11785.0,23.0,23.0,23.0,53,1530.5,39.301887,...,110,1126.5,38.636364,21.0,68.0,16018,8.0,36.905232,0.0,239.0


<br>

## Delete equal columns

In [98]:
def delete_equal_columns(data):
    new_data = data.copy()
    
    try_iter = 0
    while True:
        try_iter+=1
        columns = new_data.columns
        equal_column_list = []
        
        pbar = trange(new_data.shape[1])
        for i in pbar:
            pbar.set_description('Try({}) '.format(try_iter))
            for j in range(new_data.shape[1]):
                if i>j:
                    col_i, col_j = columns[i], columns[j]

                    if new_data[col_i].nunique() == new_data[col_j].nunique():
                        ct = pd.crosstab(new_data[col_i],new_data[col_j])
                        if np.diag(ct).sum() == len(new_data):
                            equal_column_list.append([col_i,col_j])
                    else:
                        pass

        if len(equal_column_list)>0:
            delete_columns = pd.unique(np.array(equal_column_list)[:,-1])
            new_data = new_data.drop(columns=delete_columns)
            print('Delete: {}\n'.format(delete_columns))
        else:
            break
    
    return new_data

In [99]:
train_df3 = delete_equal_columns(train_df2)
feature_names = [col for col in train_df3.columns if col!=target_feature]

test_df3  = test_df2.copy()
test_df3  = test_df2[feature_names]

Try(1) : 100%|██████████| 32/32 [00:10<00:00,  3.00it/s]


Delete: ['age' 'user_id_mean_age' 'user_id_min_age']



Try(2) : 100%|██████████| 29/29 [00:07<00:00,  3.77it/s]


In [100]:
print(train_df2.shape,train_df3.shape)

(871393, 32) (871393, 29)


<br></br>

# Modeling

In [107]:
n_splits = 5
model_type = ['cls','reg'][1]
target_transform = ['None','log','sqrt'][0]
offset = 1e-3

iterations = 20000
early_stopping_rounds = [None,int(iterations/5)][0]
learning_rate = 0.1

In [109]:
skf = StratifiedKFold(n_splits=n_splits,shuffle=True,random_state=0)

X = train_df3.drop(target_feature,axis=1)
y = train_df3[target_feature]

kfold_iter=0
for train_idx, valid_idx in skf.split(X,y):
    kfold_iter+=1
    print()
    print('-'*80)
    print('> kfold : {}/{}'.format(kfold_iter,n_splits))
    print('-'*80)

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
    if target_transform=='None':
        pass
    elif target_transform=='log':
        y_train = np.log(y_train+offset)
        y_valid = np.log(y_valid+offset)
    elif target_transform=='sqrt':
        y_train = np.sqrt(y_train+offset)
        y_valid = np.sqrt(y_valid+offset)
    else:
        raise ValueError("target_transform must be one of ['None','log','sqrt']")

    # (1) 0점 or 1~10점 classification
    cat_features = X_train.columns[X_train.dtypes=='object'].tolist()
    if len(cat_features)==0:
        print('Change cat_features as None')
        cat_features = None

    train_pool = Pool(X_train,y_train,cat_features=cat_features)
    valid_pool = Pool(X_valid,y_valid,cat_features=cat_features)

    if model_type=='cls':
        model = CatBoostClassifier(
            random_state=0,
            iterations=iterations,
            eval_metric='TotalF1',
            task_type='GPU',
            learning_rate=learning_rate,
            cat_features = cat_features,
        )
    elif model_type=='reg':
        model = CatBoostRegressor(
            random_state=0,
            iterations=iterations,
            eval_metric='RMSE',
            task_type='GPU',
            learning_rate=learning_rate,
            cat_features = cat_features,
        )
    else:
        raise ValueError("model_type must be 'cls' or 'reg'")

    model.fit(
        train_pool,
        eval_set=valid_pool,
        metric_period=int(iterations/10),
        early_stopping_rounds=early_stopping_rounds,
        use_best_model=True,
    )
    model_path = './catboost_info/model_{}_transform:{}_iterations:{}_lr:{}_es:{}_kf{}.cbm'\
                    .format(model_type,target_transform,iterations,learning_rate,str(early_stopping_rounds),kfold_iter)
    model.save_model(model_path)


--------------------------------------------------------------------------------
> kfold : 1/5
--------------------------------------------------------------------------------
Change cat_features as None
0:	learn: 3.8189823	test: 3.8193042	best: 3.8193042 (0)	total: 13.5ms	remaining: 4m 30s
2000:	learn: 3.4018814	test: 3.4594939	best: 3.4594939 (2000)	total: 15.6s	remaining: 2m 19s
4000:	learn: 3.3318578	test: 3.4372160	best: 3.4372160 (4000)	total: 31.2s	remaining: 2m 4s
6000:	learn: 3.2790418	test: 3.4267943	best: 3.4267943 (6000)	total: 46.9s	remaining: 1m 49s
8000:	learn: 3.2340911	test: 3.4208807	best: 3.4208807 (8000)	total: 1m 2s	remaining: 1m 33s
10000:	learn: 3.1949369	test: 3.4175478	best: 3.4175478 (10000)	total: 1m 18s	remaining: 1m 18s
12000:	learn: 3.1587874	test: 3.4158210	best: 3.4158210 (12000)	total: 1m 33s	remaining: 1m 2s
14000:	learn: 3.1252433	test: 3.4145180	best: 3.4145180 (14000)	total: 1m 49s	remaining: 46.7s
16000:	learn: 3.0942928	test: 3.4138002	best: 3.41

In [110]:
def convert_prediction(preds,model_type,verbose=False):
    if model_type=='reg':
        preds = [0  if p<0  else
                10 if p>10 else
                p for p in preds]
        #preds = [int(round(p,0)) for p in preds]
        if verbose:
            print('preds converted')
    return preds

In [111]:
X = train_df3.drop(target_feature,axis=1)
y = train_df3[target_feature]

# (1) 0점 or 1~10점 classification
cat_features = X.columns[X.dtypes=='object'].tolist()
if len(cat_features)==0:
    print('Change cat_features as None')
    cat_features = None

all_pool = Pool(X,y,cat_features=cat_features)

preds_list = []
for kfold_iter in range(1,n_splits+1):
    if model_type=='cls':
        model = CatBoostClassifier(
            random_state=0,
            iterations=iterations,
            eval_metric='TotalF1',
            task_type='GPU',
            learning_rate=0.3,
            cat_features = cat_features,
        )
    elif model_type=='reg':
        model = CatBoostRegressor(
            random_state=0,
            iterations=iterations,
            eval_metric='RMSE',
            task_type='GPU',
            learning_rate=0.3,
            cat_features = cat_features,
        )
    else:
        raise ValueError("model_type must be 'cls' or 'reg'")

    model_path = './catboost_info/model_{}_transform:{}_iterations:{}_lr:{}_es:{}_kf{}.cbm'\
                    .format(model_type,target_transform,iterations,learning_rate,str(early_stopping_rounds),kfold_iter)
    model.load_model(model_path)

    preds = model.predict(all_pool)
    preds = convert_prediction(preds,model_type,verbose=False)
    if target_transform=='None':
        pass
    elif target_transform=='log':
        preds = np.exp(preds)-offset
    elif target_transform=='sqrt':
        preds = (np.array(preds)**2)-offset
    else:
        raise ValueError("model_type must be 'cls' or 'reg'")

    rmse_score = np.sqrt(mean_squared_error(y,preds))
    print('Kfold={}, RMSE: {:.4f}'.format(kfold_iter,rmse_score))

    preds_list.append(preds)

Change cat_features as None
Kfold=1, RMSE: 3.1599
Kfold=2, RMSE: 3.1601
Kfold=3, RMSE: 3.1848
Kfold=4, RMSE: 3.1163
Kfold=5, RMSE: 3.1163


- lr: 0.03
    - None: 3.1928
    - Log: 4.1548
    - Sqrt: 3.5720
- lr: 0.01
    - None: 3.1260

In [112]:
preds = np.mean(preds_list,axis=0)
preds = convert_prediction(preds,model_type,verbose=True)

rmse_score = np.sqrt(mean_squared_error(y,preds))
print('Kfold={}, RMSE: {:.4f}'.format(kfold_iter,rmse_score))

preds converted
Kfold=5, RMSE: 3.1260


<br></br>

# Submit

In [113]:
test_pool = Pool(test_df3,cat_features=cat_features)

preds_list = []
for kfold_iter in range(1,n_splits+1):
    if model_type=='cls':
        model = CatBoostClassifier(
            random_state=0,
            iterations=iterations,
            eval_metric='TotalF1',
            task_type='GPU',
            learning_rate=0.3,
            cat_features = cat_features,
        )
    elif model_type=='reg':
        model = CatBoostRegressor(
            random_state=0,
            iterations=iterations,
            eval_metric='RMSE',
            task_type='GPU',
            learning_rate=0.3,
            cat_features = cat_features,
        )
    else:
        raise ValueError("model_type must be 'cls' or 'reg'")

    model_path = './catboost_info/model_{}_transform:{}_iterations:{}_lr:{}_es:{}_kf{}.cbm'\
                    .format(model_type,target_transform,iterations,learning_rate,str(early_stopping_rounds),kfold_iter)
    model.load_model(model_path)

    preds = model.predict(test_pool)
    preds = convert_prediction(preds,model_type,verbose=False)
    if target_transform=='None':
        pass
    elif target_transform=='log':
        preds = np.exp(preds)-offset
    elif target_transform=='sqrt':
        preds = (np.array(preds)**2)-offset
    else:
        raise ValueError("model_type must be 'cls' or 'reg'")

    preds_list.append(preds)

In [114]:
preds = np.mean(preds_list,axis=0)
preds = convert_prediction(preds,model_type)

In [115]:
submit = pd.read_csv('./data/sample_submission.csv')
submit['Book-Rating'] = preds

save_path = './out/submission_6_{}_transform-{}_iterations-{}_lr-{}_es-{}_kf{}.csv'\
                .format(model_type,target_transform,iterations,learning_rate,str(early_stopping_rounds),kfold_iter)
submit.to_csv(save_path,index=False)