# Setting

## Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import os
REF_PATH = '/content/drive/MyDrive/Github/12_도서평점예측'
os.chdir(REF_PATH)

Mounted at /content/drive


<br>

## Import Library

In [4]:
# !pip install -q catboost

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [5]:
import pandas as pd
import numpy as np
from tqdm import tqdm, trange

import matplotlib.pyplot as plt
import seaborn as sns

from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, f1_score

In [6]:
def setdiff(x,y):
    return list(set(x)-set(y))

<br></br>

# Data Load

In [7]:
#-----------------------------------------------------------------------------------------------#
# > 도서정보
#-----------------------------------------------------------------------------------------------#
# (1) ID : 샘플 고유 ID
#     - 모두 unique한 값들임
#     - train/test는 서로 다 다른 id들임
#     - 삭제해도 무관함
#
# (2) User-ID : 유저 고유 ID
#     - 한번씩만 평가한 유저도 있고, 11,143번이나 평가한 유저도 있음
#     - nunique : train(83,256건), test(21,909건)
#     - train에만있는대상(70,192건), test에만있는대상(8,845건)
#
# (3) Book-ID : 도서 고유 ID
#     - 한번씩만 평가된 책이 있고, 2,502번이나 평가된 책이 있음
#     - nunique : train(243,441건), test(62,333건)
#     - train에만있는대상(207,723건), test에만있는대상(26,615건)
#
# - (참조) https://dacon.io/competitions/official/236093/talkboard/408269?page=1&dtype=recent
# > Book-ID가 다르더라도 책의 세부 정보가 동일한 경우가 있습니다. 이러한 상황은 다음과 같은 경우에서 발생할 수 있습니다.
# 1. 다양한 출판사 및 발행국가 : 동일한 책이 여러 출판사에 의해 출간되거나, 다른 국가 혹은 지역에서 출간될 경우 서로 다른 Book-ID를 가질 수 있습니다.
# 2. 다양한 에디션 및 인쇄: 책의 개정판, 갱신판, 확장판 등이 발행될 때마다 새로운 Book-ID가 부여될 수 있습니다. 또한, 책의 여러 인쇄에서도 서로 다른 Book-ID가 사용될 수 있습니다.
# 3. 다양한 포맷: 동일한 제목의 책이 하드커버, 종이책, 오디오북, 전자책 등 다양한 형태로 출간될 경우, 각 포맷마다 고유한 Book-ID를 가지게 됩니다.
#
#-----------------------------------------------------------------------------------------------#
# > 유저정보
#-----------------------------------------------------------------------------------------------#
# (4) Age : 유저의 나이
#     - (최소,최대) = train(0,244), test(0,237)으로 데이터가 이상함
#     - train에서 0살이 495건(0.06%), 100살 초과가 2,573건(0.30%)으로, 총 3,068건(0.35%) 있음.
#     - 이것들을 어떻게 처리할지? (테스트에도 있어서 obs를 제거 할 수 없음.)
#     - user_id별로 age의 nunique는 1개씩임 (unique하지 않으면 채워넣으려고 했었는데..)
#
# (5) Location : 유저의 지역
#     - 유저지역이 1개인 곳도 있고, 12,267개인 곳도 있음
#     - nunique : train(20,971건), test(8,581건)
#     - train에만있는대상(13,897건), test에만있는대상(1,507건)
#
#-----------------------------------------------------------------------------------------------#
# > 도서정보
#-----------------------------------------------------------------------------------------------#
# (6) Book-Title : 도서명
#     - 도서명이 1개인 것도 있고, 2,502개인 것도 있음
#     - nunique : train(217,829건), test(59,408건)
#     - train에만있는대상(181,636건), test에만있는대상(23,215건)
#
# (7) Book-Author : 도서 저자
#     - 도서저자가 1개인 것도 있고, 8,467개인 것도 있음
#     - nunique : train(92,635건), test(32,605건)
#     - train에만있는대상(68,980건), test에만있는대상(8,950건) 
#
# (8) Year-Of-Publication : 도서 출판 년도 (-1일 경우 결측 혹은 알 수 없음)
#     - 동일한 도서명인데도 출판년도가 다른게 있음 (개정본 등의 이유로 보임)
#     - nunique : train(110건), test(82건)
#     - (최소,최대) = train(1376,2021), test(1909,2021)
#     - -1인경우가 train에서 11,515(1.32%), test에서 2,425(1.52%)
#     - train에서 -1을 제외하고 연도별 평균 평점을 확인해봤을 때, 연도별로 딱히 관련성이 없어보임
#       -> 제거해도 괜찮을듯?
#       tmp = train_df[train_df['year_of_publication']!=-1]
#       tmp.groupby('year_of_publication')['book_rating'].mean().sort_index()
#
# (9) Publisher : 출판사
#     - 출판사가 1개인 것도 있고, 29,696개인 것도 있음
#     - nunique : train(15,505건), test(6,584건)
#     - train에만있는대상(10,123건), test에만있는대상(1,202건) 

#-----------------------------------------------------------------------------------------------#
# > 타겟정보
#-----------------------------------------------------------------------------------------------#
# (10) Book-Rating : 유저가 도서에 부여한 평점 (0점 ~ 10점)
#     - 0점은 inplict infomation(암시적 정보?)
#     - Rating이 0인 경우는 해당 유저가 특정 책에 관심이 없고, 관련이 없는 경우로 보고 0점도 예측 할 수 있도록 개발필요
#     - (참조) : https://dacon.io/competitions/official/236093/talkboard/408231?page=1&dtype=recent

In [8]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')
train_df.columns = [col.replace('-','_').lower() for col in train_df.columns]
test_df.columns  = [col.replace('-','_').lower() for col in test_df.columns]

In [9]:
print('> 건수 : Train({:,}건), Test({:,}건)'.format(len(train_df),len(test_df)))
print('> Head of Data')
train_df.head()

> 건수 : Train(871,393건), Test(159,621건)
> Head of Data


Unnamed: 0,id,user_id,book_id,book_rating,age,location,book_title,book_author,year_of_publication,publisher
0,TRAIN_000000,USER_00000,BOOK_044368,8,23.0,"sackville, new brunswick, canada",Road Taken,Rona Jaffe,2001.0,Mira
1,TRAIN_000001,USER_00000,BOOK_081205,8,23.0,"sackville, new brunswick, canada",Macbeth (New Penguin Shakespeare),William Shakespeare,1981.0,Penguin Books
2,TRAIN_000002,USER_00000,BOOK_086781,0,23.0,"sackville, new brunswick, canada",Waverley (Penguin English Library),Walter Scott,1981.0,Penguin Books
3,TRAIN_000003,USER_00000,BOOK_098622,0,23.0,"sackville, new brunswick, canada",Mother Earth Father Sky,Sue Harrison,1991.0,Avon
4,TRAIN_000004,USER_00000,BOOK_180810,8,23.0,"sackville, new brunswick, canada",She Who Remembers,Linda Lay Shuler,1989.0,Signet Book


In [10]:
# 아이디어
# (1) 책의 카테고리를 찾기

<br></br>

# Data Preprocess

## Convert string feature to numeric infomation

In [11]:
def string_to_numeric_infomation(
    data,string_columns,
    numeric_columns=None,agg=['mean','min','max'],
    delete_string_columns=False,
):
    if isinstance(string_columns,str):
        string_columns = [string_columns]
    if isinstance(numeric_columns,str):
        numeric_columns = [numeric_columns]
    
    new_data = data.copy()
    for str_col in tqdm(string_columns):
        cnt_data = new_data[str_col].value_counts().reset_index()\
            .rename(columns={'index':str_col,str_col:f'{str_col}_cnt'})
        rank_data = new_data[str_col].value_counts().rank(ascending=False).reset_index()\
            .rename(columns={'index':str_col,str_col:f'{str_col}_rank'})
        new_data = new_data\
            .merge(cnt_data,how='left',on=str_col)\
            .merge(rank_data,how='left',on=str_col)
        if numeric_columns is not None:
            for num_col in numeric_columns:
                for _agg in agg:
                    mean_data = new_data.groupby(str_col)[num_col].agg(_agg).reset_index()\
                        .rename(columns={num_col:f'{str_col}_{_agg}_{num_col}'})
                    new_data = new_data\
                        .merge(mean_data,how='left',on=str_col)
        if delete_string_columns:
            new_data.drop(str_col,axis=1,inplace=True)
        
    return new_data

In [12]:
train_df2 = train_df.copy()
test_df2  = test_df .copy()

# (1) feature들의 type 정리
unuse_features = ['id','year_of_publication']
cat_features   = ['user_id','book_id','location','book_title','book_author','publisher']
num_features   = ['age']
target_feature = 'book_rating'

# (2) 필요없는 컬럼 제거
train_df2.drop(columns=unuse_features,inplace=True)
test_df2 .drop(columns=unuse_features,inplace=True)

# (3) categorical feature들을 numeric feature로 변환 (category가 너무 많아서 그대로 사용하기 힘듦)
train_df2 = string_to_numeric_infomation(
    data=train_df2,
    string_columns=cat_features,
    numeric_columns=num_features,
    delete_string_columns=True,
)
test_df2 = string_to_numeric_infomation(
    data=test_df2,
    string_columns=cat_features,
    numeric_columns=num_features,
    delete_string_columns=True,
)

# (4) 모두 하나의 값을 가지는 컬럼 제거

100%|██████████| 6/6 [00:22<00:00,  3.76s/it]
100%|██████████| 6/6 [00:03<00:00,  1.72it/s]


In [13]:
print(train_df2.shape)
train_df2.head()

(871393, 32)


Unnamed: 0,book_rating,age,user_id_cnt,user_id_rank,user_id_mean_age,user_id_min_age,user_id_max_age,book_id_cnt,book_id_rank,book_id_mean_age,...,book_author_cnt,book_author_rank,book_author_mean_age,book_author_min_age,book_author_max_age,publisher_cnt,publisher_rank,publisher_mean_age,publisher_min_age,publisher_max_age
0,8,23.0,8,11785.0,23.0,23.0,23.0,14,9016.5,40.428571,...,106,1167.5,38.933962,18.0,116.0,6510,22.0,37.939324,0.0,204.0
1,8,23.0,8,11785.0,23.0,23.0,23.0,6,24991.5,30.166667,...,1414,41.0,31.685997,0.0,148.0,14299,10.0,35.642492,0.0,201.0
2,0,23.0,8,11785.0,23.0,23.0,23.0,2,86552.0,85.5,...,24,4753.0,37.541667,21.0,148.0,14299,10.0,35.642492,0.0,201.0
3,0,23.0,8,11785.0,23.0,23.0,23.0,23,4823.0,34.782609,...,79,1586.0,36.278481,17.0,116.0,14797,9.0,37.4926,0.0,239.0
4,8,23.0,8,11785.0,23.0,23.0,23.0,53,1530.5,39.301887,...,110,1126.5,38.636364,21.0,68.0,16018,8.0,36.905232,0.0,239.0


<br>

## Delete equal columns

In [14]:
def delete_equal_columns(data):
    new_data = data.copy()
    
    try_iter = 0
    while True:
        try_iter+=1
        columns = new_data.columns
        equal_column_list = []
        
        pbar = trange(new_data.shape[1])
        for i in pbar:
            pbar.set_description('Try({}) '.format(try_iter))
            for j in range(new_data.shape[1]):
                if i>j:
                    col_i, col_j = columns[i], columns[j]

                    if new_data[col_i].nunique() == new_data[col_j].nunique():
                        ct = pd.crosstab(new_data[col_i],new_data[col_j])
                        if np.diag(ct).sum() == len(new_data):
                            equal_column_list.append([col_i,col_j])
                    else:
                        pass

        if len(equal_column_list)>0:
            delete_columns = pd.unique(np.array(equal_column_list)[:,-1])
            new_data = new_data.drop(columns=delete_columns)
            print('Delete: {}\n'.format(delete_columns))
        else:
            break
    
    return new_data

In [15]:
train_df3 = delete_equal_columns(train_df2)
feature_names = [col for col in train_df3.columns if col!=target_feature]

test_df3  = test_df2.copy()
test_df3  = test_df2[feature_names]

Try(1) : 100%|██████████| 32/32 [00:11<00:00,  2.73it/s]


Delete: ['age' 'user_id_mean_age' 'user_id_min_age']



Try(2) : 100%|██████████| 29/29 [00:08<00:00,  3.42it/s]


In [16]:
print(train_df2.shape,train_df3.shape)

(871393, 32) (871393, 29)


<br>

## Train validation split

In [17]:
X = train_df3.drop(target_feature,axis=1)
y = train_df3[target_feature]

X_train, X_valid, y_train, y_valid = train_test_split(
    X,y,test_size=0.2,shuffle=True,random_state=0,stratify=y)

In [18]:
X_train.head()

Unnamed: 0,user_id_cnt,user_id_rank,user_id_max_age,book_id_cnt,book_id_rank,book_id_mean_age,book_id_min_age,book_id_max_age,location_cnt,location_rank,...,book_author_cnt,book_author_rank,book_author_mean_age,book_author_min_age,book_author_max_age,publisher_cnt,publisher_rank,publisher_mean_age,publisher_min_age,publisher_max_age
201577,607,164.5,37.0,36,2661.0,35.444444,25.0,47.0,1381,112.0,...,52,2371.0,35.884615,22.0,50.0,1603,107.0,35.860886,1.0,104.0
208858,5,16766.5,35.0,2,86552.0,39.0,35.0,43.0,8,6272.0,...,2,38804.5,39.0,35.0,43.0,10,2858.0,38.1,25.0,52.0
289544,350,329.0,34.0,2502,1.0,34.368106,0.0,128.0,3797,25.0,...,2502,14.0,34.368106,0.0,128.0,2502,67.0,34.368106,0.0,128.0
63765,8,11785.0,35.0,212,136.0,39.254717,1.0,201.0,308,515.0,...,701,136.0,39.045649,1.0,201.0,3780,43.0,37.784921,0.0,239.0
649014,65,2096.0,35.0,16,7647.5,40.6875,23.0,148.0,65,1796.5,...,85,1479.0,35.905882,2.0,148.0,14299,10.0,35.642492,0.0,201.0


<br></br>

# Modeling

<br>

## Classification
- 0 or others(1~10)

In [21]:
# len(setdiff(X_valid.location.unique(),X_train.location.unique()))

In [37]:
cls_iterations = 5000

In [38]:
# (1) 0점 or 1~10점 classification
y_train_cls = np.where(y_train==0,0,1)
y_valid_cls = np.where(y_valid==0,0,1)

cat_features = X_train.columns[X_train.dtypes=='object'].tolist()
if len(cat_features)==0:
    print('Change cat_features as None')
    cat_features = None

train_pool = Pool(X_train,y_train_cls,cat_features=cat_features)
valid_pool = Pool(X_valid,y_valid_cls,cat_features=cat_features)

cls_model = CatBoostClassifier(
    random_state=0,
    iterations=cls_iterations,
    eval_metric='F1',
    task_type='GPU',
    learning_rate=0.3,
    cat_features = cat_features,
)

Change cat_features as None


In [39]:
cls_model.fit(
    train_pool,
    eval_set=valid_pool,
    metric_period=500,
    early_stopping_rounds=None,
    use_best_model=True
)
cls_model.save_model('./catboost_info/cls_model_iterations{}.cbm'.format(cls_iterations))

0:	learn: 0.4587440	test: 0.4570622	best: 0.4570622 (0)	total: 19.5ms	remaining: 1m 37s
500:	learn: 0.5721458	test: 0.5503571	best: 0.5503571 (500)	total: 6.14s	remaining: 55.1s
1000:	learn: 0.5991629	test: 0.5612045	best: 0.5612045 (1000)	total: 11.9s	remaining: 47.6s
1500:	learn: 0.6172431	test: 0.5660492	best: 0.5660492 (1500)	total: 17.7s	remaining: 41.2s
2000:	learn: 0.6324680	test: 0.5689897	best: 0.5689897 (2000)	total: 23.5s	remaining: 35.2s
2500:	learn: 0.6450457	test: 0.5693732	best: 0.5693732 (2500)	total: 29.3s	remaining: 29.2s
3000:	learn: 0.6567019	test: 0.5710484	best: 0.5710484 (3000)	total: 35.1s	remaining: 23.4s
3500:	learn: 0.6670532	test: 0.5720870	best: 0.5720870 (3500)	total: 40.7s	remaining: 17.4s
4000:	learn: 0.6762188	test: 0.5725738	best: 0.5725738 (4000)	total: 46.5s	remaining: 11.6s
4500:	learn: 0.6850193	test: 0.5731857	best: 0.5731857 (4500)	total: 52.2s	remaining: 5.79s
4999:	learn: 0.6936667	test: 0.5726423	best: 0.5731857 (4500)	total: 57.9s	remaining: 

In [40]:
cls_model.load_model('./catboost_info/cls_model_iterations{}.cbm'.format(cls_iterations))

cls_preds = cls_model.predict(valid_pool)
print('f1 score: {:.4f}'.format(f1_score(y_valid_cls,cls_preds)))
pd.crosstab(y_valid_cls,cls_preds)

f1 score: 0.5732


col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,89365,20396
1,30406,34112


<br>

## Regression
- 0: prediction<0
- 10: prediction>10
- else: prediction

In [54]:
def convert_prediction(preds):
    preds = [0  if p<0  else
            10 if p>10 else
            p for p in preds]
    #preds = [int(round(p,0)) for p in preds]
    return preds

In [55]:
reg_iterations = 5000

In [48]:
X_train_reg = X_train[y_train!=0]
y_train_reg = y_train[y_train!=0]

X_valid_reg = X_valid[y_valid!=0]
y_valid_reg = y_valid[y_valid!=0]

cat_features = X_train.columns[X_train.dtypes=='object'].tolist()
if len(cat_features)==0:
    print('Change cat_features as None')
    cat_features = None

train_pool = Pool(X_train_reg,y_train_reg,cat_features=cat_features)
valid_pool = Pool(X_valid_reg,y_valid_reg,cat_features=cat_features)

reg_model = CatBoostRegressor(
    random_state=0,
    iterations=cls_iterations,
    eval_metric='RMSE',
    task_type='GPU',
    learning_rate=0.3,
    cat_features = cat_features,
)

Change cat_features as None


In [49]:
reg_model.fit(
    train_pool,
    eval_set=valid_pool,
    metric_period=500,
    early_stopping_rounds=None,
    use_best_model=True
)
reg_model.save_model('./catboost_info/reg_model_iterations{}.cbm'.format(reg_iterations))

0:	learn: 1.8309357	test: 1.8308088	best: 1.8308088 (0)	total: 7.53ms	remaining: 37.6s
500:	learn: 1.6540448	test: 1.7025093	best: 1.7025093 (500)	total: 2.02s	remaining: 18.2s
1000:	learn: 1.6025156	test: 1.6928680	best: 1.6928680 (1000)	total: 4s	remaining: 16s
1500:	learn: 1.5610195	test: 1.6882657	best: 1.6882657 (1500)	total: 5.87s	remaining: 13.7s
2000:	learn: 1.5260118	test: 1.6870536	best: 1.6870536 (2000)	total: 7.77s	remaining: 11.6s
2500:	learn: 1.4942821	test: 1.6877559	best: 1.6870536 (2000)	total: 9.69s	remaining: 9.69s
3000:	learn: 1.4648787	test: 1.6880815	best: 1.6870536 (2000)	total: 11.6s	remaining: 7.73s
3500:	learn: 1.4378969	test: 1.6884143	best: 1.6870536 (2000)	total: 13.5s	remaining: 5.77s
4000:	learn: 1.4130827	test: 1.6896853	best: 1.6870536 (2000)	total: 15.3s	remaining: 3.82s
4500:	learn: 1.3894423	test: 1.6914613	best: 1.6870536 (2000)	total: 17.2s	remaining: 1.9s
4999:	learn: 1.3666962	test: 1.6923425	best: 1.6870536 (2000)	total: 19s	remaining: 0us
bestT

In [56]:
reg_model.load_model('./catboost_info/reg_model_iterations{}.cbm'.format(reg_iterations))

reg_preds = reg_model.predict(valid_pool)
reg_preds = convert_prediction(reg_preds)
rmse_score = np.sqrt(mean_squared_error(y_valid_reg,reg_preds))
print('rmse score: {:.4f}'.format(rmse_score))

rmse score: 1.6869


<br></br>

# Submit

In [63]:
cls_preds = cls_model.predict(test_df3)

reg_preds = reg_model.predict(test_df3)
reg_preds = convert_prediction(reg_preds)

final_preds = [0 if cls_preds[i]==0 else reg_preds[i] for i in range(len(test_df3))]


In [67]:
submit = pd.read_csv('./data/sample_submission.csv')
submit['Book-Rating'] = final_preds
submit.to_csv('./out/submission_mix_model_1.csv',index=False)