# CatBoost test

In [1]:
file_name = 'catboost'

In [2]:
import pandas as pd
import numpy as np

import re
from tqdm import tqdm
import os
import shutil
from datetime import datetime

In [3]:
eda_path = '/opt/ml/EDA/'

In [4]:
now = datetime.now()
current = now.strftime('-%Y-%m-%d')

In [5]:
folder_name = eda_path + file_name + current

if not os.path.exists(folder_name):
    os.makedirs(folder_name)

print(folder_name)

/opt/ml/EDA/catboost-2023-04-24


In [7]:
# 필요한 파일 위의 폴더에 복사
copy_list = ['sample_submission.csv', 'test_ratings.csv', 'train_ratings.csv']

for file in copy_list:
    shutil.copy(eda_path + file, folder_name + '/' + file)

## 1. 데이터 불러오기

In [8]:
df_total = pd.read_csv('/opt/ml/EDA/EDA+PP-2023-04-20/total_final.csv')

In [9]:
df_total.dropna(inplace=True)

In [10]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271375 entries, 0 to 271374
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   user_id              271375 non-null  int64  
 1   age                  271375 non-null  float64
 2   city                 271375 non-null  object 
 3   state                271375 non-null  object 
 4   country              271375 non-null  object 
 5   isbn                 271375 non-null  object 
 6   rating               271375 non-null  int64  
 7   book_title           271375 non-null  object 
 8   book_author          271375 non-null  object 
 9   year_of_publication  271375 non-null  float64
 10  publisher            271375 non-null  object 
 11  img_url              271375 non-null  object 
 12  language             271375 non-null  object 
 13  category             271375 non-null  object 
 14  summary              271375 non-null  object 
 15  img_path         

In [11]:
seed=42

In [12]:
df_total.columns

Index(['user_id', 'age', 'city', 'state', 'country', 'isbn', 'rating',
       'book_title', 'book_author', 'year_of_publication', 'publisher',
       'img_url', 'language', 'category', 'summary', 'img_path'],
      dtype='object')

In [13]:
data_cat = df_total.drop(['book_title', 'img_url', 'summary', 'img_path'], axis=1)
data_cat

Unnamed: 0,user_id,age,city,state,country,isbn,rating,book_author,year_of_publication,publisher,language,category
0,8,33.0,timmins,ontario,canada,0002005018,4,richard bruce wright,2001.0,harperflamingo canada,en,actresses
1,8,33.0,timmins,ontario,canada,074322678X,4,ann beattie,2002.0,scribner,en,fiction
2,8,33.0,timmins,ontario,canada,1552041778,2,r j kaiser,1999.0,mira books,en,fiction
3,8,33.0,timmins,ontario,canada,1567407781,6,loren d estleman,1998.0,brilliance audio trade,en,fiction
4,8,33.0,timmins,ontario,canada,1575663937,6,robert hendrickson,1999.0,kensington publishing corp,en,nature
...,...,...,...,...,...,...,...,...,...,...,...,...
271370,278330,46.0,livonia,new york,usa,0898861411,3,virginia urrutia,1987.0,mountaineers books,en,science
271371,278376,54.0,danville,pennsylvania,usa,0060530103,7,doris lessing,2004.0,harpercollins,en,fiction
271372,278621,74.0,victoria,delaware,canada,1550390961,8,andrea pinto lebowitz,1999.0,sono nis press,en,authors canadian english 20th century biography
271373,278659,33.0,vancouver,washington,usa,0345330293,10,nevil shute,1981.0,ballantine books,en,fiction


In [14]:
data_cat.isnull().sum()

user_id                0
age                    0
city                   0
state                  0
country                0
isbn                   0
rating                 0
book_author            0
year_of_publication    0
publisher              0
language               0
category               0
dtype: int64

## 2. CatBoost 모델 생성 및 훈련

In [15]:
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [16]:
def rmse(real, predict):
  return np.sqrt(np.mean((real-predict) ** 2))
def mae(real, predict):
  return np.mean(np.abs(real-predict))

In [17]:
print(data_cat.shape)
data_cat.head(5)

(271375, 12)


Unnamed: 0,user_id,age,city,state,country,isbn,rating,book_author,year_of_publication,publisher,language,category
0,8,33.0,timmins,ontario,canada,0002005018,4,richard bruce wright,2001.0,harperflamingo canada,en,actresses
1,8,33.0,timmins,ontario,canada,074322678X,4,ann beattie,2002.0,scribner,en,fiction
2,8,33.0,timmins,ontario,canada,1552041778,2,r j kaiser,1999.0,mira books,en,fiction
3,8,33.0,timmins,ontario,canada,1567407781,6,loren d estleman,1998.0,brilliance audio trade,en,fiction
4,8,33.0,timmins,ontario,canada,1575663937,6,robert hendrickson,1999.0,kensington publishing corp,en,nature


In [18]:
# 'year_of_publication'을 문자로 변환해 범주형으로 취급
data_cat['year_of_publication'] = data_cat['year_of_publication'].astype('str')

In [19]:
data_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271375 entries, 0 to 271374
Data columns (total 12 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   user_id              271375 non-null  int64  
 1   age                  271375 non-null  float64
 2   city                 271375 non-null  object 
 3   state                271375 non-null  object 
 4   country              271375 non-null  object 
 5   isbn                 271375 non-null  object 
 6   rating               271375 non-null  int64  
 7   book_author          271375 non-null  object 
 8   year_of_publication  271375 non-null  object 
 9   publisher            271375 non-null  object 
 10  language             271375 non-null  object 
 11  category             271375 non-null  object 
dtypes: float64(1), int64(2), object(9)
memory usage: 24.8+ MB


In [20]:
X_train, X_valid, y_train, y_valid = train_test_split(data_cat.drop(['user_id', 'isbn', 'rating'], axis=1), 
                                                                      data_cat['rating'], 
                                                                      test_size=0.2, 
                                                                      shuffle=True, 
                                                                      random_state=seed)

In [21]:
cat_list = [x for x in X_train.columns.tolist() if x not in ['age']]
print(cat_list)

['city', 'state', 'country', 'book_author', 'year_of_publication', 'publisher', 'language', 'category']


In [26]:
params = {"iterations": 1000, 
          "learning_rate": 0.125, 
          "cat_features": cat_list, 
          "task_type": "GPU"
          }

In [27]:
catboost = CatBoostRegressor(**params)
catboost.fit(X_train, y_train)

0:	learn: 2.3949824	total: 15.6ms	remaining: 15.5s
1:	learn: 2.3784734	total: 35.9ms	remaining: 17.9s
2:	learn: 2.3642697	total: 51.1ms	remaining: 17s
3:	learn: 2.3529855	total: 66.2ms	remaining: 16.5s
4:	learn: 2.3438852	total: 81.6ms	remaining: 16.2s
5:	learn: 2.3352542	total: 96.5ms	remaining: 16s
6:	learn: 2.3251732	total: 113ms	remaining: 16s
7:	learn: 2.3184703	total: 132ms	remaining: 16.3s
8:	learn: 2.3109095	total: 148ms	remaining: 16.3s
9:	learn: 2.3063513	total: 176ms	remaining: 17.4s
10:	learn: 2.3022659	total: 191ms	remaining: 17.1s
11:	learn: 2.2966703	total: 209ms	remaining: 17.2s
12:	learn: 2.2938464	total: 227ms	remaining: 17.2s
13:	learn: 2.2897448	total: 247ms	remaining: 17.4s
14:	learn: 2.2862135	total: 263ms	remaining: 17.3s
15:	learn: 2.2839673	total: 278ms	remaining: 17.1s
16:	learn: 2.2813644	total: 298ms	remaining: 17.2s
17:	learn: 2.2792166	total: 318ms	remaining: 17.3s
18:	learn: 2.2778159	total: 335ms	remaining: 17.3s
19:	learn: 2.2756200	total: 350ms	remaini

<catboost.core.CatBoostRegressor at 0x7f9eb7a2a290>

In [28]:
y_pred = catboost.predict(X_valid)

In [29]:
print('RMSE : ', rmse(y_valid, y_pred))
print('MAE : ', mae(y_valid, y_pred))

RMSE :  2.1811969166239953
MAE :  1.6525329698386377


## 3. test data 준비

- X_train이나 X_valid 와 똑같은 형태로 수정

In [30]:
test_data = pd.read_csv('/opt/ml/EDA/sample_submission.csv')
test_data

Unnamed: 0,user_id,isbn,rating
0,11676,0002005018,0
1,116866,0002005018,0
2,152827,0060973129,0
3,157969,0374157065,0
4,67958,0399135782,0
...,...,...,...
76694,278543,1576734218,0
76695,278563,3492223710,0
76696,278633,1896095186,0
76697,278668,8408044079,0


In [31]:
user = pd.read_csv('/opt/ml/EDA/dpp_3-2023-04-20/users.csv')

In [32]:
def seperate_location(sen):
    loc_sen = sen.split(',')
    city = loc_sen[0].strip()
    state = loc_sen[1].strip()
    country = loc_sen[2].strip()
    return city, state, country

In [33]:
user[['city', 'state', 'country']] = user.apply(lambda x : seperate_location(x['location']), axis=1, result_type='expand')
user = user.drop('location', axis=1)
user

Unnamed: 0,user_id,age,city,state,country
0,8,33.0,timmins,ontario,canada
1,11400,49.0,ottawa,ontario,canada
2,67544,30.0,toronto,ontario,canada
3,85526,36.0,victoria,british columbia,canada
4,96054,29.0,ottawa,ontario,canada
...,...,...,...,...,...
55488,278330,46.0,livonia,new york,usa
55489,278376,54.0,danville,pennsylvania,usa
55490,278621,74.0,victoria,delaware,canada
55491,278659,33.0,vancouver,washington,usa


In [34]:
book = pd.read_csv('/opt/ml/EDA/dpp_3-2023-04-20/books.csv')
book.head(5)

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path
0,0002005018,clara callan,richard bruce wright,2001.0,harperflamingo canada,http://images.amazon.com/images/P/0002005018.0...,en,actresses,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg
1,074322678X,where youll find me and other stories,ann beattie,2002.0,scribner,http://images.amazon.com/images/P/074322678X.0...,en,fiction,now back in print ann beattie 39 s finest shor...,images/074322678X.01.THUMBZZZ.jpg
2,1552041778,jane doe,r j kaiser,1999.0,mira books,http://images.amazon.com/images/P/1552041778.0...,en,fiction,jane doe,images/1552041778.01.THUMBZZZ.jpg
3,1567407781,the witchfinder amos walker mystery series,loren d estleman,1998.0,brilliance audio trade,http://images.amazon.com/images/P/1567407781.0...,en,fiction,the witchfinder amos walker mystery series,images/1567407781.01.THUMBZZZ.jpg
4,1575663937,more cunning than man a social history of rats...,robert hendrickson,1999.0,kensington publishing corp,http://images.amazon.com/images/P/1575663937.0...,en,nature,this eye opening well researched examination o...,images/1575663937.01.THUMBZZZ.jpg


In [35]:
book.columns

Index(['isbn', 'book_title', 'book_author', 'year_of_publication', 'publisher',
       'img_url', 'language', 'category', 'summary', 'img_path'],
      dtype='object')

In [36]:
test = test_data.merge(book, on=['isbn'], how='left')
test = test.merge(user, on=['user_id'], how='left')
test

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path,age,city,state,country
0,11676,0002005018,0,clara callan,richard bruce wright,2001.0,harperflamingo canada,http://images.amazon.com/images/P/0002005018.0...,en,actresses,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,,,,
1,116866,0002005018,0,clara callan,richard bruce wright,2001.0,harperflamingo canada,http://images.amazon.com/images/P/0002005018.0...,en,actresses,in a small town in canada clara callan relucta...,images/0002005018.01.THUMBZZZ.jpg,45.0,ottawa,illinois,usa
2,152827,0060973129,0,decision in normandy,carlo deste,1991.0,harperperennial,http://images.amazon.com/images/P/0060973129.0...,en,1940 1949,here for the first time in paperback is an out...,images/0060973129.01.THUMBZZZ.jpg,40.0,ottawa,ontario,canada
3,157969,0374157065,0,flu the story of the great influenza pandemic ...,gina bari kolata,1999.0,farrar straus giroux,http://images.amazon.com/images/P/0374157065.0...,en,medical,describes the great flu epidemic of 1918 an ou...,images/0374157065.01.THUMBZZZ.jpg,30.0,denver,colorado,usa
4,67958,0399135782,0,the kitchen gods wife,amy tan,1991.0,putnam pub group,http://images.amazon.com/images/P/0399135782.0...,en,fiction,a chinese immigrant who is convinced she is dy...,images/0399135782.01.THUMBZZZ.jpg,39.0,idaho falls,idaho,usa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76694,278543,1576734218,0,,,,,,,,,,39.0,valenica,california,usa
76695,278563,3492223710,0,,,,,,,,,,37.0,wien,wien,austria
76696,278633,1896095186,0,,,,,,,,,,33.0,sandy,utah,usa
76697,278668,8408044079,0,,,,,,,,,,48.0,madrid,madrid,spain


In [37]:
# X_valid 형태, columns 확인
print(X_valid.columns)
X_valid.head(5)

Index(['age', 'city', 'state', 'country', 'book_author', 'year_of_publication',
       'publisher', 'language', 'category'],
      dtype='object')


Unnamed: 0,age,city,state,country,book_author,year_of_publication,publisher,language,category
251221,33.0,weymouth,massachusetts,usa,douglas adams,1987.0,simon schuster,en,fiction
220601,58.0,setauket,new york,usa,david baldacci,1997.0,warner books,en,fiction
188316,35.0,footscray,victoria,australia,vikram seth,1994.0,perennial,en,fiction
153699,40.0,santa clara,california,usa,sandra brown,1999.0,warner books,en,fiction
255146,33.0,christchurch,canterbury,nz,nancy holder,1999.0,simon spotlight entertainment,en,fiction


In [38]:
test = test[['age', 'city', 'state', 'country', 'book_author', 'year_of_publication', 'publisher', 'language', 'category']]
test

Unnamed: 0,age,city,state,country,book_author,year_of_publication,publisher,language,category
0,,,,,richard bruce wright,2001.0,harperflamingo canada,en,actresses
1,45.0,ottawa,illinois,usa,richard bruce wright,2001.0,harperflamingo canada,en,actresses
2,40.0,ottawa,ontario,canada,carlo deste,1991.0,harperperennial,en,1940 1949
3,30.0,denver,colorado,usa,gina bari kolata,1999.0,farrar straus giroux,en,medical
4,39.0,idaho falls,idaho,usa,amy tan,1991.0,putnam pub group,en,fiction
...,...,...,...,...,...,...,...,...,...
76694,39.0,valenica,california,usa,,,,,
76695,37.0,wien,wien,austria,,,,,
76696,33.0,sandy,utah,usa,,,,,
76697,48.0,madrid,madrid,spain,,,,,


In [39]:
test['age'] = test['age'].fillna(-1).astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['age'] = test['age'].fillna(-1).astype('int64')


In [43]:
test['year_of_publication'] = test['year_of_publication'].astype('str')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.replace(np.nan, '', regex=True, inplace=True)


In [45]:
test.dtypes

age                     int64
city                   object
state                  object
country                object
book_author            object
year_of_publication    object
publisher              object
language               object
category               object
dtype: object

In [46]:
test['age'].replace(-1, '', regex=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['age'].replace(-1, '', regex=True, inplace=True)


## 4. 예측

In [47]:
test

Unnamed: 0,age,city,state,country,book_author,year_of_publication,publisher,language,category
0,,,,,richard bruce wright,2001.0,harperflamingo canada,en,actresses
1,45,ottawa,illinois,usa,richard bruce wright,2001.0,harperflamingo canada,en,actresses
2,40,ottawa,ontario,canada,carlo deste,1991.0,harperperennial,en,1940 1949
3,30,denver,colorado,usa,gina bari kolata,1999.0,farrar straus giroux,en,medical
4,39,idaho falls,idaho,usa,amy tan,1991.0,putnam pub group,en,fiction
...,...,...,...,...,...,...,...,...,...
76694,39,valenica,california,usa,,,,,
76695,37,wien,wien,austria,,,,,
76696,33,sandy,utah,usa,,,,,
76697,48,madrid,madrid,spain,,,,,


In [48]:
y_test = catboost.predict(test)

In [49]:
y_test

array([5.4668595 , 6.58226246, 7.45474442, ..., 5.95614224, 3.53354327,
       5.5138891 ])

In [50]:
y_test.dtype

dtype('float64')

In [51]:
test_data['rating'] = y_test
test_data

Unnamed: 0,user_id,isbn,rating
0,11676,0002005018,5.466860
1,116866,0002005018,6.582262
2,152827,0060973129,7.454744
3,157969,0374157065,7.631299
4,67958,0399135782,7.747726
...,...,...,...
76694,278543,1576734218,5.967396
76695,278563,3492223710,7.031009
76696,278633,1896095186,5.956142
76697,278668,8408044079,3.533543


In [52]:
# 0 이하의 rating은 0으로, 10 이상의 rating은 10으로
for i in tqdm(range(len(test_data.index))):
    if test_data.iloc[i, 2] < 0:
        test_data.iloc[i, 2] = 0
    elif test_data.iloc[i, 2] > 10:
        test_data.iloc[i, 2] = 10

100%|██████████| 76699/76699 [00:03<00:00, 24360.43it/s]


In [53]:
test_data

Unnamed: 0,user_id,isbn,rating
0,11676,0002005018,5.466860
1,116866,0002005018,6.582262
2,152827,0060973129,7.454744
3,157969,0374157065,7.631299
4,67958,0399135782,7.747726
...,...,...,...
76694,278543,1576734218,5.967396
76695,278563,3492223710,7.031009
76696,278633,1896095186,5.956142
76697,278668,8408044079,3.533543


In [54]:
test_data.to_csv(folder_name + '/' + 'submission.csv', index=False)