In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
import os
import random

import torch

# Single seed shared by every stochastic step in the notebook.
seed = 42

# Export the seed as PYTHONHASHSEED.
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed Python's, NumPy's and PyTorch's (CPU and CUDA) generators with the same value.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(seed)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Both CSVs are read with the same dtype override for the publication year.
_read_options = {'dtype': {'year_of_publication': int}}
train = pd.read_csv('~/catboost_data_final/train1.csv', **_read_options)
test = pd.read_csv('~/catboost_data_final/test1.csv', **_read_options)

# Train stacked on top of test; the index lookups are built from this frame.
data = pd.concat([train, test])

In [None]:
# indexing

def age_map(x: int) -> int:
    """Bucket an age into one of six bands.

    The input is truncated with ``int()`` first. Ages below 20 map to 1,
    the twenties through the fifties map to 2-5, and 60 or above maps to 6.
    """
    decade = int(x) // 10
    # Clamp: everything under 20 collapses into band 1, everything from 60 up into band 6.
    return min(max(decade, 1), 6)

def _index_values(column):
    """Map each distinct value of `column` to its position in `column.unique()`."""
    return {value: position for position, value in enumerate(column.unique())}


# users
user2idx = _index_values(data['user_id'])
loc_country2idx = _index_values(data['location_country'])

# books
isbn2idx = _index_values(data['isbn'])
category2idx = _index_values(data['category'])
publisher2idx = _index_values(data['publisher'])
language2idx = _index_values(data['language'])
author2idx = _index_values(data['book_author'])

# inters
# Column -> lookup table used to replace raw values with integer ids.
_column_encoders = {
    'user_id': user2idx,
    'location_country': loc_country2idx,
    'isbn': isbn2idx,
    'category': category2idx,
    'publisher': publisher2idx,
    'language': language2idx,
    'book_author': author2idx,
}

# Train and test receive exactly the same encoding steps.
for _frame in (train, test):
    for _column, _lookup in _column_encoders.items():
        _frame[_column] = _frame[_column].map(_lookup)
    # Ages become 1-6 bands; publication years are forced to int.
    _frame['age'] = _frame['age'].apply(age_map)
    _frame['year_of_publication'] = _frame['year_of_publication'].apply(lambda year: int(year))

In [None]:
# Missing summaries become the literal placeholder 'none' in both splits.
for _frame in (train, test):
    _frame['summary'] = _frame['summary'].fillna('none')

In [None]:
# Show how many training rows carry each rating value.
train['rating'].value_counts()

In [None]:
# Preview the first rows of the encoded training frame.
train.head()

In [None]:
from sklearn.model_selection import train_test_split

# Features: everything except the target and the two free-text columns.
# (Alternative from the original cell: drop only 'rating' to keep the text columns.)
_features = train.drop(columns=['book_title', 'summary', 'rating'])
_target = train['rating']

# 80/20 shuffled hold-out split, reproducible through the shared seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    _features,
    _target,
    test_size=0.2,
    random_state=seed,
    shuffle=True,
)


In [None]:
# Rating distribution inside the training part of the split.
y_train.value_counts()

In [None]:
# Preview the feature frame that goes into the model.
X_train.head()

In [None]:
from imblearn.over_sampling import SMOTE, RandomOverSampler

# The columns handed to CatBoost as cat_features are integer *codes* (ids from
# the *2idx lookups and age_map bands). SMOTE creates rows by interpolating
# between neighbouring samples, which on such codes yields ids that do not
# belong to any real user, book or publisher. Duplicating existing
# minority-class rows balances the rating classes without inventing feature
# values. SMOTE stays imported in case other cells still reference it.
X_train_resampled, y_train_resampled = RandomOverSampler(random_state=seed).fit_resample(X_train, y_train)

In [None]:
# Rating distribution after oversampling.
y_train_resampled.value_counts()

In [None]:
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from catboost import CatBoostRegressor, Pool

# TPE sampler seeded with the notebook-wide seed; this one instance is passed
# to every optuna.create_study call in the notebook.
sampler = TPESampler(seed=seed)

def objective(trial):
    """Optuna objective for the structural CatBoost hyperparameters.

    Samples depth, l2_leaf_reg, random_strength, bootstrap_type and
    grow_policy (plus bagging_temperature or subsample, depending on the
    bootstrap type) from ``trial``; iterations, od_wait and learning_rate
    are fixed. A GPU CatBoostRegressor is fitted on the notebook-level
    X_train / y_train and evaluated on X_valid / y_valid.

    Returns:
        The best validation RMSE reported by the fitted model.
    """
    categorical_columns = [
        'user_id', 'isbn', 'age', 'location_country', 'book_author',
        'year_of_publication', 'publisher', 'language', 'category',
    ]

    # Fixed training budget and step size.
    param = dict(iterations=3000, od_wait=300, learning_rate=0.5)

    # Parameters are suggested in a fixed order: depth, l2_leaf_reg,
    # random_strength, bootstrap_type, grow_policy.
    param['depth'] = trial.suggest_int('depth', 6, 10)
    param['l2_leaf_reg'] = trial.suggest_float('l2_leaf_reg', 1e-5, 1e2, log=True)
    param['random_strength'] = trial.suggest_float('random_strength', 1, 20)
    param['bootstrap_type'] = trial.suggest_categorical("bootstrap_type", ['Bayesian', 'Poisson', 'No'])
    param['grow_policy'] = trial.suggest_categorical("grow_policy", ['SymmetricTree', 'Depthwise', 'Lossguide'])

    # Loss, metric, seeding and device settings shared by every trial.
    param.update(
        loss_function='RMSE',
        eval_metric='RMSE',
        random_seed=seed,
        use_best_model=True,
        task_type='GPU',
        devices='0:1',
    )

    # Each bootstrap type has its own extra knob; 'No' has none.
    chosen_bootstrap = param['bootstrap_type']
    if chosen_bootstrap == 'Bayesian':
        param['bagging_temperature'] = trial.suggest_float('bagging_temperature', 1e-2, 1e2, log=True)
    elif chosen_bootstrap == 'Poisson':
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1, log=True)

    train_pool = Pool(
        X_train,
        label=y_train,
        cat_features=categorical_columns,
        feature_names=list(X_train.columns),
        has_header=True,
    )

    fitted = CatBoostRegressor(**param).fit(
        train_pool,
        eval_set=(X_valid, y_valid),
        verbose=0,
    )

    return fitted.get_best_score()['validation']['RMSE']
    

In [None]:
# import wandb

# wandb.login()

In [None]:
# from optuna.integration.wandb import WeightsAndBiasesCallback

# wandb_kwargs = {"project": "optuna-wandb-test4"}
# wandbc = WeightsAndBiasesCallback(metric_name="rmse", wandb_kwargs=wandb_kwargs)

In [None]:
# Minimise the validation RMSE returned by `objective` over 50 trials.
optuna_cb_combine_text = optuna.create_study(direction='minimize', sampler=sampler)
optuna_cb_combine_text.optimize(objective, n_trials=50)

In [None]:
# NOTE(review): identical to the preceding cell. Running it rebinds
# optuna_cb_combine_text to a brand-new study (the earlier one is dropped) and
# spends another 50 trials — confirm this repetition is intentional.
optuna_cb_combine_text = optuna.create_study(direction='minimize', sampler=sampler)
optuna_cb_combine_text.optimize(objective, n_trials=50)

In [None]:
def objective_lr(trial):
    """Optuna objective that tunes only the training schedule.

    Samples iterations, od_wait and learning_rate from ``trial`` (in that
    order) while the tree-structure parameters are hard-coded. A GPU
    CatBoostRegressor is fitted on the notebook-level X_train / y_train and
    evaluated on X_valid / y_valid.

    Returns:
        The best validation RMSE reported by the fitted model.
    """
    categorical_columns = [
        'user_id', 'isbn', 'age', 'location_country', 'book_author',
        'year_of_publication', 'publisher', 'language', 'category',
    ]

    # Searched schedule parameters.
    param = {}
    param['iterations'] = trial.suggest_int('iterations', 500, 5000, step=500)
    param['od_wait'] = trial.suggest_int('od_wait', 100, 500, step=100)
    param['learning_rate'] = trial.suggest_float('learning_rate', 0.1, 1, log=True)

    # Hard-coded structure, loss, seeding and device settings.
    param.update(
        depth=6,
        l2_leaf_reg=0.0014114392860391826,
        random_strength=9.296474034508483,
        bootstrap_type='No',
        grow_policy='SymmetricTree',
        loss_function='RMSE',
        eval_metric='RMSE',
        random_seed=seed,
        use_best_model=True,
        task_type='GPU',
        devices='0:1',
    )

    train_pool = Pool(
        X_train,
        label=y_train,
        cat_features=categorical_columns,
        feature_names=list(X_train.columns),
        has_header=True,
    )

    fitted = CatBoostRegressor(**param).fit(
        train_pool,
        eval_set=(X_valid, y_valid),
        verbose=0,
    )

    return fitted.get_best_score()['validation']['RMSE']
    

In [None]:
# Minimise the validation RMSE returned by `objective_lr` over 50 trials,
# reusing the shared sampler instance.
optuna_lr = optuna.create_study(direction='minimize', sampler=sampler)
optuna_lr.optimize(objective_lr, n_trials=50)

In [None]:
# Print the best trial's parameters and objective value.
# NOTE(review): `optuna_cb` is not assigned in any cell visible here (the
# studies created above are `optuna_cb_combine_text` and `optuna_lr`) —
# confirm which study this is meant to inspect.
print(optuna_cb.best_trial.params)
print(optuna_cb.best_trial.value)

In [None]:
# Print the best trial's parameters and objective value.
# NOTE(review): `optuna_cb_combine` is not assigned in any cell visible here;
# possibly a leftover name for `optuna_cb_combine_text` — confirm.
print(optuna_cb_combine.best_trial.params)
print(optuna_cb_combine.best_trial.value)

In [None]:
# Bare dict literal (only displayed, never assigned). Its keys match the
# parameters sampled by `objective`; presumably a pasted best-trial result — TODO confirm.
{'depth': 6, 'l2_leaf_reg': 0.010097245877196693, 'random_strength': 18.5500328773567, 'bootstrap_type': 'No', 'grow_policy': 'SymmetricTree'}

In [None]:
# Bare dict literal (only displayed, never assigned). Its keys match the
# parameters sampled by `objective`; presumably a pasted best-trial result — TODO confirm.
{'depth': 6, 'l2_leaf_reg': 0.17898389848671595, 'random_strength': 4.239958350058539, 'bootstrap_type': 'No', 'grow_policy': 'SymmetricTree'}

In [None]:
# Columns CatBoost should treat as categorical.
_categorical_columns = [
    'user_id', 'isbn', 'age', 'location_country', 'book_author',
    'year_of_publication', 'publisher', 'language', 'category',
]

# Fixed configuration for the run on the oversampled training data.
param = dict(
    iterations=10000,
    od_wait=1000,
    learning_rate=0.3,
    depth=10,
    loss_function='RMSE',
    eval_metric='RMSE',
    random_seed=seed,
    use_best_model=True,
    task_type='GPU',
    devices='0:1',
)

# Training pool comes from the resampled frame; validation stays untouched.
pool = Pool(
    X_train_resampled,
    label=y_train_resampled,
    cat_features=_categorical_columns,
    feature_names=list(X_train.columns),
    has_header=True,
)

model = CatBoostRegressor(**param).fit(
    pool,
    eval_set=(X_valid, y_valid),
    verbose=1,
)

In [None]:
# Best metric values recorded for the model fitted in the previous cell.
model.get_best_score()

In [None]:
# Age-regression model: 'rating' is one of the categorical inputs and 'age'
# is not (it is the label).
# NOTE(review): X_train_age / y_train_age / X_valid_age / y_valid_age are not
# created in any cell visible here — confirm where they come from.
_age_categorical_columns = [
    'user_id', 'isbn', 'rating', 'location_country', 'book_author',
    'year_of_publication', 'publisher', 'language', 'category',
]

pool = Pool(
    X_train_age,
    label=y_train_age,
    cat_features=_age_categorical_columns,
    feature_names=list(X_train_age.columns),
    has_header=True,
)

_age_model_settings = dict(
    iterations=1000,
    learning_rate=0.1,
    depth=14,
    early_stopping_rounds=100,
    use_best_model=True,
    loss_function='RMSE',
    task_type="GPU",
    devices='0:1',
)
model = CatBoostRegressor(**_age_model_settings)

# Kept as the cell's last expression so the notebook displays the fit result.
model.fit(pool, eval_set=(X_valid_age, y_valid_age))

In [None]:
# Best metric values recorded for the age model fitted in the previous cell.
model.get_best_score()

In [None]:
# Rows with a missing age are the ones to impute. Only the target ('age') and
# the free-text columns are removed: the age model's Pool lists 'rating' among
# its cat_features, so the prediction frame has to keep that column as well.
test_age = train_age_whole[train_age_whole['age'].isnull()].drop(['age', 'book_title', 'summary'], axis = 1)

In [None]:
# Predict an age for every row of test_age with the current `model`.
predicted_age = model.predict(test_age)

In [None]:
# Write the predictions into the rows whose age is missing (same null mask
# that was used to build test_age).
train_age_whole.loc[train_age_whole['age'].isnull(), 'age'] = predicted_age

In [None]:
# Preview the frame after age imputation.
train_age_whole.head()

In [None]:
# Save rows with a non-zero rating as the training CSV.
# NOTE(review): this is the same path the notebook's data-loading cell reads
# from, so that cell depends on this one having been run in an earlier session.
train_age_whole[train_age_whole['rating'] != 0].to_csv('~/catboost_data_final/train1.csv', index=False)

In [None]:
# Save rows with rating == 0 as the test CSV.
# NOTE(review): this is the same path the notebook's data-loading cell reads
# from, so that cell depends on this one having been run in an earlier session.
train_age_whole[train_age_whole['rating'] == 0].to_csv('~/catboost_data_final/test1.csv', index=False)

In [None]:
# Print the best-score dict of every model in age_model_list.
# NOTE(review): age_model_list is only populated by the depth-sweep cell that
# follows this one, so on a fresh kernel this cell cannot run in notebook order.
for m in age_model_list:
    print(m.get_best_score())

In [None]:
from catboost import CatBoostRegressor, Pool

# Tree depths to try: 1 through 16.
depth = list(range(1, 17))

_age_sweep_categoricals = [
    'user_id', 'isbn', 'rating', 'location_country', 'book_author',
    'year_of_publication', 'publisher', 'language', 'category',
]

pool = Pool(
    X_train_age,
    label=y_train_age,
    cat_features=_age_sweep_categoricals,
    feature_names=list(X_train_age.columns),
    has_header=True,
)


def _fit_age_model(tree_depth):
    """Fit a 100-iteration GPU age regressor of the given depth on `pool`."""
    regressor = CatBoostRegressor(
        iterations=100,
        learning_rate=0.1,
        depth=tree_depth,
        loss_function='RMSE',
        task_type="GPU",
        devices='0:1',
    )
    regressor.fit(pool, eval_set=(X_valid_age, y_valid_age), verbose=False)
    return regressor


age_model_list = []

# `model` is rebound on every pass, so afterwards it is the deepest model.
for d in depth:
    model = _fit_age_model(d)
    age_model_list.append(model)

print('End!')

In [None]:
# Best metric values of `model`; after the sweep above that is the model
# fitted on the last depth in the list.
model.get_best_score()

In [None]:
# Rows with a missing age, minus the target and the free-text columns
# ('rating' is kept as a feature).
test_age = train_age_whole[train_age_whole['age'].isnull()].drop(['age', 'book_title', 'summary'], axis = 1)

In [None]:
# Predict an age for every row of test_age with the current `model`.
predicted_age = model.predict(test_age)

In [None]:
# Display the predicted ages.
predicted_age

In [None]:
# Display the 'age' entries of `train` that are still null.
train.loc[train['age'].isnull(), 'age']

In [None]:
# Load a second train/test pair (the '8' files from ~/catboost_data) with the
# same dtype override for the publication year.
temp1 = pd.read_csv('~/catboost_data/train8.csv', dtype={'year_of_publication': int})
temp2 = pd.read_csv('~/catboost_data/test8.csv', dtype={'year_of_publication': int})

In [None]:
# Copy the ages of the non-zero-rating rows into temp1.
# NOTE(review): assigning a Series aligns on index labels, not position; this
# is only correct if the filtered rows' index matches temp1's index — confirm
# how train_age_whole was built.
temp1['age'] = train_age_whole[train_age_whole['rating'] != 0]['age']

In [None]:
# Copy the ages of the rating == 0 rows into temp2.
# NOTE(review): assigning a Series aligns on index labels, not position; this
# is only correct if the filtered rows' index matches temp2's index — confirm
# how train_age_whole was built.
temp2['age'] = train_age_whole[train_age_whole['rating'] == 0]['age']

In [None]:
# Persist the two frames, now carrying the copied ages, without the index column.
temp1.to_csv('~/catboost_data_final/train2.csv', index=False)
temp2.to_csv('~/catboost_data_final/test2.csv', index=False)

In [None]:
# Replace each age in `train` with its age_map band.
# NOTE(review): the indexing cell earlier in the notebook already applies
# age_map to train['age']. Run in notebook order, this second pass receives
# bands 1-6, and age_map returns 1 for any value below 20, so every row ends up
# in band 1. Confirm this cell is only meant for a freshly loaded `train`.
train['age'] = train['age'].apply(age_map)

In [None]:
# Preview the training frame.
train.head()

In [None]:
# Re-declares the notebook seed (same value as the first configuration cell).
seed = 42

In [None]:
import random
import os
import torch

# Export the seed as PYTHONHASHSEED.
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed Python's, NumPy's and PyTorch's (CPU and CUDA) generators with the same value.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_rng(seed)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
from sklearn.model_selection import train_test_split

# Only the target is removed, so the text columns and 'language' stay in the features.
# (Alternative from the original cell: also drop 'book_title', 'summary' and 'language'.)
_features = train.drop(columns=['rating'])
_target = train['rating']

# 80/20 shuffled hold-out split, reproducible through the shared seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    _features,
    _target,
    test_size=0.2,
    random_state=seed,
    shuffle=True,
)


In [None]:
# from catboost import CatBoostRegressor, Pool

# depth = list(range(1, 17))

# pool = Pool(
#     data = X_train,
#     label = y_train,
#     cat_features = ['user_id', 'isbn', 'age', 'location_country', 'book_author', 'year_of_publication', 'publisher', 'language', 'category'],
#     # text_features = ['book_title', 'summary'],
#     feature_names = list(X_train.columns),
#     has_header = True
# )

# model = CatBoostRegressor(
#     iterations=1000,
#     learning_rate=0.1,
#     depth=6,
#     loss_function='RMSE',
#     task_type="GPU",
#     devices='0:1'
# )

# model.fit(
#     pool,
#     eval_set = (X_valid, y_valid),
#     verbose = False
# )

In [None]:
# model.get_best_score()

In [None]:
from catboost import CatBoostRegressor, Pool

# Tree depths to try: 1 through 16.
depth = list(range(1, 17))

# This sweep declares no text_features, and the prediction cell further down
# feeds the model a frame without 'book_title', 'summary' and 'language'. The
# active split, however, only removes 'rating', which would hand raw text to
# columns CatBoost treats as numeric. Strip those columns here (errors='ignore'
# keeps the cell working if the split already removed them) so training,
# validation and prediction all see the same feature set.
_non_model_columns = ['book_title', 'summary', 'language']
_X_train_tab = X_train.drop(columns=_non_model_columns, errors='ignore')
_X_valid_tab = X_valid.drop(columns=_non_model_columns, errors='ignore')

pool = Pool(
    data = _X_train_tab,
    label = y_train,
    cat_features = ['user_id', 'isbn', 'age', 'location_country', 'book_author', 'year_of_publication', 'publisher', 'category'],
    # text_features = ['book_title', 'summary'],
    feature_names = list(_X_train_tab.columns),
    has_header = True
)

model_list_original = []

# One 1000-iteration GPU model per depth; `model` is rebound on every pass, so
# after the loop it refers to the deepest model.
for d in depth:
    model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.1,
        depth=d,
        loss_function='RMSE',
        task_type="GPU",
        devices='0:1'
    )

    model.fit(
        pool,
        eval_set = (_X_valid_tab, y_valid),
        verbose = False
    )

    model_list_original.append(model)

print('End!')

In [None]:
# Print the best-score dict of every model from the depth sweep, in depth order.
for m in model_list_original:
    print(m.get_best_score())

In [None]:
# Load the rows to score and left-join user and book attributes onto them
# (the two image columns of `books` are dropped before the join).
# NOTE(review): `users` and `books` are not created in any cell visible here —
# confirm where they are loaded.
submission = pd.read_csv('~/data/test_ratings.csv')
submission = submission.merge(users, on='user_id', how='left').merge(books.drop(['img_url', 'img_path'], axis=1), on='isbn', how='left')
# Force the publication year to int.
submission['year_of_publication'] = submission['year_of_publication'].apply(lambda x: int(x))

In [None]:
# Preview the merged submission frame.
submission.head()

In [None]:
# Replace raw ids / categories with the integer ids from the lookups built on
# train + test.
_submission_encoders = {
    'user_id': user2idx,
    'location_country': loc_country2idx,
    'isbn': isbn2idx,
    'category': category2idx,
    'publisher': publisher2idx,
    'language': language2idx,
    'book_author': author2idx,
}
for _column, _lookup in _submission_encoders.items():
    submission[_column] = submission[_column].map(_lookup)

# train['age'] already holds age_map bands (1-6), so its mean is a *band*, not
# an age in years. Filling missing ages with it and then applying age_map would
# push every imputed row into band 1 (age_map returns 1 for anything below 20).
# Band the known ages first, then fill the gaps with the mean band directly.
_fallback_band = int(train['age'].mean())
submission['age'] = (
    submission['age']
    .map(age_map, na_action='ignore')  # leave NaN untouched while banding
    .fillna(_fallback_band)
    .astype(int)                       # NaNs forced a float dtype; restore ints
)

# Force the publication year to int.
submission['year_of_publication'] = submission['year_of_publication'].apply(lambda x: int(x))

In [None]:
# Predict ratings for the submission rows after removing 'book_title',
# 'summary', 'rating' and 'language' from the frame.
# NOTE(review): `model` is whatever was bound last; right after the depth sweep
# that is the model for the final depth in the list, not necessarily the best
# one — confirm which model should produce the submission.
predicted_ratings = model.predict(submission.drop(['book_title', 'summary', 'rating', 'language'], axis=1))

In [None]:
# Display the predicted ratings.
predicted_ratings

In [None]:
# Reload the original submission file and overwrite its 'rating' column with
# the predictions (assigned positionally, so row order must match).
final_submission = pd.read_csv('~/data/test_ratings.csv')
final_submission['rating'] = predicted_ratings

In [None]:
# Cap predictions at the top of the scale (10).
final_submission['rating'] = final_submission['rating'].clip(upper=10)

In [None]:
# Raise negative predictions to 0.
final_submission['rating'] = final_submission['rating'].clip(lower=0)

In [None]:
# Write the submission file to the working directory, without the index column.
final_submission.to_csv('submission2.csv', index=False)

In [None]:
from catboost import CatBoostRegressor, Pool

# Same categorical columns as the depth sweep, plus the two free-text columns
# handed to CatBoost as text features.
_text_model_categoricals = [
    'user_id', 'isbn', 'age', 'location_country', 'book_author',
    'year_of_publication', 'publisher', 'category',
]
_text_model_text_columns = ['book_title', 'summary']

pool = Pool(
    X_train,
    label=y_train,
    cat_features=_text_model_categoricals,
    text_features=_text_model_text_columns,
    feature_names=list(X_train.columns),
    has_header=True,
)

_text_model_settings = dict(
    iterations=2500,
    learning_rate=0.1,
    depth=8,
    loss_function='RMSE',
    task_type="GPU",
    devices='0:1',
)
model = CatBoostRegressor(**_text_model_settings)

# Kept as the cell's last expression so the notebook displays the fit result.
model.fit(pool, eval_set=(X_valid, y_valid))

In [None]:
# Best metric values recorded for the text-feature model.
model.get_best_score()

In [None]:
# Alias for the list of models trained with text features.
# NOTE(review): `model_list2` is not created in any cell visible here — confirm
# which cell produces it.
model_list_with_text_features = model_list2

In [None]:
# Collect both model lists (without and with text features) in one list.
models = [model_list_original, model_list_with_text_features, ]