#[Dacon] Book Recommendation Algorithm AI Competition
- Ranked in the Top 7% (47/1215)
- RMSE 3.405
- Preprocessing
    - Filling missing values in publication year, Author... etc.
    - standardizing typos and abbreviations.
    - Correcting outliers in the 'Age' column (e.g., 200 years old), and handling age ranges as 'Age-Band'.
- Boosting Linear Regression, Ridge Regression and Least Angle Regression using PyCaret.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install pycaret
!pip install dataprep

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-3.0.0-py3-none-any.whl (481 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.8/481.8 kB[0m [31m27.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyod>=1.0.8 (from pycaret)
  Downloading pyod-1.0.9.tar.gz (149 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.0/150.0 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting category-encoders>=2.4.0 (from pycaret)
  Downloading category_encoders-2.6.0-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting importlib-metadata>=4.12.0 (from pycaret)
  Downloading importlib_metadata-6.6.0-py3-none-any.whl (22 kB)
Collecting deprecation>=2.1.0 (from pycaret)
  Downloading deprecation-2.1.0-py2.py3-none-an

In [None]:
!unzip -qq "/content/drive/MyDrive/데이콘_도서추천" -d "/content/"

# Import libraries

In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import warnings
warnings.filterwarnings("ignore")
from pycaret.regression import *
from dataprep.eda import create_report

In [None]:
# Custom colors
class clr:
    S = '\033[1m' + '\033[94m'
    E = '\033[0m'

In [None]:
train_df = pd.read_csv("train.csv")
train_df

Unnamed: 0,ID,User-ID,Book-ID,Book-Rating,Age,Location,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,TRAIN_000000,USER_00000,BOOK_044368,8,23.0,"sackville, new brunswick, canada",Road Taken,Rona Jaffe,2001.0,Mira
1,TRAIN_000001,USER_00000,BOOK_081205,8,23.0,"sackville, new brunswick, canada",Macbeth (New Penguin Shakespeare),William Shakespeare,1981.0,Penguin Books
2,TRAIN_000002,USER_00000,BOOK_086781,0,23.0,"sackville, new brunswick, canada",Waverley (Penguin English Library),Walter Scott,1981.0,Penguin Books
3,TRAIN_000003,USER_00000,BOOK_098622,0,23.0,"sackville, new brunswick, canada",Mother Earth Father Sky,Sue Harrison,1991.0,Avon
4,TRAIN_000004,USER_00000,BOOK_180810,8,23.0,"sackville, new brunswick, canada",She Who Remembers,Linda Lay Shuler,1989.0,Signet Book
...,...,...,...,...,...,...,...,...,...,...
871388,TRAIN_871388,USER_92096,BOOK_081138,0,34.0,"minneapolis, minnesota, usa",Healing Words: The Power of Prayer and the Pra...,Larry Dossey,1993.0,Harpercollins
871389,TRAIN_871389,USER_92097,BOOK_258124,0,35.0,"temple, texas, usa",The Salmon of Doubt: Hitchhiking the Galaxy On...,DOUGLAS ADAMS,2002.0,Harmony
871390,TRAIN_871390,USER_92098,BOOK_071848,0,45.0,"ottawa, ontario, canada",Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,2000.0,Thorndike Press
871391,TRAIN_871391,USER_92099,BOOK_252599,8,43.0,"maple grove, minnesota, usa",Heartbreak Hill: Anatomy of a Ryder Cup,Tim Rosaforte,1996.0,St Martins Pr


In [None]:
np.sum(train_df.isna(), axis=0)

ID                     0
User-ID                0
Book-ID                0
Book-Rating            0
Age                    0
Location               0
Book-Title             0
Book-Author            0
Year-Of-Publication    0
Publisher              0
dtype: int64

In [None]:
print(train_df.columns)
for col in train_df.select_dtypes('O').columns[1:]:
    print(col, len(train_df[col].unique()))
    display(train_df[col].value_counts())
    print()

Index(['ID', 'User-ID', 'Book-ID', 'Book-Rating', 'Age', 'Location',
       'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher'],
      dtype='object')
User-ID 83256


USER_56601    11143
USER_54845     6456
USER_52453     5814
USER_73501     4290
USER_03411     3453
              ...  
USER_39404        1
USER_39405        1
USER_39406        1
USER_39408        1
USER_92100        1
Name: User-ID, Length: 83256, dtype: int64


Book-ID 243441


BOOK_097861    2502
BOOK_050555     883
BOOK_114855     768
BOOK_206174     732
BOOK_084346     723
               ... 
BOOK_161877       1
BOOK_139239       1
BOOK_138283       1
BOOK_133370       1
BOOK_130798       1
Name: Book-ID, Length: 243441, dtype: int64


Location 20971


toronto, ontario, canada                 12267
n/a, n/a, n/a                            11161
chicago, illinois, usa                    7214
seattle, washington, usa                  6967
ottawa, ontario, canada                   6915
                                         ...  
qom, n/a, iran                               1
warlingham, england, united kingdom          1
pfäffikon, nebraska, switzerland             1
puerto vallarta, mexico, usa                 1
castiglion fiorentino, toscana, italy        1
Name: Location, Length: 20971, dtype: int64


Book-Title 217829


Wild Animus                                                                        2502
The Da Vinci Code                                                                   895
The Nanny Diaries: A Novel                                                          828
The Lovely Bones: A Novel                                                           768
A Painted House                                                                     761
                                                                                   ... 
Das Gesetz der Liebe. Inkl. CD.                                                       1
E- Mail an alle.                                                                      1
Enc/Brown/Saves/#7 (Encyclopedia Brown (Paperback))                                   1
Sitting Pretty                                                                        1
Creme De Colorado Cookbook (Celebrating Twenty Five Years of Culinary Artistry)       1
Name: Book-Title, Length: 217829


Book-Author 92635


Stephen King           8467
Nora Roberts           6934
John Grisham           5283
James Patterson        5020
Mary Higgins Clark     3983
                       ... 
Cyrinda Foxe-Tyler        1
Matt Bloom                1
Cassie Kendall            1
Birney, M.D. Dibble       1
Stephen Griffith          1
Name: Book-Author, Length: 92635, dtype: int64


Publisher 15505


Ballantine Books                          29696
Pocket                                    27212
Berkley Publishing Group                  23647
Harlequin                                 21362
Warner Books                              21263
                                          ...  
Editorial y Distribuidora Leo                 1
Schoffling                                    1
Family Values Publishing, Incorporated        1
Slope Books                                   1
Sterling Publishing Co                        1
Name: Publisher, Length: 15505, dtype: int64




In [None]:
set(train_df.columns) - set(train_df.select_dtypes('O').columns)

{'Age', 'Book-Rating', 'Year-Of-Publication'}

In [None]:
print(train_df['Age'].dtype) # 왜 float이지? nan 값이 있나 ?
print(train_df['Book-Rating'].dtype)
print(train_df['Year-Of-Publication'].dtype)

float64
int64
float64


In [None]:
test_df = pd.read_csv('test.csv')
test_df

Unnamed: 0,ID,User-ID,Book-ID,Age,Location,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,TEST_000000,USER_00008,BOOK_047966,37.0,"vermilion, ohio, usa",Birds of Prey: A Novel of Suspense,J.A. Jance,2002.0,Avon
1,TEST_000001,USER_00008,BOOK_119494,37.0,"vermilion, ohio, usa",Midnight Voices,JOHN SAUL,2003.0,Ballantine Books
2,TEST_000002,USER_00008,BOOK_151775,37.0,"vermilion, ohio, usa",Breaking Free : A Prescription for Personal an...,David M. Noer,1996.0,Jossey-Bass
3,TEST_000003,USER_00008,BOOK_176255,37.0,"vermilion, ohio, usa",Bitter Harvest,Ann Rule,1999.0,Pocket
4,TEST_000004,USER_00008,BOOK_187307,37.0,"vermilion, ohio, usa",Embraced by the Light,Betty J. Eadie,1994.0,Bantam Books
...,...,...,...,...,...,...,...,...,...
159616,TEST_159616,USER_92086,BOOK_159050,0.0,"mountain view, california, usa",The Fat Flush Plan,Ann Louise Gittleman,2001.0,McGraw-Hill/Contemporary Books
159617,TEST_159617,USER_92086,BOOK_196481,0.0,"mountain view, california, usa",We Die Alone: A WWII Epic of Escape and Endurance,David Howarth,1999.0,The Lyons Press
159618,TEST_159618,USER_92086,BOOK_199754,0.0,"mountain view, california, usa",From 60 Yards In : How to Master Golf's Short ...,Raymond Floyd,1992.0,Perennial
159619,TEST_159619,USER_92086,BOOK_227481,0.0,"mountain view, california, usa",Solo: On Her Own Adventure,Susan Fox Rogers,1996.0,Seal Press (WA)


In [None]:
train_df[~train_df['Age'].apply(lambda x: x.is_integer())]

Unnamed: 0,ID,User-ID,Book-ID,Book-Rating,Age,Location,Book-Title,Book-Author,Year-Of-Publication,Publisher


In [None]:
train_df[~train_df['Year-Of-Publication'].apply(lambda x: x.is_integer())]

Unnamed: 0,ID,User-ID,Book-ID,Book-Rating,Age,Location,Book-Title,Book-Author,Year-Of-Publication,Publisher


In [None]:
for col in test_df.columns[1:]:
    print(clr.S + col + clr.E)
    print(set(train_df[col].unique()) - set(test_df[col].unique()))
    print(set(test_df[col].unique()) - set(train_df[col].unique()))
    print('='*50)


[1m[94mUser-ID[0m
{'USER_59560', 'USER_38110', 'USER_75080', 'USER_74396', 'USER_41621', 'USER_32604', 'USER_86534', 'USER_89024', 'USER_22042', 'USER_65705', 'USER_01438', 'USER_22589', 'USER_79478', 'USER_06718', 'USER_70892', 'USER_36141', 'USER_37288', 'USER_38707', 'USER_52816', 'USER_78734', 'USER_41538', 'USER_28766', 'USER_73753', 'USER_18871', 'USER_88358', 'USER_45350', 'USER_41224', 'USER_86892', 'USER_12298', 'USER_08653', 'USER_04976', 'USER_54078', 'USER_67003', 'USER_31208', 'USER_35386', 'USER_75212', 'USER_23525', 'USER_06774', 'USER_84580', 'USER_11835', 'USER_64179', 'USER_13304', 'USER_84905', 'USER_74896', 'USER_63102', 'USER_49995', 'USER_59466', 'USER_66220', 'USER_32323', 'USER_08166', 'USER_27127', 'USER_85089', 'USER_73101', 'USER_35373', 'USER_54784', 'USER_42664', 'USER_58567', 'USER_20455', 'USER_27510', 'USER_09225', 'USER_23026', 'USER_27931', 'USER_39993', 'USER_89846', 'USER_20830', 'USER_17402', 'USER_03383', 'USER_44682', 'USER_59130', 'USER_40332'

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



{'BOOK_105719', 'BOOK_163681', 'BOOK_049176', 'BOOK_077047', 'BOOK_188294', 'BOOK_055666', 'BOOK_032085', 'BOOK_155710', 'BOOK_189832', 'BOOK_051389', 'BOOK_068238', 'BOOK_121872', 'BOOK_063487', 'BOOK_107342', 'BOOK_099804', 'BOOK_086766', 'BOOK_027706', 'BOOK_106420', 'BOOK_078556', 'BOOK_246133', 'BOOK_169717', 'BOOK_022549', 'BOOK_264132', 'BOOK_236112', 'BOOK_110249', 'BOOK_001075', 'BOOK_106237', 'BOOK_086045', 'BOOK_131451', 'BOOK_156168', 'BOOK_091460', 'BOOK_125372', 'BOOK_228122', 'BOOK_043792', 'BOOK_070543', 'BOOK_091171', 'BOOK_030225', 'BOOK_232044', 'BOOK_155595', 'BOOK_207678', 'BOOK_156247', 'BOOK_125604', 'BOOK_239964', 'BOOK_067676', 'BOOK_092332', 'BOOK_064956', 'BOOK_029846', 'BOOK_230174', 'BOOK_233138', 'BOOK_118323', 'BOOK_097309', 'BOOK_069487', 'BOOK_174641', 'BOOK_240715', 'BOOK_177273', 'BOOK_032648', 'BOOK_063257', 'BOOK_101352', 'BOOK_052148', 'BOOK_179853', 'BOOK_166195', 'BOOK_128414', 'BOOK_246608', 'BOOK_256964', 'BOOK_164528', 'BOOK_127776', 'BOOK_103

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



{'Mariano Bueno', 'Luis M. Laita', 'Degen Pener', 'M.L. Vennum', 'Nicholas Dodman', 'Ian Cochrane', 'Ana Maria Perez Martinez', 'Andrea Sutcliffe', 'Michael E. Bell', 'Albert Croll Baugh', 'Rebecca Stott', 'Belle Lin', "Annmarie O'Grady", 'Chester L. Krause', 'Sheila Norgate', 'Elizabeth Prentiss', 'Ludovico Ariosto', 'Barbara Ballinger Buchholz', 'Milton, MD Hammerly', 'Lowell "Bud" Paxson', 'Rollin O. Glaser', 'Anne Rudeen', 'Dorit Bader Whiteman', 'Daniel B. Botkin', 'Dao Strom', 'Donald W. Boyd', 'Harry Hodge', 'Suzanne Ebel', 'Dorinne Armstrong', 'Padraic O Conaire', 'Josh Pons', 'V. Wayne Klemin', 'Frederick Ramsay', 'Beth Jones', 'Robert A. Fazzi', 'Undra E. Biggs', 'Jr. O. B. Hardison', 'Herman Buchman', 'C.J. Wyckoff', 'Leavitt', 'Jack Grapes', 'H. B. Cresswell', 'Andrea Ghisotti', 'James G, Sr. Fraser', 'Fawn McKay Brodie', 'Judith Halliburton', 'Michael Goulding', 'Garcia', 'Virginia Baxter', 'Gary Garrels', 'Lauren Turner', 'Duvert', 'Eleanor Sullivan', 'J. MacDonald', 'Jos

### AGE
100살 이상... 심지어 200살 이상도 있다..  
100이상은 이상치라고 판단하고 임의로 앞자리 떼서 band로 묶어서 처리  
### Location
전처리 후  value_counts하고 다시 알아보자
### Year-Of-Publication
출판년도도 band로 묶어서 처리
### Publisher
출판사가 평점에 영향을 줄까? 시각화로 확인해보기
### Book-ID, User-ID
제외할지 말지 고민해보기. Book-ID를 제외한다면 출판사 포함.
### Book-Title
쪼개서 분석해보는것도 재밌을 듯

In [None]:
test_df[~test_df['Age'].apply(lambda x: x.is_integer())]

Unnamed: 0,ID,User-ID,Book-ID,Age,Location,Book-Title,Book-Author,Year-Of-Publication,Publisher


In [None]:
test_df[~test_df['Year-Of-Publication'].apply(lambda x: x.is_integer())]

Unnamed: 0,ID,User-ID,Book-ID,Age,Location,Book-Title,Book-Author,Year-Of-Publication,Publisher


age나 year는 int형이어야 할 것 같은데 float64형태인 것으로 보아 결측값이 있을지도...?  
-> 없으니까 그냥 int로 변경

In [None]:
train_df['Age'] = train_df['Age'].astype(int)
train_df['Year-Of-Publication'] = train_df['Year-Of-Publication'].astype(int)
test_df['Age'] = test_df['Age'].astype(int)
test_df['Year-Of-Publication'] = test_df['Year-Of-Publication'].astype(int)

print(len(train_df['Age'].unique()))
print(len(train_df['Year-Of-Publication'].unique()))
print(len(test_df['Age'].unique()))
print(len(test_df['Year-Of-Publication'].unique()))

137
110
108
82


In [None]:
test_df['Age'].value_counts()

35     45940
33      5513
29      5181
36      4378
32      4227
       ...  
84         1
237        1
95         1
156        1
99         1
Name: Age, Length: 108, dtype: int64

In [None]:
test_df['Year-Of-Publication'].value_counts()

2002    14610
2001    12201
2003    11792
1999    11008
2000    10577
        ...  
1948        1
2011        1
1909        1
2020        1
1930        1
Name: Year-Of-Publication, Length: 82, dtype: int64

In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 871393 entries, 0 to 871392
Data columns (total 10 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ID                   871393 non-null  object
 1   User-ID              871393 non-null  object
 2   Book-ID              871393 non-null  object
 3   Book-Rating          871393 non-null  int64 
 4   Age                  871393 non-null  int64 
 5   Location             871393 non-null  object
 6   Book-Title           871393 non-null  object
 7   Book-Author          871393 non-null  object
 8   Year-Of-Publication  871393 non-null  int64 
 9   Publisher            871393 non-null  object
dtypes: int64(3), object(7)
memory usage: 66.5+ MB


### Age를 Age-Band로 전처리
0세가 평점을 매길 수 있을까? ...  
6세미만은 그 외로 처리  


In [None]:
train_df[train_df['Age'] < 10]['Age'].value_counts()

9    1307
0     495
8     448
1     361
2     278
4     250
3     128
7     108
5     101
6      12
Name: Age, dtype: int64

In [None]:
def age_to_ageband(x):
    if x > 110:
        x = int(str(x)[1:])
    if x < 6:
        return "others"
    elif x < 10:
        return "children"
    elif x < 20:
        return "10's"
    elif x < 30:
        return "20's"
    elif x < 40:
        return "30's"
    elif x < 50:
        return "40's"
    elif x < 60:
        return "50's"
    elif x < 70:
        return "60's"
    elif x >= 70:
        return "upper70"
train_df['Age-Band'] = train_df['Age'].apply(lambda x: age_to_ageband(x))
train_df['Age-Band'].value_counts()

30's        442314
20's        158769
40's        130835
50's         77568
10's         27359
60's         26288
upper70       4372
others        1993
children      1895
Name: Age-Band, dtype: int64

In [None]:
test_df['Age-Band'] = test_df['Age'].apply(lambda x: age_to_ageband(x))
test_df['Age-Band'].value_counts()

30's        80305
20's        30070
40's        22246
50's        15619
10's         5297
60's         4070
children      891
upper70       697
others        426
Name: Age-Band, dtype: int64

### Year-Of-Publication
-1 값 처리 어떻게 할지?  
1. 일일히 찾기
2. 그냥 결측치로 두기

In [None]:
train_df['Year-Of-Publication'].describe()

count    871393.000000
mean       1968.903339
std         228.077752
min          -1.000000
25%        1991.000000
50%        1997.000000
75%        2001.000000
max        2021.000000
Name: Year-Of-Publication, dtype: float64

In [None]:
train_df[train_df['Year-Of-Publication'] < 1900]['Year-Of-Publication'].value_counts()

-1       11515
 1378        1
 1806        1
 1376        1
 1897        1
Name: Year-Of-Publication, dtype: int64

In [None]:
 len(train_df[train_df['Year-Of-Publication']==-1][['Book-Title', 'Book-Author', 'Publisher']].drop_duplicates())

4056

In [None]:
for k, _ in train_df[(train_df['Year-Of-Publication'] == -1)][['Book-Title', 'Publisher']].value_counts().items():
    print(f'Book "{k[0]}" at "{k[1]}"')

같은 책이여도 출판사가 다르면 출판년도가 다를 수 있음을 간과함 ㅠㅠ  
찾은거 그냥 쓸지?  
다시 찾을지  
-> 일단 그냥 결측치로 두고 band 처리

In [None]:
train_df['Year-Of-Publication'].value_counts()

2002    77173
2001    67566
1999    64175
2000    61749
2003    60744
        ...  
2008        1
1909        1
1904        1
1378        1
1910        1
Name: Year-Of-Publication, Length: 110, dtype: int64

In [None]:
train_df[(train_df['Year-Of-Publication'] != -1) & (train_df['Year-Of-Publication'] < 1980)]['Year-Of-Publication'].value_counts().sum()

29337

In [None]:
train_df[(train_df['Year-Of-Publication'] >= 1980) & (train_df['Year-Of-Publication'] < 1990)]['Year-Of-Publication'].value_counts().sum()

123469

In [None]:
train_df[(train_df['Year-Of-Publication'] >= 2010)]['Year-Of-Publication'].value_counts().sum()

71

In [None]:
def year_to_yearband(df):
    df.loc[(df['Year-Of-Publication'] < 1980), 'Year-Of-Publication_Band'] = "<1980s"
    df.loc[(df['Year-Of-Publication'] >= 1980) & (df['Year-Of-Publication'] < 1990), 'Year-Of-Publication_Band'] = "1980s"
    df.loc[(df['Year-Of-Publication'] >= 1990) & (df['Year-Of-Publication'] < 2000), 'Year-Of-Publication_Band'] = "1990s"
    df.loc[(df['Year-Of-Publication'] >= 2000), 'Year-Of-Publication_Band'] = ">=2000s"
    df.loc[(df['Year-Of-Publication'] == -1), 'Year-Of-Publication_Band'] = 'no_data'
    return df
train_df = year_to_yearband(train_df)
test_df = year_to_yearband(test_df)
test_df['Year-Of-Publication_Band'].value_counts()

1990s      76793
>=2000s    53486
1980s      21623
<1980s      5294
no_data     2425
Name: Year-Of-Publication_Band, dtype: int64

In [None]:
train_df['Year-Of-Publication_Band'].value_counts()

1990s      417970
>=2000s    289102
1980s      123469
<1980s      29337
no_data     11515
Name: Year-Of-Publication_Band, dtype: int64

In [None]:
train_df.columns

Index(['ID', 'User-ID', 'Book-ID', 'Book-Rating', 'Age', 'Location',
       'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Age-Band', 'Year-Of-Publication_Band'],
      dtype='object')

In [None]:
pd.get_dummies(train_df[['Book-Rating','Book-Title', 'Book-Author', 'Publisher', 'Age-Band', 'Year-Of-Publication_Band']], columns=['Book-Author','Year-Of-Publication_Band', 'Age-Band', 'Publisher'])

In [None]:
train_df['Location'].str.split(',', expand=True).loc[:, 6].value_counts()

 south korea    1
Name: 6, dtype: int64

In [None]:
train_df['Location'].str.split(',', expand=True).loc[:, 5].value_counts()

 302      1
 china    1
Name: 5, dtype: int64

In [None]:
train_df['Location'].str.split(',', expand=True).loc[:, 4].value_counts()

 usa                     263
 canada                    9
 spain                     1
 626-1                     1
 japan                     1
xueyuan rd. hexi dist      1
 italy                     1
Name: 4, dtype: int64

In [None]:
train_df['Location'].str.split(',', expand=True).loc[:, 3].value_counts()

 usa                              1048
 united kingdom                    975
 georgia                           241
 spain                             121
 japan                              67
 portugal                           67
 canada                             63
                                    56
 malaysia                           36
 australia                          23
 kansas                             21
 channel islands                    15
 south korea                        14
 tobago                             12
 italy                              11
 mexico                              8
 ireland                             8
 iceland                             8
 new zealand                         8
 british columbia                    7
 france                              7
 austria                             5
 cape verde                          5
 philippines                         4
 switzerland                         4
 germany                 

In [None]:
import re

In [None]:
x = 'asdf, 123-9, ss'
''.join(re.findall('[a-zA-Z,]', x)).split(',')

['asdf', '', 'ss']

In [None]:
def location_split(x):
    location = ''.join(re.findall('[a-zA-Z, ]', x))
    location = location.split(',')
    i = 0
    d = {0: 'city', 1: 'state', 2:'nation'}
    res = dict()
    for l in location:
        if l != '':
            res[d[i]] = l.strip()
            i += 1
        if i == 2:
            break
    res['nation'] = location[-1]
    return ','.join(list(res.values()))

In [None]:
train_df['Location'].apply(lambda x: location_split(x)).str.split(',', expand=True).value_counts()

0            1                   2              
toronto      ontario              canada            12267
na           na                   na                11161
st louis     missouri             usa                9792
chicago      illinois             usa                7214
seattle      washington           usa                6967
                                                    ...  
lexpark      md                   usa                   1
leyden       holland              netherlands           1
leytonstone  na                   united kingdom        1
lhne         nordrheinwestfalen   germany               1
zwolle       overijssel           netherlands           1
Length: 20660, dtype: int64

In [None]:
train_df = pd.concat([train_df, train_df['Location'].apply(lambda x: location_split(x)).str.split(',', expand=True).rename(columns={0:'City', 1:'State', 2:'Nation'})], axis=1)
test_df = pd.concat([test_df, test_df['Location'].apply(lambda x: location_split(x)).str.split(',', expand=True).rename(columns={0:'City', 1:'State', 2:'Nation'})], axis=1)

In [None]:
print(list(train_df['Nation'].unique()))

[' canada', ' usa', ' germany', ' spain', ' australia', ' united kingdom', '', ' italy', ' netherlands', ' portugal', ' sweden', ' austria', ' france', ' finland', ' new zealand', ' singapore', ' switzerland', ' philippines', ' espaa', ' malaysia', ' belgium', ' japan', ' denmark', ' belize', ' england', ' poland', ' ireland', ' haiti', ' trinidad and tobago', ' deutschland', ' tanzania', ' moldova', ' bulgaria', ' luxembourg', ' south africa', ' czech republic', ' israel', ' malta', ' petrolwar nation', ' kuwait', ' peru', ' ecuador', ' south korea', ' turkey', ' nigeria', ' brazil', ' hungary', ' indonesia', ' kazakhstan', ' india', ' china', ' egypt', ' croatia', ' taiwan', ' argentina', ' chile', ' venezuela', ' galiza', ' iceland', ' scotland', ' bolivia', ' lithuania', None, ' norway', ' mexico', ' bahamas', ' united sates', ' romania', ' iran', ' dominican republic', ' maricopa', ' the world tomorrow', ' andorra', ' galiza neghra', ' jersey', ' ghana', ' richmond country', ' uni

In [None]:
print(list(test_df['Nation'].unique()))

[' usa', ' united kingdom', ' canada', ' italy', ' portugal', '', ' austria', ' germany', ' australia', ' singapore', ' switzerland', ' malaysia', ' new zealand', ' belize', ' egypt', ' finland', ' england', ' spain', ' haiti', ' france', ' bulgaria', ' israel', ' belgium', ' luxembourg', ' netherlands', ' poland', ' turkey', ' hong kong', ' brazil', ' lithuania', ' sweden', ' denmark', ' mexico', ' romania', ' kuwait', ' south korea', ' japan', ' india', ' philippines', ' jersey', ' taiwan', ' laos', ' ghana', ' tunisia', ' norway', ' iran', ' greece', ' lleida', ' ireland', ' la suisse', ' la chine ternelle ', ' framingham', ' south africa', ' puerto rico', ' grenada', ' antarctica', ' china', ' argentina', ' shelby', ' samoa', ' zimbabwe', ' united states', ' chile', ' san franicsco', ' morocco', ' venezuela', ' wales', ' russia', ' here and there', ' malta', ' la france', ' hungary', ' ukraine', ' united states of america', ' monaco', ' czech republic', ' costa rica', ' nepal', ' u

In [None]:
train_df['Nation'].value_counts()[:50]

 usa                   630062
 canada                 78571
 united kingdom         27050
 germany                23471
                        21007
 australia              15270
 spain                  12515
 na                     11178
 france                  8962
 portugal                5974
 new zealand             4676
 malaysia                4551
 netherlands             4031
 switzerland             3437
 italy                   2889
 austria                 2307
 iran                    1268
 romania                 1058
 finland                 1057
 singapore               1049
 dominican republic       902
 brazil                   761
 sweden                   711
 ireland                  697
 philippines              525
 japan                    520
 belgium                  442
 china                    415
 norway                   362
 denmark                  300
 us                       294
 hong kong                277
 poland                   264
 mexico   

In [None]:
train_df['Nation'] = train_df['Nation'].str.strip()
test_df['Nation'] = test_df['Nation'].str.strip()

In [None]:
test_df['Nation'].value_counts()[:50]

usa                   116400
canada                 14418
united kingdom          6007
germany                 4187
                        3249
australia               2957
spain                   2442
france                  1686
portugal                 998
netherlands              945
new zealand              898
switzerland              752
italy                    583
malaysia                 534
austria                  501
iran                     382
japan                    255
belgium                  198
china                    196
philippines              165
brazil                   164
ireland                  156
finland                  134
qatar                    133
romania                  110
singapore                 97
denmark                   94
poland                    88
taiwan                    86
mexico                    68
sweden                    61
argentina                 59
kuwait                    43
england                   33
israel        

In [None]:
def clean_nation_values(train_df):
    train_df.loc[(train_df['Nation']=='united state') | (train_df['Nation']=='united states') | (train_df['Nation']=='united staes') | (train_df['Nation']=='us')| (train_df['Nation']=='ysa')| (train_df['Nation']=='us of a')| (train_df['Nation']=='america'), 'Nation'] = 'usa'
    train_df.loc[(train_df['Nation']=='united kingdom') | (train_df['Nation']=='england')| (train_df['Nation']=='scotland'), 'Nation'] = 'uk'
    train_df.loc[(train_df['Nation']=='deutschland'), 'Nation'] = 'espaa'
    train_df.loc[(train_df['Nation']=='') | (train_df['Nation']=='na') | (train_df['Nation']=='na  on the road')| (train_df['Nation']=='the world tomorrow')| (train_df['Nation']=='x')| (train_df['Nation']=='c')| (train_df['Nation']=='far away')| (train_df['Nation']=='everywhere and anywhere'), 'Nation'] = 'n/a'
    train_df.loc[(train_df['Nation']=='la chine ternelle'), 'Nation'] = 'china'
    train_df.loc[(train_df['Nation']=='nz'), 'Nation'] = 'new zealand'
    train_df.loc[(train_df['Nation']=='litalia'), 'Nation'] = 'italy'
    train_df.loc[(train_df['Nation']=='cananda'), 'Nation'] = 'canada'
    train_df.loc[(train_df['Nation']=='phillipines'), 'Nation'] = 'philippines'
    train_df.loc[(train_df['Nation']=='catalunya'), 'Nation'] = 'catalonia'
    train_df.loc[(train_df['Nation']=='la france'), 'Nation'] = 'france'
    train_df.loc[(train_df['Nation']=='jersey')|(train_df['Nation']=='alderney'), 'Nation'] = 'uk'
    train_df.loc[(train_df['Nation']=='catalonia'), 'Nation'] = 'spain'

    train_df.loc[(train_df['Nation']=='los estados unidos de norte america') | (train_df['Nation']=='collin'), 'Nation'] = 'usa'
    train_df.loc[train_df['Nation'].isin(['shelby', 'worcester', 'hernando', 'ventura county', 'prince william', 'san mateo', 'aroostook', 'rutherford', 'fort bend', 'madrid', 'alachua', 'burlington']), 'Nation'] = 'usa'

    return train_df
train_df = clean_nation_values(train_df)
train_df['Nation'].value_counts()[:50]

usa                   630627
canada                 78572
n/a                    32570
uk                     27147
germany                23471
australia              15270
spain                  12551
france                  8965
portugal                5974
new zealand             4691
malaysia                4551
netherlands             4031
switzerland             3437
italy                   2892
austria                 2307
iran                    1268
romania                 1058
finland                 1057
singapore               1049
dominican republic       902
brazil                   761
sweden                   711
ireland                  697
philippines              600
japan                    520
belgium                  442
china                    429
norway                   362
denmark                  300
hong kong                277
poland                   264
mexico                   238
argentina                205
qatar                    202
south africa  

In [None]:
def not_nation(x):
    if x in [None, "espaa","unknown", "petrolwar nation", "galiza", "galiza neghra", "united sates", "maricopa", "andorra", "richmond country", "universe", "lleida", "framingham", "grenada", "usa currently living in england", "saint loius", "tdzimi", "space", "here and there", "csa", "onondaga nation", "were global", "disgruntled states of america", "bergued", "ferrara", "the great white north", "united kindgonm", "caribbean sea", "kcb", "stthomasi", "catalunya spain", "la belgique", "the", "quit", "orense", "hungary and usa", "eeuu", "van wert", "camden"]:
        return 'n/a'
    else:
        return x
train_df['Nation'] = train_df['Nation'].apply(lambda x: not_nation(x))

In [None]:
print(train_df['Nation'].unique())

['canada' 'usa' 'germany' 'spain' 'australia' 'uk' 'n/a' 'italy'
 'netherlands' 'portugal' 'sweden' 'austria' 'france' 'finland'
 'new zealand' 'singapore' 'switzerland' 'philippines' 'malaysia'
 'belgium' 'japan' 'denmark' 'belize' 'poland' 'ireland' 'haiti'
 'trinidad and tobago' 'tanzania' 'moldova' 'bulgaria' 'luxembourg'
 'south africa' 'czech republic' 'israel' 'malta' 'kuwait' 'peru'
 'ecuador' 'south korea' 'turkey' 'nigeria' 'brazil' 'hungary' 'indonesia'
 'kazakhstan' 'india' 'china' 'egypt' 'croatia' 'taiwan' 'argentina'
 'chile' 'venezuela' 'iceland' 'bolivia' 'lithuania' 'norway' 'mexico'
 'bahamas' 'romania' 'iran' 'dominican republic' 'ghana'
 'united arab emirates' 'tunisia' 'papua new guinea' 'costa rica'
 'vietnam' 'sri lanka' 'slovenia' 'slovakia' 'greece' 'puerto rico'
 'cameroon' 'laos' 'burma' 'thailand' 'hong kong' 'ethiopia' 'zambia'
 'russia' 'antarctica' 'polk' 'uae' 'cambodia' 'zimbabwe' 'cherokee'
 'monaco' 'morgan' 'jamaica' 'cape verde' 'pakistan' 'mongoli

In [None]:
others_list =  train_df['Nation'].value_counts()[50:].index
def convert_to_others(x):
    if x in others_list:
        return 'others'
    else:
        return x
train_df['Nation'] = train_df['Nation'].apply(lambda x: convert_to_others(x))
train_df['Nation'].value_counts()

usa                   630627
canada                 78572
n/a                    32570
uk                     27147
germany                23471
australia              15270
spain                  12551
france                  8965
portugal                5974
new zealand             4691
malaysia                4551
netherlands             4031
switzerland             3437
italy                   2892
austria                 2307
iran                    1268
others                  1178
romania                 1058
finland                 1057
singapore               1049
dominican republic       902
brazil                   761
sweden                   711
ireland                  697
philippines              600
japan                    520
belgium                  442
china                    429
norway                   362
denmark                  300
hong kong                277
poland                   264
mexico                   238
argentina                205
qatar         

In [None]:
train_df.to_csv('add_columns_book_0509_train.csv', index=False)

In [None]:
train_df.drop(columns=['ID', 'Location', 'Age', 'City', 'State'])

Unnamed: 0,User-ID,Book-ID,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age-Band,Year-Of-Publication_Band,Nation
0,USER_00000,BOOK_044368,8,Road Taken,Rona Jaffe,2001,Mira,20's,>=2000s,canada
1,USER_00000,BOOK_081205,8,Macbeth (New Penguin Shakespeare),William Shakespeare,1981,Penguin Books,20's,1980s,canada
2,USER_00000,BOOK_086781,0,Waverley (Penguin English Library),Walter Scott,1981,Penguin Books,20's,1980s,canada
3,USER_00000,BOOK_098622,0,Mother Earth Father Sky,Sue Harrison,1991,Avon,20's,1990s,canada
4,USER_00000,BOOK_180810,8,She Who Remembers,Linda Lay Shuler,1989,Signet Book,20's,1980s,canada
...,...,...,...,...,...,...,...,...,...,...
871388,USER_92096,BOOK_081138,0,Healing Words: The Power of Prayer and the Pra...,Larry Dossey,1993,Harpercollins,30's,1990s,usa
871389,USER_92097,BOOK_258124,0,The Salmon of Doubt: Hitchhiking the Galaxy On...,DOUGLAS ADAMS,2002,Harmony,30's,>=2000s,usa
871390,USER_92098,BOOK_071848,0,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,2000,Thorndike Press,40's,>=2000s,canada
871391,USER_92099,BOOK_252599,8,Heartbreak Hill: Anatomy of a Ryder Cup,Tim Rosaforte,1996,St Martins Pr,40's,1990s,usa


In [None]:
train_df.columns

Index(['ID', 'User-ID', 'Book-ID', 'Book-Rating', 'Age', 'Location',
       'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Age-Band', 'Year-Of-Publication_Band', 'City', 'State', 'Nation'],
      dtype='object')

In [None]:
train = train_df.drop(columns=['ID', 'Location', 'Age', 'City', 'State', 'Year-Of-Publication', 'User-ID', 'Book-ID'])

In [None]:
train.columns

Index(['Book-Rating', 'Book-Title', 'Book-Author', 'Publisher', 'Age-Band',
       'Year-Of-Publication_Band', 'Nation'],
      dtype='object')

In [None]:
y_train = train.pop('Book-Rating')

In [None]:
numeric_features = list(train.select_dtypes(int).columns)
categorical_features = list(train.select_dtypes('O').columns)
ignore_features = []
print(numeric_features, categorical_features)

[] ['Book-Title', 'Book-Author', 'Publisher', 'Age-Band', 'Year-Of-Publication_Band', 'Nation']


In [None]:
reg = setup(data=pd.concat([train, y_train],axis=1), target ='Book-Rating', session_id=42,
           numeric_features=numeric_features,categorical_features=categorical_features,ignore_features=ignore_features)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Book-Rating
2,Target type,Regression
3,Original data shape,"(871393, 7)"
4,Transformed data shape,"(871393, 19)"
5,Transformed train set shape,"(609975, 19)"
6,Transformed test set shape,"(261418, 19)"
7,Categorical features,6
8,Preprocess,True
9,Imputation type,simple


In [None]:
top3 = compare_models(exclude = ['ransac'], n_select = 3, fold=5, sort='RMSE')

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lr,Linear Regression,3.4387,14.1984,3.7681,0.0397,1.151,0.5947,19.224
ridge,Ridge Regression,3.4387,14.1984,3.7681,0.0397,1.151,0.5947,5.534
lar,Least Angle Regression,3.4387,14.1984,3.7681,0.0397,1.151,0.5947,5.808
br,Bayesian Ridge,3.4388,14.1984,3.7681,0.0397,1.151,0.5947,5.906
omp,Orthogonal Matching Pursuit,3.502,14.4582,3.8024,0.0221,1.1637,0.6038,5.296
en,Elastic Net,3.5425,14.6481,3.8273,0.0092,1.1719,0.6094,5.682
huber,Huber Regressor,3.2323,14.7407,3.8394,0.003,1.065,0.6876,14.636
gbr,Gradient Boosting Regressor,3.5897,14.7774,3.8441,0.0005,1.1967,0.5842,86.84
lasso,Lasso Regression,3.5592,14.7849,3.8451,-0.0,1.176,0.612,11.434
llar,Lasso Least Angle Regression,3.5592,14.7849,3.8451,-0.0,1.176,0.612,5.634


Processing:   0%|          | 0/79 [00:00<?, ?it/s]

In [None]:
tuned_top3 = [tune_model(i) for i in top3]
# blender = blend_models(tuned_top3)
# stacker = stack_models(tuned_top3)
best_auc_model = automl(optimize = 'RMSE')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.4405,14.2081,3.7694,0.0406,1.1497,0.5966
1,3.422,14.0645,3.7503,0.0422,1.1489,0.592
2,3.4411,14.2404,3.7736,0.0414,1.1493,0.5943
3,3.4369,14.2172,3.7706,0.0393,1.1507,0.5946
4,3.44,14.2294,3.7722,0.0408,1.15,0.5937
5,3.4374,14.193,3.7674,0.0406,1.1505,0.5962
6,3.431,14.1226,3.758,0.0412,1.1508,0.5943
7,3.4342,14.1385,3.7601,0.0409,1.1525,0.5958
8,3.4395,14.2264,3.7718,0.0392,1.1501,0.5931
9,3.4376,14.2113,3.7698,0.0392,1.1515,0.5928


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 2 candidates, totalling 20 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.4405,14.2081,3.7694,0.0406,1.1497,0.5966
1,3.422,14.0645,3.7503,0.0422,1.1489,0.592
2,3.4411,14.2404,3.7736,0.0414,1.1493,0.5943
3,3.4369,14.2172,3.7706,0.0393,1.1507,0.5946
4,3.44,14.2294,3.7722,0.0408,1.15,0.5937
5,3.4374,14.193,3.7674,0.0406,1.1505,0.5962
6,3.431,14.1226,3.758,0.0412,1.1508,0.5943
7,3.4342,14.1385,3.7601,0.0409,1.1525,0.5958
8,3.4395,14.2263,3.7718,0.0392,1.1501,0.5931
9,3.4376,14.2113,3.7698,0.0392,1.1515,0.5928


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.4405,14.2081,3.7694,0.0406,1.1497,0.5966
1,3.422,14.0645,3.7503,0.0422,1.1489,0.592
2,3.4411,14.2404,3.7736,0.0414,1.1493,0.5943
3,3.4369,14.2172,3.7706,0.0393,1.1507,0.5946
4,3.44,14.2294,3.7722,0.0408,1.15,0.5937
5,3.4374,14.193,3.7674,0.0406,1.1505,0.5962
6,3.431,14.1226,3.758,0.0412,1.1508,0.5943
7,3.4342,14.1385,3.7601,0.0409,1.1525,0.5958
8,3.4395,14.2264,3.7718,0.0392,1.1501,0.5931
9,3.4376,14.2113,3.7698,0.0392,1.1515,0.5928


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [None]:
for i in top3:
    save_model(i, f'top_{i}')
load_model(f'top_{i}')

In [None]:
best_model = tune_model(top3[0])
best_auc_model = automl(optimize = 'RMSE')

In [None]:
finalized_model = finalize_model(best_auc_model)

In [None]:
test_df['Age'] = test_df['Age'].astype(int)
test_df['Year-Of-Publication'] = test_df['Year-Of-Publication'].astype(int)
test_df['Age-Band'] = test_df['Age'].apply(lambda x: age_to_ageband(x))
test_df = pd.concat([test_df, test_df['Location'].apply(lambda x: location_split(x)).str.split(',', expand=True).rename(columns={0:'City', 1:'State', 2:'Nation'})], axis=1)
test_df.to_csv('add_columns_book_test.csv', index=False)
test_df.drop(columns=['ID', 'Location', 'Age', 'Publisher', 'City', 'State'])

Unnamed: 0,User-ID,Book-ID,Book-Title,Book-Author,Year-Of-Publication,Age-Band,Nation
0,USER_00008,BOOK_047966,Birds of Prey: A Novel of Suspense,J.A. Jance,2002,30's,usa
1,USER_00008,BOOK_119494,Midnight Voices,JOHN SAUL,2003,30's,usa
2,USER_00008,BOOK_151775,Breaking Free : A Prescription for Personal an...,David M. Noer,1996,30's,usa
3,USER_00008,BOOK_176255,Bitter Harvest,Ann Rule,1999,30's,usa
4,USER_00008,BOOK_187307,Embraced by the Light,Betty J. Eadie,1994,30's,usa
...,...,...,...,...,...,...,...
159616,USER_92086,BOOK_159050,The Fat Flush Plan,Ann Louise Gittleman,2001,under10,usa
159617,USER_92086,BOOK_196481,We Die Alone: A WWII Epic of Escape and Endurance,David Howarth,1999,under10,usa
159618,USER_92086,BOOK_199754,From 60 Yards In : How to Master Golf's Short ...,Raymond Floyd,1992,under10,usa
159619,USER_92086,BOOK_227481,Solo: On Her Own Adventure,Susan Fox Rogers,1996,under10,usa


In [None]:
predictions = predict_model(data=test_df, estimator=finalized_model)
predictions

Unnamed: 0,ID,User-ID,Book-ID,Age,Location,Book-Title,Book-Author,Year-Of-Publication,Publisher,Age-Band,City,State,Nation,prediction_label
0,TEST_000000,USER_00008,BOOK_047966,37,"vermilion, ohio, usa",Birds of Prey: A Novel of Suspense,J.A. Jance,2002,Avon,30's,vermilion,ohio,usa,6.392922
1,TEST_000001,USER_00008,BOOK_119494,37,"vermilion, ohio, usa",Midnight Voices,JOHN SAUL,2003,Ballantine Books,30's,vermilion,ohio,usa,6.739274
2,TEST_000002,USER_00008,BOOK_151775,37,"vermilion, ohio, usa",Breaking Free : A Prescription for Personal an...,David M. Noer,1996,Jossey-Bass,30's,vermilion,ohio,usa,6.490319
3,TEST_000003,USER_00008,BOOK_176255,37,"vermilion, ohio, usa",Bitter Harvest,Ann Rule,1999,Pocket,30's,vermilion,ohio,usa,6.503035
4,TEST_000004,USER_00008,BOOK_187307,37,"vermilion, ohio, usa",Embraced by the Light,Betty J. Eadie,1994,Bantam Books,30's,vermilion,ohio,usa,6.315946
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159616,TEST_159616,USER_92086,BOOK_159050,0,"mountain view, california, usa",The Fat Flush Plan,Ann Louise Gittleman,2001,McGraw-Hill/Contemporary Books,under10,mountain view,california,usa,3.577054
159617,TEST_159617,USER_92086,BOOK_196481,0,"mountain view, california, usa",We Die Alone: A WWII Epic of Escape and Endurance,David Howarth,1999,The Lyons Press,under10,mountain view,california,usa,3.482705
159618,TEST_159618,USER_92086,BOOK_199754,0,"mountain view, california, usa",From 60 Yards In : How to Master Golf's Short ...,Raymond Floyd,1992,Perennial,under10,mountain view,california,usa,3.394149
159619,TEST_159619,USER_92086,BOOK_227481,0,"mountain view, california, usa",Solo: On Her Own Adventure,Susan Fox Rogers,1996,Seal Press (WA),under10,mountain view,california,usa,3.210918


In [None]:
predictions['prediction_label'].apply(lambda x: round(x, 0))

0         6.0
1         7.0
2         6.0
3         7.0
4         6.0
         ... 
159616    4.0
159617    3.0
159618    3.0
159619    3.0
159620    3.0
Name: prediction_label, Length: 159621, dtype: float64

In [None]:
submission['Book-Rating'] = predictions['prediction_label']
submission.to_csv('First_submission_pycaret_0509_float.csv', index=False)

In [None]:
submission = pd.read_csv('./sample_submission.csv')
submission['Book-Rating'] = predictions['prediction_label'].apply(lambda x: round(x, 0))
submission.to_csv('First_submission_pycaret_0509.csv', index=False)
submission


Unnamed: 0,ID,Book-Rating
0,TEST_000000,6.0
1,TEST_000001,7.0
2,TEST_000002,6.0
3,TEST_000003,7.0
4,TEST_000004,6.0
...,...,...
159616,TEST_159616,4.0
159617,TEST_159617,3.0
159618,TEST_159618,3.0
159619,TEST_159619,3.0


In [None]:
ensemble_top3 = ensemble_model(best_model, method="Boosting", fold=5, optimize='RMSE')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3.0904,11.9483,3.4566,0.1898,1.1006,0.4925
1,3.0961,11.9879,3.4624,0.1915,1.1011,0.491
2,3.0921,11.9524,3.4572,0.1932,1.1012,0.4902
3,3.0905,11.9091,3.451,0.1918,1.103,0.4908
4,3.0986,11.9779,3.4609,0.1907,1.103,0.4885
Mean,3.0935,11.9551,3.4576,0.1914,1.1018,0.4906
Std,0.0033,0.0274,0.004,0.0011,0.001,0.0013


Processing:   0%|          | 0/6 [00:00<?, ?it/s]

In [None]:
finalized_model = finalize_model(ensemble_top3)

In [None]:
predictions = predict_model(data=test_df, estimator=finalized_model)
submission['Book-Rating'] = predictions['prediction_label']
submission.to_csv('First_submission_pycaret_0509_float_boosting.csv', index=False)