In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

np.random.seed(42)

# **SPLIT-BY-TIME**

## Load book reviews and split in 80 : 10 ratio

In [3]:
book_reviews_df = pd.read_csv('/content/drive/MyDrive/Priyanka/final_book_reviews.csv')
print(book_reviews_df.head())
print(book_reviews_df.shape)

         item            user  rating   timestamp
0  1101986964  A12NFV3VBMDV4A     5.0  1472169600
1  1519010931  A2E2XE0JX1ZGO0     5.0  1472169600
2  1519010931  A1YJ6VDWVUHPIL     5.0  1472169600
3  0399177663  A343Y76WJTBYP6     2.0  1472169600
4  1503935477  A12NFV3VBMDV4A     4.0  1472169600
(1076310, 4)


In [4]:
book_reviews_df.sort_values('timestamp', inplace=True)
book_reviews_df.reset_index(drop=True, inplace=True)

train_size = int(len(book_reviews_df) * 0.9)

In [5]:
train_book_reviews = book_reviews_df.iloc[:train_size]
test_book_reviews = book_reviews_df.iloc[train_size:]

print("#### Train #######")
print(train_book_reviews.head())
print(train_book_reviews.shape)

print("#### Test #######")
print(test_book_reviews.head())
print(test_book_reviews.shape)

#### Train #######
         item            user  rating   timestamp
0  1101986964  A12NFV3VBMDV4A     5.0  1472169600
1  1537245856  A13KSU7ZWRL04J     5.0  1472169600
2  194642000X  A32SJS0TTSRIM5     5.0  1472169600
3  194642000X  A3KLPA5D8SX7VR     5.0  1472169600
4  194642000X  A39L46WM6GBO4Q     4.0  1472169600
(968679, 4)
#### Test #######
              item            user  rating   timestamp
968679  B017V4IPPO  A3FI4OAOQYQWHH     5.0  1521763200
968680  1542909953  A2EVKNQ71UULXI     4.0  1521763200
968681  1986069915  A1Z4A95K8GQEOQ     4.0  1521763200
968682  0062741772  A3JG9X352CC82P     5.0  1521763200
968683  1912106019  A2LFM0YDD10WG5     4.0  1521763200
(107631, 4)


In [6]:
print("##### Train ######")
print(train_book_reviews['user'].value_counts())

print("##### Test ######")
print(test_book_reviews['user'].value_counts())

##### Train ######
A1HHSUAQDZ1XM6    1117
A3A44JIMIWYFIE    1105
A1JLU5H1CCENWX    1083
A1EPVSC849EI9R    1018
A2R2O6R00PXFC2     912
                  ... 
A3OZINPS8IXRWO       1
A32L79KVJABBO5       1
A2YMB0SM5HAL5T       1
AHVXGQVBH8Y28        1
A1V7IT3S9YM7BY       1
Name: user, Length: 19376, dtype: int64
##### Test ######
A1EPVSC849EI9R    158
A2MPS6SED8DVCW    146
A39M153PSJWSVR    135
A2HX0B5ELOPP5Z    128
A2UCFO5EBDSH04    126
                 ... 
A3QA7NAEQPMIEU      1
A2L4QN7LQDQDG1      1
A26K1OD1IJYCM6      1
A3SM4N0QPWDK0Q      1
A3VY8Y9CQPF5T8      1
Name: user, Length: 15582, dtype: int64


In [7]:
print("##### Train ######")
print(train_book_reviews['item'].value_counts())

print("##### Test ######")
print(test_book_reviews['item'].value_counts())

##### Train ######
B001MQA3DU    1696
0996135669     931
0996135693     789
B001MVNGCU     756
1546904468     732
              ... 
1516108019       1
1946836168       1
0525573291       1
0735218455       1
1538483963       1
Name: item, Length: 17110, dtype: int64
##### Test ######
1986343456    479
1250183979    432
1732624704    399
1986917967    388
1717110983    349
             ... 
1539844838      1
151518045X      1
191069200X      1
1503953076      1
1978240252      1
Name: item, Length: 11564, dtype: int64


In [8]:
train_users = set(train_book_reviews['user'].unique())
train_items = set(train_book_reviews['item'].unique())

test_users = set(test_book_reviews['user'].unique())
test_items = set(test_book_reviews['item'].unique())

unique_users_in_test_not_in_train = test_users - train_users
unique_items_in_test_not_in_train = test_items - train_items

print(f"Number of new users in test set: {len(unique_users_in_test_not_in_train)}")
print(f"Number of new items in test set: {len(unique_items_in_test_not_in_train)}")

Number of new users in test set: 30
Number of new items in test set: 921


## Export Train and Test dataset

In [9]:
train_book_reviews.to_csv('/content/drive/MyDrive/Priyanka/train_book_reviews_by_time.csv', index=False)
test_book_reviews.to_csv('/content/drive/MyDrive/Priyanka/test_book_reviews_by_time.csv', index=False)

# **LEAVE_ONE_OUT SETTING**

# Load user data and leave one interaction per user in test setting

In [10]:
book_reviews_new_df = pd.read_csv('/content/drive/MyDrive/Priyanka/final_book_reviews.csv')
print(book_reviews_new_df.head())
print(book_reviews_new_df.shape)

         item            user  rating   timestamp
0  1101986964  A12NFV3VBMDV4A     5.0  1472169600
1  1519010931  A2E2XE0JX1ZGO0     5.0  1472169600
2  1519010931  A1YJ6VDWVUHPIL     5.0  1472169600
3  0399177663  A343Y76WJTBYP6     2.0  1472169600
4  1503935477  A12NFV3VBMDV4A     4.0  1472169600
(1076310, 4)


In [11]:
book_reviews_new_df.sort_values(by='timestamp', ascending=True, inplace=True)
test_book_reviews_new = book_reviews_new_df.groupby('user').tail(1)

In [12]:
test_book_reviews_new

Unnamed: 0,item,user,rating,timestamp
38609,1494567296,A2NMKI0ZPIE3VU,5.0,1474848000
41888,1941480020,A32382F71YPPQ7,5.0,1475020800
42006,B001MQA3DU,A161OJ95SYSN7A,5.0,1475020800
44637,1523744812,A13VRP9X8VWM06,3.0,1475193600
48570,0399176772,A2J057MQRHLWS4,2.0,1475452800
...,...,...,...,...
1076305,B01FKT9TW0,A3O0898T7NA7GB,5.0,1537747200
1076306,B01ENU0V5Q,AMI929LFF76I6,5.0,1537747200
1076307,B01GLVJ0GQ,A20JZM5K4E54LO,5.0,1537920000
1076308,B01HI9W5HQ,A1MCN1E5GNFNXJ,1.0,1538006400


In [13]:
train_book_reviews_new = book_reviews_new_df.drop(test_book_reviews_new.index)

In [14]:
train_book_reviews_new

Unnamed: 0,item,user,rating,timestamp
0,1101986964,A12NFV3VBMDV4A,5.0,1472169600
626,1537245856,A13KSU7ZWRL04J,5.0,1472169600
627,194642000X,A32SJS0TTSRIM5,5.0,1472169600
628,194642000X,A3KLPA5D8SX7VR,5.0,1472169600
629,194642000X,A39L46WM6GBO4Q,4.0,1472169600
...,...,...,...,...
1076033,0440000785,A2P3LKAINU8JUE,4.0,1536796800
1076093,1720013306,A28GRFK2F3TMWE,4.0,1536883200
1076133,0525483705,A2QHX2UQPXXEN6,3.0,1536969600
1076156,1732624704,A1MXW3BGAZW44D,5.0,1537056000


In [15]:
print("#### Train #######")
print(train_book_reviews_new.head())
print(train_book_reviews_new.shape)

print("#### Test #######")
print(test_book_reviews_new.head())
print(test_book_reviews_new.shape)

#### Train #######
           item            user  rating   timestamp
0    1101986964  A12NFV3VBMDV4A     5.0  1472169600
626  1537245856  A13KSU7ZWRL04J     5.0  1472169600
627  194642000X  A32SJS0TTSRIM5     5.0  1472169600
628  194642000X  A3KLPA5D8SX7VR     5.0  1472169600
629  194642000X  A39L46WM6GBO4Q     4.0  1472169600
(1056904, 4)
#### Test #######
             item            user  rating   timestamp
38609  1494567296  A2NMKI0ZPIE3VU     5.0  1474848000
41888  1941480020  A32382F71YPPQ7     5.0  1475020800
42006  B001MQA3DU  A161OJ95SYSN7A     5.0  1475020800
44637  1523744812  A13VRP9X8VWM06     3.0  1475193600
48570  0399176772  A2J057MQRHLWS4     2.0  1475452800
(19406, 4)


In [16]:
print("##### Train ######")
print(train_book_reviews_new['user'].value_counts())

print("##### Test ######")
print(test_book_reviews_new['user'].value_counts())

##### Train ######
A1JLU5H1CCENWX    1205
A1EPVSC849EI9R    1175
A3A44JIMIWYFIE    1147
A1HHSUAQDZ1XM6    1144
A2R2O6R00PXFC2    1024
                  ... 
A2B9X00PU8VUPF      19
A162OMY0QKVHZT      19
A386NMAGHTUJMC      19
A1HZOAADWDJG2P      19
A1FC571ZMQHWND      19
Name: user, Length: 19406, dtype: int64
##### Test ######
A2NMKI0ZPIE3VU    1
AQ9LIRARAES3N     1
AVUVEQDN5333G     1
A2UXN4B0OL6FFC    1
A36LZJODQYVM1Z    1
                 ..
A3NICGO82Z9FI1    1
AD8STHWAAFZSG     1
A3TU4WKISCSD1P    1
A2BPOVNFWWJ8VC    1
A2ILQKH2NHVQEL    1
Name: user, Length: 19406, dtype: int64


In [17]:
print("##### Train ######")
print(train_book_reviews_new['item'].value_counts())

print("##### Test ######")
print(test_book_reviews_new['item'].value_counts())

##### Train ######
B001MQA3DU    1691
0996135669     957
0996135693     808
B001MVNGCU     752
1546904468     745
              ... 
1720490848       4
1987650840       4
1720918333       3
1503903621       2
1720013306       1
Name: item, Length: 18028, dtype: int64
##### Test ######
1732624704    388
1717795676    210
1947089021     87
1942835280     85
1250165938     85
             ... 
1492863602      1
1945042095      1
1517122562      1
0998555746      1
1501115391      1
Name: item, Length: 5910, dtype: int64


In [18]:
train_users_new = set(train_book_reviews_new['user'].unique())
train_items_new = set(train_book_reviews_new['item'].unique())

test_users_new = set(test_book_reviews_new['user'].unique())
test_items_new = set(test_book_reviews_new['item'].unique())

unique_users_in_test_not_in_train_new = test_users_new - train_users_new
unique_items_in_test_not_in_train_new = test_items_new - train_items_new

print(f"Number of new users in test set: {len(unique_users_in_test_not_in_train_new)}")
print(f"Number of new items in test set: {len(unique_items_in_test_not_in_train_new)}")

Number of new users in test set: 0
Number of new items in test set: 3


# Export train and test datas

In [19]:
train_book_reviews_new.to_csv('/content/drive/MyDrive/Priyanka/train_book_reviews_leave_one_out.csv', index=False)
test_book_reviews_new.to_csv('/content/drive/MyDrive/Priyanka/test_book_reviews_leave_one_out.csv', index=False)