In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [51]:
df_books = pd.read_csv('books.csv', encoding='latin1', sep=';')
df_users = pd.read_csv('users.csv', encoding='latin1', sep=';')
df_ratings = pd.read_csv('ratings.csv', encoding='latin1', sep=';')

### Strategy

-Embedding space for books and users. That means creating an N dimensional "space" R^N to represent similarities between books, then the users will move through that "space" for recommendations. 
the two features for constructing this "space" will be: title and image. 
For image, implementation with Image Net model to extract the meaningful features, from the last layers.
For title, implementation with word2vec embedding.

-For initializing the vector of the user: a function that takes age, location -> [vector]

-For updating the user_embedding: a function depending on (book, rating_by_the_user). 

-For showing the recommendations: cosine_similarity or any other similarity metric.

## Cleaning all data

In [52]:
df_users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


##### df books

In [53]:
df_books_cleaned = df_books[['Book-Title', 'Image-URL-S']]

In [54]:
df_books_cleaned.head()

Unnamed: 0,Book-Title,Image-URL-S
0,Classical Mythology,http://images.amazon.com/images/P/0195153448.0...
1,Clara Callan,http://images.amazon.com/images/P/0002005018.0...
2,Decision in Normandy,http://images.amazon.com/images/P/0060973129.0...
3,Flu: The Story of the Great Influenza Pandemic...,http://images.amazon.com/images/P/0374157065.0...
4,The Mummies of Urumchi,http://images.amazon.com/images/P/0393045218.0...


In [55]:
# clean rows that contained nan
df_books_cleaned = df_books_cleaned.dropna(how='any')

In [56]:
# drop duplicates
df_books_cleaned = df_books_cleaned.drop_duplicates(subset=None, keep='first')

In [99]:
# cleaned all rows that contained nan
df_users_cleaned = df_users.dropna(how='any')

In [100]:
# check if location row has 3 values
df_users_cleaned['3_locations'] = df_users_cleaned['Location'].apply(lambda x: len(x.split(',')) == 3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users_cleaned['3_locations'] = df_users_cleaned['Location'].apply(lambda x: len(x.split(',')) == 3)


In [101]:
# removing the locations that didn't have 3 locations
df_users_cleaned = df_users_cleaned[df_users_cleaned['3_locations']]

In [102]:
df_users_cleaned


Unnamed: 0,User-ID,Location,Age,3_locations
1,2,"stockton, california, usa",18.0,True
3,4,"porto, v.n.gaia, portugal",17.0,True
5,6,"santa monica, california, usa",61.0,True
9,10,"albacete, wisconsin, spain",26.0,True
10,11,"melbourne, victoria, australia",14.0,True
...,...,...,...,...
278848,278849,"georgetown, ontario, canada",23.0,True
278850,278851,"dallas, texas, usa",33.0,True
278851,278852,"brisbane, queensland, australia",32.0,True
278852,278853,"stranraer, n/a, united kingdom",17.0,True


In [103]:
# isolate country in one column from Location
column = 2 #country
df_users_cleaned['country'] = df_users_cleaned['Location'].apply(lambda x: str(x.split(',')[column]).replace(' ',''))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users_cleaned['country'] = df_users_cleaned['Location'].apply(lambda x: str(x.split(',')[column]).replace(' ',''))


In [104]:
# longest regex code in history for cleaning country of unwanted characters
df_users_cleaned = df_users_cleaned[~ (df_users_cleaned['country'].str.contains(r'[^\w\s]')) & ~ (df_users_cleaned['country'].str.contains(r'[0-9_]')) & ~(df_users_cleaned['country'] == "")]

In [63]:
# "clean" version of country
u,c = np.unique(df_users_cleaned['country'], return_counts=True) 

[(u[i],c[i]) for i in range(len(u))]

[('afghanistan', 22),
 ('alachua', 1),
 ('albania', 22),
 ('alderney', 6),
 ('algeria', 26),
 ('algérie', 1),
 ('allen', 1),
 ('amalurra', 1),
 ('america', 1),
 ('andorra', 38),
 ('angola', 4),
 ('antarctica', 4),
 ('antiguaandbarbuda', 8),
 ('aotearoa', 1),
 ('argentina', 979),
 ('armenia', 6),
 ('aruba', 1),
 ('asturies', 1),
 ('austbritania', 1),
 ('australia', 8844),
 ('austria', 731),
 ('azerbaijan', 4),
 ('bahamas', 16),
 ('bahrain', 13),
 ('bangladesh', 33),
 ('barbados', 16),
 ('basquecountry', 4),
 ('belarus', 11),
 ('belgique', 3),
 ('belgium', 664),
 ('belize', 19),
 ('benin', 8),
 ('berguedà', 1),
 ('bermuda', 11),
 ('bhutan', 3),
 ('bolivia', 29),
 ('bosniaandherzegovina', 41),
 ('botswana', 7),
 ('brasil', 7),
 ('brazil', 1517),
 ('brunei', 17),
 ('bulgaria', 91),
 ('burkinafaso', 3),
 ('burlington', 1),
 ('burma', 17),
 ('cambodia', 5),
 ('cameroon', 7),
 ('canada', 11339),
 ('capeverde', 1),
 ('caribbeansea', 1),
 ('catalonia', 9),
 ('catalunya', 11),
 ('catalunyaspain'

In [105]:
# replacing countries that do not have correct spacing
 

d = { "bosniaandherzegovina" :"bosnia and herzegovina", "capeverde" : "cape verde", "saintvincentandthegrenadines" : 
    "saint vincent and the grenadines", "unitedarabemirates" : "united arab emirates", "united states" : "united states", 
    "trinidadandtobago" : "trinidad and tobago", "saintkittsandnevis" : "saint kitts and nevis", "newzealand" : "new zealand",
    "unitedkingdom" : "united kingdom" } 

df_users_cleaned['country'] = df_users_cleaned['country'].replace(d)
df_users_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_users_cleaned['country'] = df_users_cleaned['country'].replace(d)


Unnamed: 0,User-ID,Location,Age,3_locations,country
1,2,"stockton, california, usa",18.0,True,usa
3,4,"porto, v.n.gaia, portugal",17.0,True,portugal
5,6,"santa monica, california, usa",61.0,True,usa
9,10,"albacete, wisconsin, spain",26.0,True,spain
10,11,"melbourne, victoria, australia",14.0,True,australia
...,...,...,...,...,...
278848,278849,"georgetown, ontario, canada",23.0,True,canada
278850,278851,"dallas, texas, usa",33.0,True,usa
278851,278852,"brisbane, queensland, australia",32.0,True,australia
278852,278853,"stranraer, n/a, united kingdom",17.0,True,united kingdom


In [106]:
# clean rows with crazy age
df_users_cleaned = df_users_cleaned[df_users_cleaned['Age']<=100]
df_users_cleaned 


Unnamed: 0,User-ID,Location,Age,3_locations,country
1,2,"stockton, california, usa",18.0,True,usa
3,4,"porto, v.n.gaia, portugal",17.0,True,portugal
5,6,"santa monica, california, usa",61.0,True,usa
9,10,"albacete, wisconsin, spain",26.0,True,spain
10,11,"melbourne, victoria, australia",14.0,True,australia
...,...,...,...,...,...
278848,278849,"georgetown, ontario, canada",23.0,True,canada
278850,278851,"dallas, texas, usa",33.0,True,usa
278851,278852,"brisbane, queensland, australia",32.0,True,australia
278852,278853,"stranraer, n/a, united kingdom",17.0,True,united kingdom


In [66]:
# eliminate nans first
df_ratings_cleaned = df_ratings.dropna(how='any')

In [67]:
df_ratings_cleaned.groupby(['Book-Rating']).count()

Unnamed: 0_level_0,User-ID,ISBN
Book-Rating,Unnamed: 1_level_1,Unnamed: 2_level_1
0,716109,716109
1,1770,1770
2,2759,2759
3,5996,5996
4,8904,8904
5,50974,50974
6,36924,36924
7,76457,76457
8,103736,103736
9,67541,67541


In [68]:
# undersampling the dataset by reducing 0-rating

# Put all the 0's in a separate dataset
df_without_zero = df_ratings_cleaned.loc[df_ratings_cleaned['Book-Rating'] != 0]

#Randomly select  observations from the non-fraud (majority class)
df_with_zero = df_ratings_cleaned.loc[df_ratings_cleaned['Book-Rating'] == 0].sample(n=45000,random_state=42)

# Concatenate both dataframes again
df_undersampled_ratings = pd.concat([df_without_zero, df_with_zero])
df_undersampled_ratings

Unnamed: 0,User-ID,ISBN,Book-Rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276736,3257224281,8
7,276737,0600570967,6
...,...,...,...
511086,124048,0425195473,0
964683,232964,0836220870,0
617446,149306,1930252501,0
214844,50205,0515114863,0


In [139]:
# creating vector for age and country depending on rating of a book
#df_users_cleaned = df_users_cleaned.groupby(['Age','country'])

#df_users_cleaned.head()
df_ratings


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


# Downloading all images

In [70]:
import urllib.request

path_to_save = '/home/niki/kings/SEG/Book/book-recommendation/data'

In [71]:
x = df_books_cleaned.loc[:200]

In [72]:
# create a dict that maps int to a title
d = {}

indexs = x.index.values
for i in tqdm(range(len(x))):
    row = x.loc[indexs[i]]
    
    # map title on the dict
    d[i] = row['Book-Title']
    
    # retrieve image and saving the image as the i name for mapping
    urllib.request.urlretrieve(row['Image-URL-S'], "{}{}.jpg".format(path_to_save, i))

100%|██████████| 201/201 [00:27<00:00,  7.33it/s]


In [73]:
# save the dict
import pickle

a_file = open("/home/niki/kings/SEG/Book/book-recommendation/data.pkl", "wb")

pickle.dump(d, a_file)

In [74]:
# load the dict

file_to_read = open("/home/niki/kings/SEG/Book/book-recommendation/data.pkl", "rb")
d = pickle.load(file_to_read)

In [75]:
inv_map = {v: k for k, v in d.items()}

In [76]:
# get the image_path
x['int_map'] = x['Book-Title'].map(inv_map)
x['image_path'] = "{}".format(path_to_save) + x['int_map'].astype(str) + ".jpg"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['int_map'] = x['Book-Title'].map(inv_map)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['image_path'] = "{}".format(path_to_save) + x['int_map'].astype(str) + ".jpg"


In [77]:
x = x[['Book-Title','image_path']]
x.head()

Unnamed: 0,Book-Title,image_path
0,Classical Mythology,/home/niki/kings/SEG/Book/book-recommendation/...
1,Clara Callan,/home/niki/kings/SEG/Book/book-recommendation/...
2,Decision in Normandy,/home/niki/kings/SEG/Book/book-recommendation/...
3,Flu: The Story of the Great Influenza Pandemic...,/home/niki/kings/SEG/Book/book-recommendation/...
4,The Mummies of Urumchi,/home/niki/kings/SEG/Book/book-recommendation/...


# Obtaining images as tensors

In [78]:
from PIL import Image
from numpy import asarray

In [79]:
shapes = []
image_info = []

indexs = x.index.values
for i in tqdm(range(len(x))):
    image = Image.open(x.loc[indexs[i]]['image_path'])
    data = asarray(image)
    
    shapes.append(data.shape)
    image_info.append(data)
    
x['img_shape'] = shapes
x['image_info'] = image_info

100%|██████████| 201/201 [00:00<00:00, 2632.62it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['img_shape'] = shapes


In [80]:
# keep the ones that have 3 dimensions
x = x[x['img_shape'].apply(lambda y: len(y) == 3)]

In [81]:
x.head()

Unnamed: 0,Book-Title,image_path,img_shape,image_info
0,Classical Mythology,/home/niki/kings/SEG/Book/book-recommendation/...,"(75, 59, 3)","[[[148, 132, 109], [147, 131, 108], [147, 131,..."
1,Clara Callan,/home/niki/kings/SEG/Book/book-recommendation/...,"(75, 49, 3)","[[[179, 122, 113], [182, 124, 110], [182, 117,..."
3,Flu: The Story of the Great Influenza Pandemic...,/home/niki/kings/SEG/Book/book-recommendation/...,"(75, 51, 3)","[[[235, 223, 173], [247, 240, 186], [251, 252,..."
4,The Mummies of Urumchi,/home/niki/kings/SEG/Book/book-recommendation/...,"(75, 49, 3)","[[[246, 251, 247], [252, 255, 253], [251, 255,..."
6,What If?: The World's Foremost Military Histor...,/home/niki/kings/SEG/Book/book-recommendation/...,"(75, 50, 3)","[[[0, 1, 0], [0, 1, 0], [0, 1, 0], [0, 1, 0], ..."


# Transform the book title and image_info into embedding form

In [82]:
import tensorflow as tf

In [83]:
# resize image to (224,224,3)
x['image_info_resized'] = x['image_info'].apply(lambda y: tf.image.resize(y, (224,224)))

In [84]:
# preprocees image_info
tf.keras.applications.mobilenet.preprocess_input(x['image_info_resized'])

0      (((tf.Tensor(0.16078436, shape=(), dtype=float...
1      (((tf.Tensor(0.4039216, shape=(), dtype=float3...
3      (((tf.Tensor(0.84313726, shape=(), dtype=float...
4      (((tf.Tensor(0.92941177, shape=(), dtype=float...
6      (((tf.Tensor(-1.0, shape=(), dtype=float32), t...
                             ...                        
196    (((tf.Tensor(-0.372549, shape=(), dtype=float3...
197    (((tf.Tensor(-0.8352941, shape=(), dtype=float...
198    (((tf.Tensor(-0.9529412, shape=(), dtype=float...
199    (((tf.Tensor(0.81960785, shape=(), dtype=float...
200    (((tf.Tensor(0.45098042, shape=(), dtype=float...
Name: image_info_resized, Length: 171, dtype: object

In [85]:
model = tf.keras.applications.mobilenet_v2.MobileNetV2(
    input_shape=None, alpha=1.0, include_top=True, weights='imagenet',
)

In [86]:
x['img_embedding'] = x['image_info_resized'].apply(lambda y: model(tf.expand_dims(y, axis=0)))

In [87]:
x = x[['Book-Title','image_path','img_embedding']]

In [88]:
# x.to_csv('data/img_embedding.csv', index=False)
x.to_csv('/home/niki/kings/SEG/Book/book-recommendation/data/img_embedding.csv', index=False)


# Importing NLP data

In [89]:
# to do: apply word2vec model to titles.

# Try model only on Image

In [90]:
from sklearn.metrics.pairwise import cosine_similarity

In [122]:
image_n = 177
print(x.head())
# row = x[x['image_path'] == './data/book_images/{}.jpg'.format(image_n)].index.values[0]
print(x[x['image_path'] == '/home/niki/kings/SEG/Book/book-recommendation/data{}.jpg'.format(image_n)].index.values
)
row = x[x['image_path'] == '/home/niki/kings/SEG/Book/book-recommendation/data{}.jpg'.format(image_n)].index.values[0]

indexs = x.index.values

scores_mse = []
scores_cs = []
for i in tqdm(range(len(indexs))):
    score_mse = tf.keras.metrics.mean_squared_error(x.loc[row]['img_embedding'], x.loc[indexs[i]]['img_embedding']).numpy()
    scores_mse.append(score_mse)
    
    #score_cs = cosine_similarity(x.loc[row]['img_embedding'], x.loc[indexs[i]]['img_embedding'])
    #scores_cs.append(score_cs[0])

                                          Book-Title  \
0                                Classical Mythology   
1                                       Clara Callan   
3  Flu: The Story of the Great Influenza Pandemic...   
4                             The Mummies of Urumchi   
6  What If?: The World's Foremost Military Histor...   

                                          image_path  \
0  /home/niki/kings/SEG/Book/book-recommendation/...   
1  /home/niki/kings/SEG/Book/book-recommendation/...   
3  /home/niki/kings/SEG/Book/book-recommendation/...   
4  /home/niki/kings/SEG/Book/book-recommendation/...   
6  /home/niki/kings/SEG/Book/book-recommendation/...   

                                       img_embedding  
0  ((tf.Tensor(0.00054164225, shape=(), dtype=flo...  
1  ((tf.Tensor(6.882686e-05, shape=(), dtype=floa...  
3  ((tf.Tensor(0.0003654444, shape=(), dtype=floa...  
4  ((tf.Tensor(0.00019347905, shape=(), dtype=flo...  
6  ((tf.Tensor(4.755374e-05, shape=(), dtype=floa..

100%|██████████| 171/171 [00:00<00:00, 2985.58it/s]


In [121]:
print(x.head()['image_path'].loc[0])

/home/niki/kings/SEG/Book/book-recommendation/data0.jpg


In [None]:
x['mse_score_{}'.format(image_n)] = scores_mse
x['cs_score_{}'.format(image_n)] = scores_cs

In [None]:
x.sort_values('mse_score_{}'.format(image_n)).head(10)

Unnamed: 0,Book-Title,image_path,img_embedding,mse_score_11,mse_score_177,cs_score_177
177,Der illustrierte Mann. ErzÃ?Â¤hlungen.,./data/book_images/177.jpg,"((tf.Tensor(8.626678e-05, shape=(), dtype=floa...",[4.952654e-05],[0.0],[1.0000001]
184,Die Mechanismen der Freude. ErzÃ?Â¤hlungen.,./data/book_images/184.jpg,"((tf.Tensor(0.00013835599, shape=(), dtype=flo...",[3.4072535e-05],[1.2562682e-05],[0.8050602]
29,OUT OF THE SILENT PLANET,./data/book_images/29.jpg,"((tf.Tensor(0.0022507187, shape=(), dtype=floa...",[4.044192e-05],[1.8114302e-05],[0.6842945]
183,Das Kind von morgen. ErzÃ?Â¤hlungen.,./data/book_images/183.jpg,"((tf.Tensor(0.00016410062, shape=(), dtype=flo...",[5.4939046e-05],[1.9970837e-05],[0.6851735]
178,Der KÃ?Â¶nig in Gelb.,./data/book_images/178.jpg,"((tf.Tensor(0.000269483, shape=(), dtype=float...",[4.341485e-05],[2.0836836e-05],[0.69961214]
30,Prague : A Novel,./data/book_images/30.jpg,"((tf.Tensor(0.0012666977, shape=(), dtype=floa...",[5.7706715e-05],[2.5908312e-05],[0.5674356]
196,Neun ErzÃ?Â¤hlungen.,./data/book_images/196.jpg,"((tf.Tensor(0.0003372069, shape=(), dtype=floa...",[3.9473543e-05],[2.8378201e-05],[0.44814217]
185,Familientreffen. ErzÃ?Â¤hlungen.,./data/book_images/185.jpg,"((tf.Tensor(7.7248056e-05, shape=(), dtype=flo...",[4.9842038e-05],[2.9418885e-05],[0.77509767]
179,Fahrenheit 451,./data/book_images/179.jpg,"((tf.Tensor(6.564988e-05, shape=(), dtype=floa...",[4.8493523e-05],[3.340422e-05],[0.6715101]
35,Tage der Unschuld.,./data/book_images/35.jpg,"((tf.Tensor(0.0006927976, shape=(), dtype=floa...",[4.6555182e-05],[3.4110017e-05],[0.26235813]


# Do the initialization of the vector depending on location age

In [137]:
#df_users_cleaned.groupby('Age')
# df_users_cleaned.head()df_users_cleaned.head()


TypeError: Index(...) must be called with a collection of some kind, 0 was passed

# Method to update the recommendations