In [1]:
from func import *

import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import coo_matrix

from lightfm import LightFM, cross_validation
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm.data import Dataset

from sklearn.model_selection import train_test_split

from enum import Enum, auto

import pickle

from matplotlib import pyplot as plt

  books=pd.read_csv('data/Books.csv')


<h1>Obsah:</h1>

<ul>
    
- Čtení dat, preprocessing, příprava vstupů pro model
- Trénování modelu
- Evaluace modelu
- Predikce pro náhodného uživatele z existujícího datasetu
- Zavedení nového uživatele (hodnotil knihy, které neměly hodnocení)
- Zavedení nového uživatele (hodnotil knihy, které již byly hodnocené)
- (Cold start problem) Zavedení nového uživatele - bez hodnocení, pouze na základě user features
</ul>

<h1>Funkce</h1>

In [9]:
class PredictionType(Enum):
    WORST = auto()
    BEST = auto()

In [10]:
def interactions_filtering(dataset):
    """
    Pouze první trénovací dataset. Nové interakce neočisťujeme.
    """
    
    # Definovat společné IDs s Books i Users
    common_ids_isbn = pd.merge(ratings, books, on="ISBN", how="inner")["ISBN"].unique()
    common_ids_userid = pd.merge(ratings, users, on="User-ID", how="inner")["User-ID"].unique()

    # Očistit dataset o hodnoty, které nejsou v Book a Users - nutné
    dataset = dataset[dataset["ISBN"].isin(common_ids_isbn) & dataset["User-ID"].isin(common_ids_userid)]
    
    # Pouze explicitní feedback, nikoli implicitní (viz zadání)
    dataset = dataset[dataset["Book-Rating"] > 0] # pozitivní vliv na metriky
    
    # Pouze knihy, které byly hodnocené xkrát a více
    dataset = dataset.groupby("ISBN").filter(lambda x: len(x)>=15)
    
    # Pouze uživatelé, kteří hodnotily x a více knih (cold start)
    dataset = dataset[dataset.groupby("User-ID").ISBN.transform('nunique')>=10]
    
    # Upravit velikost datasetu kvůli délce trénování a evaluace modelu.
    if len(dataset) > 150000:
        dataset = dataset.sample(150000)
    
    return dataset

def dataset_preprocessing(dataset):
    # Ponechat hodnocení v původním stavu
    dataset["OriginalRating"] = dataset["Book-Rating"]
    
    # Ponechat pouze hodnocení x+
    #dataset = dataset[dataset["Book-Rating"]>7]
    
    # !Není možné použít v kombinaci s metodou odčítání od průměru. Exponenciálně větší šance na doporučení s přibývajícím hodnocením.
    #dataset["Book-Rating"] = dataset["Book-Rating"].transform(lambda x: x*x)
    
    # Každý má hodnotící standard nastavený jinak. Mirka Spáčilová dává nejvíce 7/10.
    #dataset["Book-Rating"] = dataset.groupby("User-ID")["Book-Rating"].transform(lambda x: (x - x.mean()) / x.std()) #" "
    #dataset["Book-Rating"] = dataset["Book-Rating"].fillna(0)
    
    return dataset

def split_location(location):
    parts = location.split(",")

    while len(parts) < 3:
        parts.insert(0, "")  
    city = parts[-3].strip() if parts[-1] else "" 
    region = parts[-2].strip() if len(parts) > 1 and parts[-2] else ""
    country = parts[-1].strip() if len(parts) > 2 and parts[-3] else ""
    
    return pd.Series([city, region, country], index=['city', 'region', 'country'])

def input_preparation(dataframe, dataset):
    dataset_tuple = list(zip(dataframe["User-ID"], dataframe["ISBN"], dataframe["Book-Rating"]))
    interactions, weights = dataset.build_interactions(i for i in dataset_tuple)
    return interactions, weights

def check_isbns(isbn_list):
    isbn_set = set(isbn_list)
    books_isbn_set = set(books["ISBN"])
    
    common_isbns = isbn_set.intersection(books_isbn_set)
    
    invalid_isbns = [x for x in isbn_set if x not in books_isbn_set]
    
    print(f"{len(common_isbns)}/{len(isbn_set)} valid ISBNs.")
    
    if len(common_isbns) != len(isbn_set):
        print("")
        print ("Invalid ISBNs:")
        for isbn in invalid_isbns:
            print(isbn)

def previous_ratings(userid, noitems=None):
    user_ratings = ratings[ratings["User-ID"]==userid]
    user_books = pd.merge(user_ratings, books, on="ISBN")
    user_books = user_books.sort_values(by="Book-Rating", ascending=False)
    
    if noitems is not None:
        user_books = user_books[:noitems]

    print(f"User ID: {userid}. Previous ratings (highest to lowest):")

    for _, row in user_books.iterrows():
        print(f"- {int((row['OriginalRating']))}*, {row['Book-Title']}, {row['Book-Author']}, {row['Year-Of-Publication']} (ISBN: {row['ISBN']})")
        
def prediction(userid, where, noitems, remove_rated):
    usercode = user_id_map.get(userid)
    
    _, n_items = train_interactions.shape
    scores = pd.Series(model.predict(usercode,np.arange(n_items)))

    items = pd.DataFrame(scores)

    items.index = items.index.map(item_id_inverse_map)
    items = items.reset_index(names="ISBN").rename(columns={0: "prob"})

    if remove_rated == True:
        ratedisbn = ratings[ratings["User-ID"]==userid]["ISBN"]
    elif remove_rated == False:
        ratedisbn = []

    booksfinal = pd.merge(books,items,on="ISBN",how="inner")

    if where == PredictionType.BEST:
        booksfinal = booksfinal.sort_values(by="prob", ascending=False)
        print(f"Predicted: best {noitems}:")
    elif where == PredictionType.WORST:
        booksfinal = booksfinal.sort_values(by="prob", ascending=True)
        print(f"Predictd: worst {noitems}:")

    for _, row in booksfinal[~booksfinal["ISBN"].isin(ratedisbn)][:noitems].iterrows():
        print(f"- {row['Book-Title']}, {row['Book-Author']}, {row['Year-Of-Publication']} (ISBN: {row['ISBN']})")

<h1>Data Reading</h1>

In [11]:
ratings = pd.read_csv("data/Ratings.csv")
books=pd.read_csv('data/Books.csv')
users=pd.read_csv("data/Users.csv")

  books=pd.read_csv('data/Books.csv')


<h1>DataFrame Preprocessing</h1>

<h2>Ratings</h2>

In [12]:
ratings = interactions_filtering(ratings)
ratings = dataset_preprocessing(ratings)
ratings

Unnamed: 0,User-ID,ISBN,Book-Rating,OriginalRating
1456,277427,002542730X,10,10
1474,277427,0061009059,9,9
1522,277427,0316776963,8,8
1543,277427,0345413903,10,10
1564,277427,0380702843,8,8
...,...,...,...,...
1149590,276680,0688163173,10,10
1149592,276680,0743203631,7,7
1149604,276680,0743486226,6,6
1149629,276680,1573229083,7,7


In [13]:
interactions_unique_users = ratings["User-ID"].unique()
interactions_unique_isbns = ratings["ISBN"].unique()

print("Unique users: ", len(interactions_unique_users))
print("Unique books: ", len(interactions_unique_isbns))

Unique users:  2158
Unique books:  3140


<h2>Users (features)</h2>

In [14]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [15]:
# Rozdělení Location feature na City, Region a Country
users[["City", "Region", "Country"]] = users["Location"].apply(lambda x: split_location(x))

# Nahrazení NaN hodnot ve věku nejčetnější hodnotou v datasetu
users["Age"] = users["Age"].fillna(users["Age"].mode()[0])

# Odstranění uživatelů, kteří nejsou v Ratings datasetu - kvůli dimenzionalitě
users = users[users["User-ID"].isin(set(interactions_unique_users))]

# List všech features - vstup pro dataset
unique_user_features = users[["Age", "City", "Region", "Country"]].values.flatten().tolist()

# Veškeré features spojené s User Id
user_features_tpl = [(user_id, [age, city, region, country]) 
             for user_id, age, city, region, country 
             in zip(users["User-ID"], users["Age"], users["City"], users["Region"], users["Country"])]

<h2>Books (features)</h2>

In [16]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [17]:
# Odstranění knih, kteří nejsou v Ratings datasetu - kvůli dimenzionalitě
books = books[books["ISBN"].isin(set(interactions_unique_isbns))]

# List všech features - vstup pro dataset
unique_book_features = books[["Book-Author", "Year-Of-Publication", "Publisher"]].values.flatten().tolist()

# Veškeré features spojené s ISBN
book_features_tpl = [(isbn, [author, year, publisher]) 
             for isbn, author, year, publisher 
             in zip(books["ISBN"], books["Book-Author"], books["Year-Of-Publication"], books["Publisher"])]

<h1>Train/Test split (Interactions)</h1>

In [18]:
#train, test = train_test_split(ratings, test_size=0.2, random_state=42)

In [19]:
# Unique user IDs
user_ids = ratings['User-ID'].unique()

# Split user IDs into train and test sets
train_users, test_users = train_test_split(user_ids, test_size=0.2, random_state=42)

# Split the DataFrame into train and test sets based on user IDs
train = ratings[ratings['User-ID'].isin(train_users)]
test = ratings[ratings['User-ID'].isin(test_users)]

In [20]:
print("Train", len(train), "hodnocení")
print("Unikátní uživatelé: ", len(train["User-ID"].unique()))
print("Unikátní knihy: ", len(train["ISBN"].unique()))
print("AVG počet hodnocení na uživatele: ", len(train) / len(train["User-ID"].unique()))
print("AVG počet hodnocení na knihu: ", len(train) / len(train["ISBN"].unique()))

Train 41066 hodnocení
Unikátní uživatelé:  1726
Unikátní knihy:  3137
AVG počet hodnocení na uživatele:  23.792584009269987
AVG počet hodnocení na knihu:  13.090851131654446


In [21]:
print("Test", len(test), "hodnocení")
print("Unikátní uživatelé: ", len(test["User-ID"].unique()))
print("Unikátní knihy: ", len(test["ISBN"].unique()))
print("AVG počet hodnocení na uživatele: ", len(test) / len(test["User-ID"].unique()))
print("AVG počet hodnocení na knihu: ", len(test) / len(test["ISBN"].unique()))

Test 10623 hodnocení
Unikátní uživatelé:  432
Unikátní knihy:  2790
AVG počet hodnocení na uživatele:  24.59027777777778
AVG počet hodnocení na knihu:  3.80752688172043


<h1>Input preparation</h1>

<h2>Interactions</h2>

In [22]:
full_dataset = Dataset()
full_dataset.fit(users=interactions_unique_users, 
                 items=interactions_unique_isbns, 
                 user_features=unique_user_features,
                 item_features=unique_book_features)


user_id_map, user_features_map, item_id_map, item_features_map = full_dataset.mapping()

user_id_inverse_map = {v: k for k, v in user_id_map.items()}
item_id_inverse_map = {v: k for k, v in item_id_map.items()}

In [23]:
train_interactions, train_weights = input_preparation(train, full_dataset)
test_interactions, test_weights = input_preparation(test, full_dataset)

<h2>Features</h2>

In [24]:
user_features_matrix = full_dataset.build_user_features(user_features_tpl, normalize=False)

In [25]:
item_features_matrix = full_dataset.build_item_features(book_features_tpl, normalize=False)

<h1>Model</h1>

In [26]:
model = LightFM(loss="warp",
                learning_schedule="adagrad",
                learning_rate=0.01,
                item_alpha=0.005, # L2 regularizace
                user_alpha=0.005,
                no_components = 250,
                k=10,
               )

In [27]:
model.fit(train_interactions,
          sample_weight=train_weights,
          user_features=user_features_matrix,
          item_features=item_features_matrix,
          epochs=100, 
          num_threads=4,
          verbose=True,
         )

Epoch: 100%|██████████████████████████████████| 100/100 [01:09<00:00,  1.44it/s]


<lightfm.lightfm.LightFM at 0x2bb20db50>

<h1>Model Evaluation</h1>

In [28]:
train_auc = auc_score(model,
                      train_interactions,
                      user_features=user_features_matrix,
                      item_features=item_features_matrix,
                     ).mean()

test_auc = auc_score(model,
                     test_interactions,
                     train_interactions=train_interactions,
                     user_features=user_features_matrix,
                     item_features=item_features_matrix,
                    ).mean()

print(f"Train AUC: {train_auc}")
print(f"Test AUC: {test_auc}")

Train AUC: 0.9927167892456055
Test AUC: 0.6455078721046448


In [29]:
train_precision = precision_at_k(model,
                                 train_interactions,
                                 user_features=user_features_matrix,
                                 item_features=item_features_matrix,
                                 k=10,
                                ).mean()

test_precision = precision_at_k(model,
                                test_interactions,
                                train_interactions=train_interactions,
                                user_features=user_features_matrix,
                                item_features=item_features_matrix,
                                k=10,
                                ).mean()

print(f"Train Precision: {train_precision}")
print(f"Test Precision: {test_precision}")

Train Precision: 0.5327925682067871
Test Precision: 0.05486111342906952


In [30]:
train_recall = recall_at_k(model,
                           train_interactions,
                           user_features=user_features_matrix,
                           item_features=item_features_matrix,
                           k=10,
                          ).mean()

test_recall = recall_at_k(model,
                          test_interactions,
                          train_interactions=train_interactions,
                          user_features=user_features_matrix,
                          item_features=item_features_matrix,
                          k=10,
                         ).mean()

print(f"Train Recall: {train_recall}")
print(f"Test Recall: {test_recall}")

Train Recall: 0.29586290165683843
Test Recall: 0.02485984105859187


<h1>Single user Prediction</h1>

In [31]:
userid = int(ratings.sample(1)["User-ID"]) # random user
userid

  userid = int(ratings.sample(1)["User-ID"]) # random user


167494

In [35]:
previous_ratings(userid, 10)

User ID: 167494. Previous ratings (highest to lowest):
- 8*, The Tommyknockers, Stephen King, 1994 (ISBN: 0451156609)
- 8*, The Color of Water: A Black Man's Tribute to His White Mother, James McBride, 1997 (ISBN: 1573225789)
- 7*, Red Dragon, Thomas Harris, 2000 (ISBN: 0440206154)
- 7*, The Witch of Blackbird Pond (Laurel Leaf Books), ELIZABETH GEORGE SPEARE, 1978 (ISBN: 0440995779)
- 7*, Seinlanguage, Jerry Seinfeld, 1995 (ISBN: 0553569155)
- 7*, All the Pretty Horses (The Border Trilogy, Vol 1), CORMAC MCCARTHY, 1993 (ISBN: 0679744398)
- 7*, What Dreams May Come : A Novel, Richard Matheson, 1998 (ISBN: 0812570944)
- 6*, The Accidental Tourist, Anne Tyler, 1994 (ISBN: 0425092917)
- 5*, FORREST GUMP (Movie Tie in), Winston Groom, 1994 (ISBN: 0671894455)
- 5*, All I Really Need to Know, ROBERT FULGHUM, 1989 (ISBN: 080410526X)


In [36]:
prediction(userid, PredictionType.BEST, 10, remove_rated=False)

Predicted: best 10:
- The Lovely Bones: A Novel, Alice Sebold, 2002 (ISBN: 0316666343)
- The Da Vinci Code, Dan Brown, 2003 (ISBN: 0385504209)
- Interview with the Vampire, Anne Rice, 1993 (ISBN: 0345337662)
- Where the Heart Is (Oprah's Book Club (Paperback)), Billie Letts, 1998 (ISBN: 0446672211)
- Harry Potter and the Order of the Phoenix (Book 5), J. K. Rowling, 2003 (ISBN: 043935806X)
- Girl with a Pearl Earring, Tracy Chevalier, 2001 (ISBN: 0452282152)
- Good in Bed, Jennifer Weiner, 2002 (ISBN: 0743418174)
- 1st to Die: A Novel, James Patterson, 2002 (ISBN: 0446610038)
- Face the Fire (Three Sisters Island Trilogy), Nora Roberts, 2002 (ISBN: 051513287X)
- SHIPPING NEWS, Annie Proulx, 1994 (ISBN: 0671510053)


<h1>New user: books unrated before</h1>

<h2>Update datasetu a modelu</h2>

In [37]:
"""
New User Testing
user_id = 123456

Sci-Fi Fantasy fan
Crime not fan

375704027 -> 9 (Murakami)
385177259 -> 10 (Asimov)
345309014 -> 10 (Asimov)
60929871 -> 8 (Brave New World, Huxley)
553106635 -> 10 (Game of Thrones)
3548245978 -> 1 (Nesbo)
3548255442 -> 2 (Nesbo)
877959943 -> 1 (McBain)
877959870 -> 4 (McBain)
446517380 -> 3 (McBain)
553256785 -> 2 (A. Christie)
671555235 -> 3 (A. Christie)
"""

ratings_new_userid = 123456
ratings_new_isbn = ["0375704027", 
                    "0385177259", 
                    "0345309014", 
                    "0060929871", 
                    "0553106635", 
                    "3548245978", 
                    "3548255442", 
                    "0877959943", 
                    "0877959870", 
                    "0446517380", 
                    "0553256785", 
                    "0671555235"]
ratings_new_rating = [9, 10, 10, 8, 10, 1, 2, 1, 4, 3, 2, 3]
ratings_new = pd.DataFrame({
    "User-ID": ratings_new_userid,
    "ISBN": ratings_new_isbn,
    "Book-Rating": ratings_new_rating
})

In [38]:
check_isbns(ratings_new_isbn)

2/12 valid ISBNs.

Invalid ISBNs:
0385177259
3548245978
0345309014
0446517380
0671555235
0877959870
0553256785
3548255442
0877959943
0553106635


In [39]:
# Preprocessing nových dat
ratings_new = dataset_preprocessing(ratings_new)

# Spojení v celým Ratings datasetem
ratings = pd.concat([ratings, ratings_new], ignore_index=True)

In [40]:
full_dataset.fit_partial(users=ratings_new["User-ID"].unique(), 
                         items=ratings_new["ISBN"].unique())

user_id_map, user_feature_map, item_id_map, item_feature_map = full_dataset.mapping()

user_id_inverse_map = {v: k for k, v in user_id_map.items()}
item_id_inverse_map = {v: k for k, v in item_id_map.items()}

In [41]:
new_interactions, new_weights = input_preparation(ratings_new, full_dataset)

In [42]:
# Fit Partial for Model (model.fit_partial) nefunguje pro nové uživatele - dimenzionalita nesedí

model.fit(new_interactions,
          sample_weight=new_weights,
          #user_features=user_features_matrix,
          #item_features=item_features_matrix,
          num_threads=4,
          verbose=True,
         )

Epoch: 100%|█████████████████████████████████████| 1/1 [00:00<00:00, 357.39it/s]


<lightfm.lightfm.LightFM at 0x2bb20db50>

<h2>Predikce</h2>

In [43]:
# Problém je, že knihy v novém datasetu nebyly hodnoceny vícekrát, takže predikce pokulhává.

user_id = 123456
no_items = 10

previous_ratings(user_id, no_items) 
print("")
prediction(user_id, PredictionType.BEST, no_items, remove_rated=False)

User ID: 123456. Previous ratings (highest to lowest):
- 9*, Norwegian Wood (Vintage International Original), Haruki Murakami, 2000 (ISBN: 0375704027)
- 8*, Brave New World, Aldous Huxley, 1998 (ISBN: 0060929871)

Predicted: best 10:
- Norwegian Wood (Vintage International Original), Haruki Murakami, 2000 (ISBN: 0375704027)
- Brave New World, Aldous Huxley, 1998 (ISBN: 0060929871)
- Relic, Douglas Preston, 2003 (ISBN: 0812543262)
- Back When We Were Grownups : A Novel (Ballantine Reader's Circle), ANNE TYLER, 2002 (ISBN: 0345446860)
- Loose Screws (Red Dress Ink (Paperback)), Karen Templeton, 2002 (ISBN: 0373250193)
- Blood Shot (V.I. Warshawski Novels (Paperback)), Sara Paretsky, 1989 (ISBN: 0440204208)
- Tunnel Vision (V.I. Warshawski Novels (Paperback)), Sara Paretsky, 1995 (ISBN: 0440217520)
- Love You Forever, Robert N. Munsch, 1986 (ISBN: 0920668372)
- The Notebook, Nicholas Sparks, 1998 (ISBN: 0446605239)
- Still Pumped From Using The Mouse, Scott Adams, 1996 (ISBN: 0836210263)


<h1>New user: book rated before</h1>

In [44]:
"""
Fan of Sci-fi
Not a Fan of Classics

60809833 -> Brave New World, Huxley
60914653 -> The Unbearable Lightness of Being, Kundera
60934913 -> Kitchen Confidential: Adventures in the Culinary Underbelly, Anthony Bourdain
61020710 -> The Color of Magic, Terry Pratchett
64471047 -> The Lion, the Witch, and the Wardrobe (The Chronicles of Narnia, Book 2), C. S. Lewis
64471063 -> The Horse and His Boy, C. S. Lewis
64471101 -> The Magician's Nephew (rack) (Narnia), C. S. Lewis
64407667 -> The Bad Beginning (A Series of Unfortunate Events, Book 1), Lemony Snicket
64407675 -> The Reptile Room (A Series of Unfortunate Events, Book 2), Lemony Snicket
64407683 -> The Wide Window (A Series of Unfortunate Events, Book 3), Lemony Snicket
140042393 -> The Grapes of Wrath, John Steinbeck
140042520 -> Dharma Bums, Jack Kerouac
140042598 -> On the Road, Jack Kerouac
140053204 -> Travels With Charley: In Search of America, John Steinbeck
345303067 -> 2010: Odyssey Two, Arthur C. Clarke
345339703 -> The Fellowship of the Ring (The Lord of the Rings, Part 1), J.R.R. TOLKIEN
345339711 -> The Two Towers (The Lord of the Rings, Part 2), J.R.R. TOLKIEN
345389964 -> A Son of the Circus, John Irving
"""

ratings_new_userid = 123457
ratings_new_isbn = ['0060809833', 
                    '0060914653', 
                    '0060934913', 
                    '0061020710', 
                    '0064471047', 
                    '0064471063', 
                    '0064471101', 
                    '0064407667', 
                    '0064407675', 
                    '0064407683', 
                    '0140042393', 
                    '0140042520', 
                    '0140042598', 
                    '0140053204', 
                    '0345303067', 
                    '0345339703', 
                    '0345339711', 
                    '0345389964']
ratings_new_rating = [9,2,1,8,9,10,8, 7, 7, 8, 0, 3, 1, 2, 8, 9, 10, 4]
ratings_new = pd.DataFrame({
    "User-ID": ratings_new_userid,
    "ISBN": ratings_new_isbn,
    "Book-Rating": ratings_new_rating
})

In [45]:
# Features

In [46]:
check_isbns(ratings_new_isbn)

18/18 valid ISBNs.


In [47]:
# Preprocessing nových dat
ratings_new = dataset_preprocessing(ratings_new)

# Spojení v celým Ratings datasetem
ratings = pd.concat([ratings, ratings_new], ignore_index=True)

In [48]:
full_dataset.fit_partial(users=ratings_new["User-ID"].unique(), 
                         items=ratings_new["ISBN"].unique())

user_id_map, user_feature_map, item_id_map, item_feature_map = full_dataset.mapping()

user_id_inverse_map = {v: k for k, v in user_id_map.items()}
item_id_inverse_map = {v: k for k, v in item_id_map.items()}

In [49]:
new_interactions, new_weights = input_preparation(ratings_new, full_dataset)

In [50]:
# Fit Partial for Model (model.fit_partial) nefunguje pro nové uživatele - dimenzionalita neodpovídá

model.fit(new_interactions,
          sample_weight=new_weights,
          num_threads=4,
          verbose=True,
         )

Epoch: 100%|█████████████████████████████████████| 1/1 [00:00<00:00, 479.13it/s]


<lightfm.lightfm.LightFM at 0x2bb20db50>

In [51]:
user_id = 123457
no_items = 10

previous_ratings(user_id, no_items) 
print("")
prediction(user_id, PredictionType.BEST, no_items, remove_rated=True)

User ID: 123457. Previous ratings (highest to lowest):
- 10*, The Two Towers (The Lord of the Rings, Part 2), J.R.R. TOLKIEN, 1986 (ISBN: 0345339711)
- 10*, The Horse and His Boy, C. S. Lewis, 1994 (ISBN: 0064471063)
- 9*, Brave New World, Aldous Huxley, 1989 (ISBN: 0060809833)
- 9*, The Fellowship of the Ring (The Lord of the Rings, Part 1), J.R.R. TOLKIEN, 1986 (ISBN: 0345339703)
- 9*, The Lion, the Witch, and the Wardrobe (The Chronicles of Narnia, Book 2), C. S. Lewis, 1994 (ISBN: 0064471047)
- 8*, 2010: Odyssey Two, Arthur C. Clarke, 1984 (ISBN: 0345303067)
- 8*, The Wide Window (A Series of Unfortunate Events, Book 3), Lemony Snicket, 2000 (ISBN: 0064407683)
- 8*, The Magician's Nephew (rack) (Narnia), C. S. Lewis, 2002 (ISBN: 0064471101)
- 8*, The Color of Magic, Terry Pratchett, 2000 (ISBN: 0061020710)
- 7*, The Bad Beginning (A Series of Unfortunate Events, Book 1), Lemony Snicket, 1999 (ISBN: 0064407667)

Predicted: best 10:
- Different Seasons (Signet), Stephen King, 2004 (I

<h1>New user: no ratings before (cold start)</h1>

new_userid = 123458

features = [24.0, 'arden hills', 'minnesota', 'usa']

full_dataset.fit_partial(users=np.array([new_userid]), 
                         user_features=features,
                        )

user_id_map, user_feature_map, item_id_map, item_feature_map = full_dataset.mapping()

user_id_inverse_map = {v: k for k, v in user_id_map.items()}
item_id_inverse_map = {v: k for k, v in item_id_map.items()}

#features
user_tuple = (new_userid, [24.0, 'arden hills', 'minnesota', 'usa'])

#model.fit()

model.fit(train_interactions,
          #sample_weight=train_weights,
          user_features=user_features_matrix,
          #item_features=item_features_matrix,
          #epochs=100, 
          #num_threads=4,
          #verbose=True,
         )

<h1>Model save</h1>