In [1]:
import torch
from torch.nn.functional import cosine_similarity
from cf_model import MLPCollaborativeFilter,MLPContentFilter  # Import your model class

# import sqlite3

In [2]:
DB_NAME = 'indie_letterboxd_v2'

In [3]:
!ls

best_col_model_checkpoint.pth		main.ipynb
best_contentfiltermodel_checkpoint.pth	main.py
best_model_checkpoint.pth		matching.ipynb
best_model_state.pth			menv
cf_model.py				mlruns
col_encoder.pkl				models
con_encoder.pkl				movie.py
config.py				path_to_your_database.db
dataset.py				__pycache__
db.ipynb				q.py
Dockerfile				README.md
done.txt				requirements.txt
encoder.pkl				server
fdstests.ipynb				test.db
filemanager.py				testsources.ipynb
final_col_model_checkpoint.pth		todo.txt
final_con_model_checkpoint.pth		train_colfilter.ipynb
holocenemodels				train_confilter.ipynb
indie_letterboxd.db			user.py
indie_letterboxd_v2.db			wandb
letterboxd				web_spider.py
letterboxd.db


In [4]:
## Load models

In [5]:
import pickle

# Replace 'yourfile.pkl' with the path to your actual pickle file
pickle_file_path = 'col_encoder.pkl'

# Open the file in binary read mode
with open(pickle_file_path, 'rb') as file:
    # Load the object from the file
    col_encoder = pickle.load(file)

# Replace 'yourfile.pkl' with the path to your actual pickle file
pickle_file_path = 'con_encoder.pkl'

# Open the file in binary read mode
with open(pickle_file_path, 'rb') as file:
    # Load the object from the file
    con_encoder = pickle.load(file)

In [None]:
FEATURES = 700
#col features
num_users = len(col_encoder.vocab_to_idx['users'])+1
num_movies = len(col_encoder.vocab_to_idx['movies'])+1
#con features
num_mv_features = len(con_encoder.one_hot_encode('Action','genres')) + len(con_encoder.one_hot_encode('Bulgaria','countries'))
con_num_users = len(con_encoder.vocab_to_idx['users'])

In [10]:
col_model = MLPCollaborativeFilter(num_users, num_movies, embedding_dim=FEATURES)

# Load the entire checkpoint
checkpoint = torch.load('./final_col_model_checkpoint.pth')

# Extract the model's state dictionary from the checkpoint
model_state_dict = checkpoint['model_state_dict']

# Now load the state dictionary into your model
col_model.load_state_dict(model_state_dict)

con_model = MLPContentFilter(len(con_encoder.vocab_to_idx['users']),num_mv_features, embedding_dim=FEATURES)
# Load the entire checkpoint
checkpoint = torch.load('./final_con_model_checkpoint.pth')
# Extract the model's state dictionary from the checkpoint
model_state_dict = checkpoint['model_state_dict']
# Now load the state dictionary into your model
con_model.load_state_dict(model_state_dict)

<All keys matched successfully>

In [17]:
def find_similar_users(model, target_user, top_n=10):
    """
    Find top_n most similar users to the target_user_id based on their user embeddings.
    
    Args:
    - model: The trained collaborative filtering model with user embeddings.
    - target_user_id: The ID of the user for whom to find similar users.
    - top_n: Number of similar users to retrieve.
    
    Returns:
    - top_similar_users: Indices of the top_n similar users.
    """
    # Assuming 'user_embeddings' is retrieved from your model
    user_embeddings = model.user_embedding.weight.data

    # Ensure target_user_id is valid
    target_user_enc = col_encoder.encode(target_user,'users')
    print(target_user_enc)
    if target_user_enc >= len(user_embeddings):
        raise ValueError("Target user ID is out of range.")
    
    target_user_embedding = user_embeddings[target_user_enc].unsqueeze(0)
    similarities = cosine_similarity(target_user_embedding, user_embeddings)

    # Adjust top_n if necessary
    num_users = user_embeddings.size(0)
    actual_top_n = min(top_n, num_users - 1)  # Ensure we don't exceed the number of available users
    
    # Find top similar users excluding the target user itself
    values, indices = torch.topk(similarities, actual_top_n + 1)
    top_similar_users = indices[indices != target_user_enc][:actual_top_n]  # Exclude target user and adjust to actual_top_n
    
    return top_similar_users.tolist()


In [47]:
def find_similar_users_avghybrid(target_user, genre, country, top_n=10):
    """
    Find top_n most similar users to the target_user_id based on their user embeddings.
    
    Args:
    - model: The trained collaborative filtering model with user embeddings.
    - target_user_id: The ID of the user for whom to find similar users.
    - top_n: Number of similar users to retrieve.
    
    Returns:
    - top_similar_users: Indices of the top_n similar users.
    """
    # Assuming 'user_embeddings' is retrieved from your model
    user_embeddings = col_model.user_embedding.weight.data

    # Ensure target_user_id is valid
    target_user_enc = col_encoder.encode(target_user,'users')
    print(target_user_enc)
    if target_user_enc >= len(user_embeddings):
        raise ValueError("Target user ID is out of range.")
    
    target_user_embedding = user_embeddings[target_user_enc].unsqueeze(0)
    cf_similarities = cosine_similarity(target_user_embedding, user_embeddings)



    from dataset import ConFDataset

    data = [(target_user,genre,country,0.0)]

    ds = ConFDataset(data,con_encoder)

    dl = torch.utils.data.DataLoader(ds, batch_size=1)

    con_model.eval() # swtich off batch normalisation
    with torch.no_grad():  # Disable gradient computation
        for user, movie_features, ratings in dl:
            cbf_scores = con_model(user,movie_features)  # Generate predictions


    # Step 3: Average CF and CBF similarities/scores
    combined_scores = (cf_similarities.squeeze() + cbf_scores) / 2

    print('combined_scores :',combined_scores)
    # Step 4: Find top similar users, excluding the target user
    _, indices = torch.topk(combined_scores, top_n + 1)  # Get indices of top scores
    top_similar_users = [con_encoder.decode(idx.item(),'users') for idx in indices if idx != target_user_enc][:top_n]  # Exclude target user
    
    return top_similar_users

In [42]:
def get_user_movie_profiles():



    import sqlite3

    # Connect to your database
    conn = sqlite3.connect(DB_NAME+'.db')
    cursor = conn.cursor()
    
    # SQL query
    query = '''
                SELECT 
                    Users.user_id,
                    Users.name AS user_name,
                    GROUP_CONCAT(DISTINCT Genres.genre_id) AS genre_ids,
                    GROUP_CONCAT(DISTINCT Genres.name) AS genre_names,
                    Countries.country_id,
                    Countries.country_name,
                    Movies.movie_id,
                    Movies.title AS movie_name,
                    Reviews.rating
                FROM 
                    Reviews
                JOIN
                    Users ON Reviews.user_id = Users.user_id
                JOIN 
                    Movies ON Reviews.movie_id = Movies.movie_id
                JOIN 
                    Countries ON Movies.country_id = Countries.country_id
                LEFT JOIN 
                    MoviesGenres ON Movies.movie_id = MoviesGenres.movie_id
                LEFT JOIN 
                    Genres ON MoviesGenres.genre_id = Genres.genre_id
                GROUP BY 
                    Reviews.review_id, Reviews.rating
                ORDER BY 
                    Movies.movie_id, Users.user_id
    
            '''
    
    # Execute the query
    cursor.execute(query)
    
    # Fetch all results
    results = cursor.fetchall()
    
    # Close the connection
    conn.close()
    
    return results

In [43]:
def get_user_movie_scores(user_name,user_movies):

    for user_movie in user_movies:
        
        if user_name in user_movie[1]:
            
            yield {'name':user_movie[1],'film name':user_movie[7],'rating':user_movie[-1]}
    

In [44]:
DB_NAME = 'indie_letterboxd_v2'
# Connect to your SQLite database
conn = sqlite3.connect(DB_NAME+'.db')

# Create a cursor object
cur = conn.cursor()

# Execute the query to fetch all users
cur.execute("SELECT user_id, name FROM Users")

# Fetch all rows
users = cur.fetchall()

# Close the connection
conn.close()

# Print the users
# for user_id, name in users:
#     print(user_id, name)
users[:10]

[('e5b13caf-817e-4161-ad39-b4a25bf3d367', 'Matias Kivimäki'),
 ('42e54389-1ba4-4a78-8e3c-856b71a92d12', 'Monica Johansson'),
 ('d9b17abb-9b5d-42e2-8de7-c3e356797052', 'Արշավիր Դուդուկչյան'),
 ('90fc05a6-a309-4ed9-82ab-5b2f1f97ce7f', 'रमेश दवाडी'),
 ('62e6acdc-aa52-4b80-a025-bb3fc3685512', 'लिटन मानन्धर'),
 ('ee809b11-b626-43b0-ab63-b8f086385e71', 'Lorena da Paiva'),
 ('0effd6e8-0fe1-4da3-8000-8c03034cefa5', 'Jennifer Paz Jaume'),
 ('f5e69203-1f6b-462a-8331-add5a67a3dbd', 'Erkki Hannula'),
 ('c7d0e89b-2377-4b8f-81e6-9d311db8f67c', 'Yolal Sezer'),
 ('33a9202c-a660-4696-93f2-be4067200911', 'Johannes Hartikainen')]

In [46]:
# Example usage
target_user_name = 'Francis Robinson'
genre = 'Drama,Romance'.split(',')
country = 'South Korea'




print(find_similar_users_avghybrid(target_user_name, genre, country, top_n=10))





649
[('Francis Robinson', ['Drama', 'Romance'], 'South Korea', 0.0)]
(tensor(319), tensor([0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.]), tensor(0.))
1
combined_scores : tensor([74.0444, 74.0686, 74.0434,  ..., 74.0467, 74.0879, 74.0410])
['Janet Barnett', 'Фрацил Джогов', 'Toby Knight', 'Rolandas Ambrasas', 'Emil Haugland', 'Leah Greene', 'Г-ца Лидийка Вампирска', 'Danuše Švecová', 'مهدیس روحانی', 'Filipa Assunção']


In [28]:
## Users

In [29]:
user_movies = get_user_movie_profiles()
user_movies[:5]

[('0381a2ce-de6e-4a92-8117-8a290833c219',
  'Terry Roberson',
  '2,9',
  'Drama,Romance',
  29,
  'South Korea',
  '001e5226-4f9f-4e01-a824-016ddcd62aad',
  'Endings Beginnings',
  0.4),
 ('05f8c9fa-0bce-4185-aaf2-1361253cdeba',
  'Edward Roman',
  '2,9',
  'Drama,Romance',
  29,
  'South Korea',
  '001e5226-4f9f-4e01-a824-016ddcd62aad',
  'Endings Beginnings',
  0.4),
 ('0687954c-a123-43ff-8833-7d5d5c8413a1',
  'विद्या नाम',
  '2,9',
  'Drama,Romance',
  29,
  'South Korea',
  '001e5226-4f9f-4e01-a824-016ddcd62aad',
  'Endings Beginnings',
  0.82),
 ('0b24d5c6-59e4-4fa5-9433-dd00f714dd31',
  'Francis Robinson',
  '2,9',
  'Drama,Romance',
  29,
  'South Korea',
  '001e5226-4f9f-4e01-a824-016ddcd62aad',
  'Endings Beginnings',
  1),
 ('122337fe-0776-4fe9-b89e-aae1fb6b7430',
  'Г-жа Панда Плюнкова',
  '2,9',
  'Drama,Romance',
  29,
  'South Korea',
  '001e5226-4f9f-4e01-a824-016ddcd62aad',
  'Endings Beginnings',
  1)]

In [31]:
decoded_users = [col_encoder.decode(su,'users') for su in similar_users]

In [33]:
target_user_film_reviews = [um for um in get_user_movie_scores(target_user_name,user_movies)]
target_user_film_reviews

[{'name': 'Lorena da Paiva', 'film name': 'Snow and the Bear', 'rating': 0.9},
 {'name': 'Lorena da Paiva', 'film name': 'America Latina', 'rating': 0.5},
 {'name': 'Lorena da Paiva',
  'film name': 'Mothers and Monsters',
  'rating': 0.6},
 {'name': 'Lorena da Paiva', 'film name': 'Riceboy Sleeps', 'rating': 0.8},
 {'name': 'Lorena da Paiva',
  'film name': 'The Happiest Man in the World',
  'rating': 0.7},
 {'name': 'Lorena da Paiva',
  'film name': 'Onoda 10000 Nights in the Jungle',
  'rating': 0.6},
 {'name': 'Lorena da Paiva', 'film name': 'The Woman King', 'rating': 0.7},
 {'name': 'Lorena da Paiva', 'film name': 'Soft', 'rating': 0.5},
 {'name': 'Lorena da Paiva', 'film name': 'Blood Quantum', 'rating': 0.7},
 {'name': 'Lorena da Paiva', 'film name': 'Victim', 'rating': 0.7},
 {'name': 'Lorena da Paiva', 'film name': 'Brothers', 'rating': 0.7},
 {'name': 'Lorena da Paiva', 'film name': 'Fire Will Come', 'rating': 0.8},
 {'name': 'Lorena da Paiva', 'film name': 'Free Money', 'ra

In [39]:
user_ums = [um for du in decoded_users for um in get_user_movie_scores(du,user_movies)]

In [40]:
user_ums

[{'name': '周建', 'film name': 'Stellar', 'rating': 0.8},
 {'name': '周建',
  'film name': 'Please Speak Continuously and Describe Your Experiences as They Come to You',
  'rating': 0.8},
 {'name': '周建', 'film name': 'You Are Not My Mother', 'rating': 0.6},
 {'name': '周建', 'film name': 'And Tomorrow the Entire World', 'rating': 0.4},
 {'name': '周建', 'film name': 'Zana', 'rating': 0.6},
 {'name': '周建', 'film name': 'Blow the Man Down', 'rating': 0.8},
 {'name': '周建', 'film name': 'The Pink Cloud', 'rating': 0.7},
 {'name': '周建', 'film name': 'Under the Fig Trees', 'rating': 0.8},
 {'name': '周建', 'film name': 'The Truffle Hunters', 'rating': 1},
 {'name': 'შაქრო ქორიძე',
  'film name': 'Please Speak Continuously and Describe Your Experiences as They Come to You',
  'rating': 0.7},
 {'name': 'შაქრო ქორიძე', 'film name': 'Nope', 'rating': 1},
 {'name': 'შაქრო ქორიძე', 'film name': 'Toras Husband', 'rating': 0.7},
 {'name': 'შაქრო ქორიძე', 'film name': 'Listening to Kenny G', 'rating': 1},
 {'n