In [1]:
# imports
import pandas as pd
import openai
from typing import List
import numpy as np
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances
from collections import Counter
import os

In [18]:
# Load the per-user table: one row per user, with the space-separated IDs of the
# articles they interacted with ('ID') and an embedding of those interactions
# ('Interactions_emb'), which is stored in the CSV as text.
users = pd.read_csv("embeddings/users_filtered_final.csv") # user interactions and their embeddings
users.head()

Unnamed: 0,User,ID,Interactions_emb
0,U244,N17157 N38621 N35022 N50578 N264 N9120 N23907 ...,"[-0.005149974951877837, -0.013250857458654631,..."
1,U68369,N19381 N54536,"[0.0025621717686590273, 0.004183989018201828, ..."
2,U50236,N4020 N44292 N50292 N40772 N57737 N33969 N4054...,"[-0.010138329240492436, -0.01179651383115145, ..."
3,U77060,N23105 N41375,"[-0.005568941123783588, -0.025914330035448074,..."
4,U5596,N459 N56253 N62931 N55846 N29849 N45729 N62834...,"[-0.012533644353970886, -0.011675744312297967,..."


In [19]:
# Load the article table: one row per news article with its category,
# sub-category, text content and a content embedding stored as text.
news = pd.read_csv("embeddings/news_emb_final.csv") # news articles and their content embeddings
news.head()

Unnamed: 0,ID,Category,SubCategory,Content,Content_emb
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","[0.005885085556656122, -0.007782096974551678, ..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat These seemingly ...,"[-0.004876355174928904, -0.007969613187015057,..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,"[-0.02760046347975731, -0.013719998300075531, ..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"[-0.0297758337110281, -0.014837449416518211, 0..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","[0.005073545966297388, 0.004160495940595865, 0..."


In [20]:
# Map every user ID to a ('Content', embedding) pair, where the embedding is the
# list of floats parsed from the textual "[v1, v2, ...]" form held in the
# 'Interactions_emb' column. Blank components are read as 0.0.
user_dict = {}
for user, raw_embedding in zip(users['User'], users['Interactions_emb']):
    components = raw_embedding.strip('[]').split(',')
    parsed = [float(part) if part.strip() else 0.0 for part in components]
    user_dict[user] = ('Content', parsed)


In [21]:
def print_recommendations_for_user(
    user_id: str,
    users_dict: dict,
) -> pd.DataFrame:
    """Rank the other users by cosine distance to ``user_id``'s embedding.

    Despite its name, this function prints nothing; it returns the ranking.

    Args:
        user_id: Key of the user to find neighbours for.
        users_dict: Mapping ``user -> (label, embedding)``; only element ``[1]``
            (the embedding, a sequence of floats) is read.

    Returns:
        DataFrame with columns ``'User'`` and ``'distance'``, ordered from the
        nearest neighbour to the farthest. The queried user and any user at
        distance exactly 0 are left out.

    Raises:
        KeyError: If ``user_id`` is not a key of ``users_dict``.
    """
    query_embedding = users_dict[user_id][1]

    # Keys and embeddings are collected in the same order so that position i in
    # `distances` refers to user_keys[i].
    user_keys = list(users_dict.keys())
    candidate_embeddings = [users_dict[key][1] for key in user_keys]

    # Helpers come from openai.embeddings_utils; presumably they return one
    # distance per candidate (in input order) and the candidate indices sorted
    # by increasing distance -- the indexing below relies on that.
    distances = distances_from_embeddings(query_embedding, candidate_embeddings, distance_metric="cosine")
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    selected_user_keys = []
    selected_distances = []
    for i in indices_of_nearest_neighbors:
        user_key = user_keys[i]
        # Exclude the queried user by key: the floating-point distance of a
        # vector to itself is not guaranteed to be exactly 0, so the distance
        # test alone cannot be trusted to filter it out. The zero-distance
        # filter is kept so identical-embedding users are still skipped.
        if user_key == user_id or distances[i] == 0:
            continue
        selected_user_keys.append(user_key)
        selected_distances.append(distances[i])

    return pd.DataFrame({
        'User': selected_user_keys,
        'distance': selected_distances
    })

In [22]:
def user_list(user_id, df):
    """Return the article IDs a user interacted with, as a list of strings.

    Args:
        user_id: Value to look up in ``df['User']``.
        df: DataFrame with a ``'User'`` column and an ``'ID'`` column holding
            space-separated article IDs.

    Returns:
        The whitespace-split IDs from the first matching row, or ``[]`` when
        the user is not present or their ``'ID'`` cell is missing (NaN/None).
    """
    matches = df.loc[df['User'] == user_id, 'ID'].tolist()
    if not matches:
        return []

    first = matches[0]
    if not isinstance(first, str):
        # A missing cell is read back as NaN (a float), which has no .split().
        if pd.api.types.is_scalar(first) and pd.isna(first):
            return []
        first = str(first)
    return first.split()

In [23]:
def print_article_recommendations(user_id):
    """Score every news article for ``user_id`` from its neighbours' reading history.

    Despite its name, this function prints nothing; it returns the ranking.
    It reads the notebook-level ``user_dict``, ``users`` and ``news`` objects.

    An article's ``article_distance`` is the mean distance of the neighbouring
    users who interacted with it (sum of their distances divided by how many
    of them there are). Articles no neighbour interacted with get distance 1.

    Args:
        user_id: User to build recommendations for.

    Returns:
        DataFrame derived from ``news`` (content/category/embedding columns
        dropped) with an ``article_distance`` column, sorted ascending, and
        without the articles the user has already interacted with.
    """
    neighbours = print_recommendations_for_user(user_id, user_dict)

    # One row per (neighbouring user, article they interacted with).
    collab_recommender = pd.merge(neighbours, users, on='User', how='left')
    collab_recommender = collab_recommender.drop(['Interactions_emb'], axis=1)
    collab_recommender['ID'] = collab_recommender['ID'].str.split()
    collab_recommender = collab_recommender.explode('ID')

    # Per article: total neighbour distance and number of neighbours, computed
    # in a single named aggregation (string aggregators avoid the pandas
    # FutureWarning raised when passing np.sum to .agg).
    per_article = (
        collab_recommender
        .groupby('ID')
        .agg(distance=('distance', 'sum'), N_users=('User', 'size'))
        .reset_index()
    )
    per_article['article_distance'] = per_article['distance'] / per_article['N_users']

    collab_final = pd.merge(news, per_article[['ID', 'article_distance']], on='ID', how='left')
    collab_final = collab_final.drop(['Content_emb', 'Category', 'SubCategory', 'Content'], axis=1)

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: the chained in-place form is deprecated and does not
    # modify the frame under pandas Copy-on-Write.
    collab_final['article_distance'] = collab_final['article_distance'].fillna(1)
    collab_final = collab_final.sort_values(by='article_distance', ascending=True, na_position='last')

    # Keep only the articles the user has not interacted with yet.
    already_read = collab_final['ID'].isin(user_list(user_id, users))
    return collab_final[~already_read]


Saving recommendations for each user

In [24]:
# Every user ID from the 'User' column of `users`, in row order.
users_list = users.loc[:, 'User'].tolist()
len(users_list)

49945

In [25]:
# Cut the user list into three contiguous slices. The first two hold
# `part_size` users each; the third takes everything that is left, including
# the remainder of the integer division.
part_size = len(users_list) // 3
part1, part2, part3 = (
    users_list[:part_size],
    users_list[part_size:2 * part_size],
    users_list[2 * part_size:],
)

In [26]:
# Slice of the user list handled by this run of the notebook.
# NOTE(review): presumably part2/part3 are processed in separate runs -- confirm.
mypart = part1

In [27]:
# # Calculate the number of unique users in df
# num_unique_users = users['User'].nunique()

# print("Number of unique users:", num_unique_users)

In [3]:
def list_ids_in_folder(folder_path):
    """Collect the distinct IDs encoded in the ``.csv`` file names of a folder.

    For each ``.csv`` entry, the ID is the text before the first underscore
    with its first character removed (e.g. ``"U12_collab.csv"`` -> ``"12"``).

    Args:
        folder_path: Directory whose entries are listed.

    Returns:
        List of the unique ID strings, in no particular order.
    """
    unique_ids = {
        entry.split("_")[0][1:]
        for entry in os.listdir(folder_path)
        if entry.endswith(".csv")
    }
    return list(unique_ids)


In [4]:
# User IDs that already have a saved recommendations file. The helper returns
# the IDs without their leading character, so the 'U' prefix is put back here.
folder_path = 'collaborative_recommendations'
ids_list = [f'U{num}' for num in list_ids_in_folder(folder_path)]
len(ids_list)

33296

In [30]:
# Users of this run's slice that do not have a saved recommendations file yet.
set2 = set(ids_list)  # set, so each membership test below is a hash lookup
# The comparison uses the whitespace-stripped ID, but the original (unstripped)
# value is what gets kept in rest_users.
rest_users = [item for item in mypart if item.strip() not in set2]
len(rest_users)

0

In [31]:
def user_list(user_id, df):
    """Return the article IDs a user interacted with, as a list of strings.

    NOTE: this redefines the ``user_list`` from an earlier cell; whichever
    definition was executed last is the one in effect.

    Args:
        user_id: Value to look up in ``df['User']``.
        df: DataFrame with a ``'User'`` column and an ``'ID'`` column holding
            space-separated article IDs.

    Returns:
        The whitespace-split IDs from the first matching row, or ``[]`` when
        the user is not present or their ``'ID'`` cell is missing (NaN/None).
        Other non-string values are converted with ``str()`` before splitting.
    """
    matches = df.loc[df['User'] == user_id, 'ID'].tolist()
    if not matches:
        return []

    first = matches[0]
    if not isinstance(first, str):
        # A missing cell must yield no IDs; str(nan) would otherwise produce
        # the bogus token list ['nan'].
        if pd.api.types.is_scalar(first) and pd.isna(first):
            return []
        first = str(first)
    return first.split()


In [32]:
# Compute the article recommendations of every user still missing an output
# file and write them to '<user>_collab.csv' in the recommendations folder.
for i in rest_users:
    print_article_recommendations(user_id=i).to_csv(
        f'collaborative_recommendations/{i}_collab.csv',
        index=False,
    )