In [1]:
# imports
import pandas as pd
import numpy as np
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances
import os

In [2]:
users = pd.read_csv("embeddings/users_emb_TEST.csv") #document with user interactions
users.columns =['User', 'ID', 'Interactions_emb']
users.head()

Unnamed: 0,User,ID,Interactions_emb
0,U13740,N55189 N42782 N34694 N45794 N18445 N63302 N104...,"[-0.015360403599010574, -0.022621901123784482,..."
1,U91836,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,"[-0.005392600822233362, -0.004443325935426401,..."
2,U73700,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,"[-0.004993118400064608, -0.008147992255787054,..."


In [3]:
news = pd.read_csv("embeddings/news_emb_TEST.csv") #document with user interactions
news.head()

Unnamed: 0,ID,Category,SubCategory,Content,Content_emb
0,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","[-0.020967688411474228, -0.020634232088923454,..."
1,N42782,sports,baseball_mlb,Three takeaways from Yankees' ALCS Game 5 vict...,"[-0.008319429121911526, 0.0013667173916473985,..."
2,N34694,tv,tvnews,Rosie O'Donnell: Barbara Walters Isn't 'Up to ...,"[-0.042908210307359695, -0.018993420526385307,..."
3,N45794,news,newscrime,Four flight attendants were arrested in Miami'...,"[-0.007930373772978783, -0.0221870094537735, 0..."
4,N18445,sports,football_ncaa,Michigan sends breakup tweet to Notre Dame as ...,"[-0.020153459161520004, -0.044797133654356, 0...."


In [4]:
# Create a dictionary with user interactions
user_dict = {}
for index, row in users.iterrows():
    user = row['User']
    interactions = row['Interactions_emb']
    user_dict[user] = ('Content', interactions)

for user, (content, embeddings_str) in user_dict.items():
    # Check for empty string before converting to float
    embeddings_list = [float(value) if value.strip() else 0.0 for value in embeddings_str.strip('[]').split(',')]
    user_dict[user] = (content, embeddings_list)


In [5]:
def print_recommendations_for_user(
    user_id: str,
    users_dict: dict,
) -> pd.DataFrame:
    user_embeddings = []
    user_articles_embeddings = users_dict[user_id][1]
    user_embeddings_np = np.array(user_articles_embeddings)
    user_embeddings = user_embeddings_np.tolist()

    content_dictionary = []
    for key in users_dict.keys():
        content_dictionary.append(users_dict[key][1])

    # Assume you have the following functions imported from embeddings_utils.py
    distances = distances_from_embeddings(user_embeddings, content_dictionary, distance_metric="cosine")
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    user_keys = list(users_dict.keys())  # Fixed a typo: users_dict instead of user_dict
    selected_user_keys = []
    selected_distances = []

    for i in indices_of_nearest_neighbors:
        if 0 <= i < len(user_keys) and distances[i] != 0:
            user_key = user_keys[i]
            selected_user_keys.append(user_key)
            selected_distances.append(distances[i])

    # Creating a DataFrame
    result_df = pd.DataFrame({
        'User': selected_user_keys,
        'distance': selected_distances
    })

    return result_df

In [6]:
#create list with interacted items for a particular user
def user_list(user_id, df):
    user_list = df.loc[df['User'] == user_id, 'ID'].tolist()
    if len(user_list) > 0:
        return user_list[0].split()
    else:
        return []

In [7]:
def print_article_recommendations(user_id):
    recommendations = print_recommendations_for_user(user_id, user_dict)
    collab_recommender = pd.merge(recommendations, users, on='User', how='left')
    collab_recommender = collab_recommender.drop(['Interactions_emb'], axis=1)
    
    collab_recommender['ID'] = collab_recommender['ID'].str.split()
    collab_recommender = collab_recommender.explode('ID')

    article_distances = collab_recommender.groupby('ID').agg({'distance': np.sum}).reset_index()
    article_users = collab_recommender.groupby('ID')['User'].apply(list).reset_index()

    article_distances_users = pd.merge(article_distances, article_users, on='ID', how='left')
    article_distances_users['N_users'] = article_distances_users['User'].apply(len)
    article_distances_users['article_distance'] = article_distances_users['distance'] / article_distances_users['N_users']

    collab_final = pd.merge(news, article_distances_users[['ID', 'article_distance']], on='ID', how='left')
    collab_final = collab_final.drop(['Content_emb', 'Category', 'SubCategory', 'Content'], axis=1)
    
    collab_final['article_distance'].fillna(1, inplace=True)  # Replace NaN with 1
    collab_final = collab_final.sort_values(by='article_distance', ascending=True, na_position='last')
    
    # Delete the rows with articles read by the considered user 
    mask = collab_final['ID'].isin(user_list(user_id, users))

    # Invert the mask to keep the rows that are not in the list
    collab_rec = collab_final[~mask]
    
    return collab_rec


Saving recommendations for each user

In [8]:
# Assuming you have a DataFrame named 'users' with a 'User' column
users_list = users['User'].tolist()

In [11]:
def user_list(user_id, df):
    user_list = df.loc[df['User'] == user_id, 'ID'].tolist()
    if len(user_list) > 0:
        user_id_str = str(user_list[0])  # Convert to string
        return user_id_str.split()
    else:
        return []


In [13]:
for i in users_list:
    #Example usage
    recommendations = print_article_recommendations(user_id=i)
    recommendations.to_csv('collaborative_recommendations' + i + '_collab.csv', index=False)

  article_distances = collab_recommender.groupby('ID').agg({'distance': np.sum}).reset_index()
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  collab_final['article_distance'].fillna(1, inplace=True)  # Replace NaN with 1
  article_distances = collab_recommender.groupby('ID').agg({'distance': np.sum}).reset_index()
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation