In [1]:
# imports
import pandas as pd
import openai
from typing import List
import numpy as np
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances
from collections import Counter
import os

In [2]:
users = pd.read_csv("embeddings/users_emb_final.csv") #document with user interactions
users.head()

Unnamed: 0,User,ID,Interactions_emb
0,U244,N17157 N38621 N35022 N50578 N264 N9120 N23907 ...,"[-0.005149974951877837, -0.013250857458654631,..."
1,U68369,N19381 N54536,"[0.0025621717686590273, 0.004183989018201828, ..."
2,U50236,N4020 N44292 N50292 N40772 N57737 N33969 N4054...,"[-0.010138329240492436, -0.01179651383115145, ..."
3,U77060,N23105 N41375,"[-0.005568941123783588, -0.025914330035448074,..."
4,U5596,N459 N56253 N62931 N55846 N29849 N45729 N62834...,"[-0.012533644353970886, -0.011675744312297967,..."


In [3]:
news = pd.read_csv("embeddings/news_emb_final.csv") #document with user interactions
news.head()

Unnamed: 0,ID,Category,SubCategory,Content,Content_emb
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","[0.005885085556656122, -0.007782096974551678, ..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat These seemingly ...,"[-0.004876355174928904, -0.007969613187015057,..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,"[-0.02760046347975731, -0.013719998300075531, ..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"[-0.0297758337110281, -0.014837449416518211, 0..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","[0.005073545966297388, 0.004160495940595865, 0..."


In [4]:
# Create a dictionary with user interactions
user_dict = {}
for index, row in users.iterrows():
    user = row['User']
    interactions = row['Interactions_emb']
    user_dict[user] = ('Content', interactions)

for user, (content, embeddings_str) in user_dict.items():
    # Check for empty string before converting to float
    embeddings_list = [float(value) if value.strip() else 0.0 for value in embeddings_str.strip('[]').split(',')]
    user_dict[user] = (content, embeddings_list)


In [5]:
def print_recommendations_for_user(
    user_id: str,
    users_dict: dict,
) -> pd.DataFrame:
    user_embeddings = []
    user_articles_embeddings = users_dict[user_id][1]
    user_embeddings_np = np.array(user_articles_embeddings)
    user_embeddings = user_embeddings_np.tolist()

    content_dictionary = []
    for key in users_dict.keys():
        content_dictionary.append(users_dict[key][1])

    # Assume you have the following functions imported from embeddings_utils.py
    distances = distances_from_embeddings(user_embeddings, content_dictionary, distance_metric="cosine")
    indices_of_nearest_neighbors = indices_of_nearest_neighbors_from_distances(distances)

    user_keys = list(users_dict.keys())  # Fixed a typo: users_dict instead of user_dict
    selected_user_keys = []
    selected_distances = []

    for i in indices_of_nearest_neighbors:
        if 0 <= i < len(user_keys) and distances[i] != 0:
            user_key = user_keys[i]
            selected_user_keys.append(user_key)
            selected_distances.append(distances[i])

    # Creating a DataFrame
    result_df = pd.DataFrame({
        'User': selected_user_keys,
        'distance': selected_distances
    })

    return result_df

In [6]:
#create list with interacted items for a particular user
def user_list(user_id, df):
    user_list = df.loc[df['User'] == user_id, 'ID'].tolist()
    if len(user_list) > 0:
        return user_list[0].split()
    else:
        return []

In [7]:
def print_article_recommendations(user_id):
    recommendations = print_recommendations_for_user(user_id, user_dict)
    collab_recommender = pd.merge(recommendations, users, on='User', how='left')
    collab_recommender = collab_recommender.drop(['Interactions_emb'], axis=1)
    
    collab_recommender['ID'] = collab_recommender['ID'].str.split()
    collab_recommender = collab_recommender.explode('ID')

    article_distances = collab_recommender.groupby('ID').agg({'distance': np.sum}).reset_index()
    article_users = collab_recommender.groupby('ID')['User'].apply(list).reset_index()

    article_distances_users = pd.merge(article_distances, article_users, on='ID', how='left')
    article_distances_users['N_users'] = article_distances_users['User'].apply(len)
    article_distances_users['article_distance'] = article_distances_users['distance'] / article_distances_users['N_users']

    collab_final = pd.merge(news, article_distances_users[['ID', 'article_distance']], on='ID', how='left')
    collab_final = collab_final.drop(['Content_emb', 'Category', 'SubCategory', 'Content'], axis=1)
    
    collab_final['article_distance'].fillna(1, inplace=True)  # Replace NaN with 1
    collab_final = collab_final.sort_values(by='article_distance', ascending=True, na_position='last')
    
    # Delete the rows with articles read by the considered user 
    mask = collab_final['ID'].isin(user_list(user_id, users))

    # Invert the mask to keep the rows that are not in the list
    collab_rec = collab_final[~mask]
    
    return collab_rec


In [8]:
print_article_recommendations('U139')

  dist = 1.0 - uv / np.sqrt(uu * vv)


Unnamed: 0,ID,article_distance
5266,N59261,0.009378
29210,N3647,0.009888
19772,N15115,0.009910
33276,N42623,0.010049
26625,N65215,0.010049
...,...,...
38422,N62457,1.000000
38421,N40674,1.000000
38420,N34308,1.000000
38436,N30437,1.000000


Saving recommendations for each user

In [10]:
# Assuming you have a DataFrame named 'users' with a 'User' column
users_list = users['User'].tolist()

# Print the resulting list
print(users_list)

['U244', 'U68369', 'U50236', 'U77060', 'U5596', 'U85030', 'U11009', 'U87192', 'U15896', 'U49080', 'U49080', 'U22467', 'U84890', 'U70037', 'U16955', 'U64013', 'U47709', 'U39867', 'U82161', 'U4996', 'U78134', 'U17723', 'U47070', 'U47596', 'U70545', 'U33996', 'U65115', 'U63844', 'U62225', 'U22103', 'U46814', 'U93438', 'U74989', 'U33179', 'U37146', 'U25233', 'U8430', 'U39052', 'U54735', 'U39200', 'U36001', 'U43442', 'U84095', 'U19672', 'U7360', 'U12573', 'U74966', 'U77156', 'U34048', 'U88742', 'U27870', 'U68537', 'U89767', 'U3169', 'U16249', 'U7003', 'U6655', 'U37501', 'U33571', 'U46288', 'U53808', 'U23825', 'U139', 'U39181', 'U85540', 'U15965', 'U87334', 'U33303', 'U53971', 'U59978', 'U26157', 'U26727', 'U45664', 'U32838', 'U50603', 'U80408', 'U44024', 'U42367', 'U17193', 'U21018', 'U21649', 'U45510', 'U60076', 'U85405', 'U6471', 'U58622', 'U87066', 'U1773', 'U81892', 'U28900', 'U9224', 'U90870', 'U60262', 'U90850', 'U25241', 'U37576', 'U46735', 'U43975', 'U81272', 'U68214', 'U63647', 'U9

In [11]:
len(users_list)

156500

In [12]:
def user_list(user_id, df):
    user_list = df.loc[df['User'] == user_id, 'ID'].tolist()
    if len(user_list) > 0:
        user_id_str = str(user_list[0])  # Convert to string
        return user_id_str.split()
    else:
        return []


In [45]:
for i in users_list:
    # Example usage
    recommendations = print_article_recommendations(user_id=i)
    recommendations.to_csv('collaborative_recommendations/' + i + '_collab.csv', index=False)

Exception ignored in: <Finalize object, dead>
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/multiprocessing/util.py", line 218, in __call__
    if self._pid != getpid():
KeyboardInterrupt: 
