In [1]:
# imports
import pandas as pd
import numpy as np
from typing import List
from openai.embeddings_utils import distances_from_embeddings
import os


In [2]:
# Load the per-user interaction table and rename its three columns positionally:
# the user identifier, the interacted item IDs, and the interaction embedding.
users = pd.read_csv("embeddings/users_emb_TEST.csv").set_axis(
    ['User', 'ID', 'Interactions_emb'], axis=1
)
users.head()

Unnamed: 0,User,ID,Interactions_emb
0,U13740,N55189 N42782 N34694 N45794 N18445 N63302 N104...,"[-0.015360403599010574, -0.022621901123784482,..."
1,U91836,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,"[-0.005392600822233362, -0.004443325935426401,..."
2,U73700,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,"[-0.004993118400064608, -0.008147992255787054,..."


In [4]:
# Load the news articles table and keep only what the later cells read
# (the article ID and its content embedding); the raw text and category
# columns are dropped.
news = pd.read_csv("embeddings/news_emb_TEST.csv").drop(
    columns=['Content', 'Category', 'SubCategory']
)
news.head()

Unnamed: 0,ID,Content_emb
0,N55189,"[-0.020967688411474228, -0.020634232088923454,..."
1,N42782,"[-0.008319429121911526, 0.0013667173916473985,..."
2,N34694,"[-0.042908210307359695, -0.018993420526385307,..."
3,N45794,"[-0.007930373772978783, -0.0221870094537735, 0..."
4,N18445,"[-0.020153459161520004, -0.044797133654356, 0...."


In [5]:
# Map every user ID to a ('Content', embedding) pair, where the embedding is the
# 'Interactions_emb' string (e.g. "[0.1, -0.2, ...]") parsed into a list of floats.
user_dict = {}
for user, raw_embedding in zip(users['User'], users['Interactions_emb']):
    components = raw_embedding.strip('[]').split(',')
    # A blank component (for instance from an empty "[]") is stored as 0.0
    # instead of being passed to float(), which would raise.
    user_dict[user] = (
        'Content',
        [float(component) if component.strip() else 0.0 for component in components],
    )


In [6]:
# Map every news ID to a ('Content', embedding) pair, where the embedding is the
# 'Content_emb' string (e.g. "[0.1, -0.2, ...]") parsed into a list of floats.
news_dict = {}
for news_id, raw_embedding in zip(news['ID'], news['Content_emb']):
    components = raw_embedding.strip('[]').split(',')
    # A blank component (for instance from an empty "[]") is stored as 0.0
    # instead of being passed to float(), which would raise.
    news_dict[news_id] = (
        'Content',
        [float(component) if component.strip() else 0.0 for component in components],
    )

In [7]:
# Items a particular user has already interacted with
def user_list(user_id, df):
    """Return the item IDs recorded for ``user_id`` as a list of strings.

    Args:
        user_id: Value to look up in the ``'User'`` column of ``df``.
        df: DataFrame with a ``'User'`` column and an ``'ID'`` column whose
            cells hold whitespace-separated item IDs.

    Returns:
        The whitespace-split ``'ID'`` cell of the first row matching
        ``user_id`` (the cell is converted to ``str`` first), or an empty
        list when no row matches.
    """
    matching_ids = df.loc[df['User'] == user_id, 'ID']
    if matching_ids.empty:
        return []
    # Only the first matching row is used, even if the user appears more than once.
    first_cell = matching_ids.tolist()[0]
    return str(first_cell).split()

In [8]:
# Content-based recommendations
def distances_based_on_content(user_id: str):
    """Rank the news articles a user has not read by distance to the user's embedding.

    Reads the module-level ``user_dict``, ``news_dict`` and ``users`` objects.

    Args:
        user_id: Key into ``user_dict``.

    Returns:
        DataFrame with columns ``'ID'`` and ``'Distance'`` (cosine distance as
        computed by ``distances_from_embeddings``), sorted by ascending
        distance, without the articles ``user_list`` reports for this user,
        and with a fresh 0..n-1 index.

    Raises:
        KeyError: If ``user_id`` is not a key of ``user_dict``.
    """
    # Index 1 of each dictionary entry is the parsed embedding.
    profile_vector = user_dict[user_id][1]

    # Keys and values of a dict iterate in the same order, so IDs and vectors stay aligned.
    candidate_ids = list(news_dict)
    candidate_vectors = [entry[1] for entry in news_dict.values()]

    scored = pd.DataFrame({
        'ID': candidate_ids,
        'Distance': distances_from_embeddings(
            profile_vector, candidate_vectors, distance_metric="cosine"
        ),
    })
    ranked = scored.sort_values(by='Distance', ascending=True)

    # Drop the articles the user has already interacted with.
    already_read = user_list(user_id, users)
    unread_mask = ~ranked['ID'].isin(already_read)

    return ranked[unread_mask].reset_index(drop=True)

In [9]:
# Example: unread articles for one user, ranked from smallest to largest cosine distance.
distances_based_on_content('U13740')

Unnamed: 0,ID,Distance
0,N48031,0.138946
1,N39317,0.143167
2,N43353,0.146500
3,N47020,0.150886
4,N42977,0.152021
...,...,...
126,N13131,0.202738
127,N18870,0.216875
128,N24233,0.218297
129,N45509,0.222987


Saving recommendations for each user

In [12]:
# Collect the values of the 'User' column of `users` as a plain Python list;
# the export loop below generates one recommendations file per entry.
users_list = users['User'].tolist()

In [13]:
def list_ids_in_folder(folder_path):
    """Return the sorted, de-duplicated IDs encoded in a folder's CSV filenames.

    For every directory entry whose name ends with ``".csv"``, the ID is the
    text before the first underscore with its first character removed, e.g.
    ``"U13740_content.csv"`` -> ``"13740"``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        List of ID strings in lexicographic order. Sorting makes the result
        reproducible between runs; iterating a set of strings directly yields
        an order that can change from one interpreter session to the next.

    Raises:
        OSError: If ``folder_path`` cannot be listed (propagated from ``os.listdir``).
    """
    ids = {
        # Text before the first "_" minus its leading character.
        filename.split("_")[0][1:]
        for filename in os.listdir(folder_path)
        if filename.endswith(".csv")
    }
    return sorted(ids)


In [14]:
# Write one CSV of ranked recommendations per user.
output_dir = 'content_recommendations_test'
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before the first write.
os.makedirs(output_dir, exist_ok=True)

for user_id in users_list:
    recommendations = distances_based_on_content(user_id=user_id)
    recommendations.to_csv(os.path.join(output_dir, user_id + '_content.csv'), index=False)