In [1]:
# Reference example: https://www.mlq.ai/fine-tuning-gpt-3-recommendations/

In [2]:
# imports
import pandas as pd
import openai
import numpy as np
from typing import List
import ast
from openai.embeddings_utils import distances_from_embeddings
import os


In [3]:
# Load the table of user interactions and give its three columns working names.
users = pd.read_csv("embeddings/users_filtered_final.csv").set_axis(
    ['User', 'ID', 'Interactions_emb'], axis=1
)
users.head()

Unnamed: 0,User,ID,Interactions_emb
0,U244,N17157 N38621 N35022 N50578 N264 N9120 N23907 ...,"[-0.005149974951877837, -0.013250857458654631,..."
1,U68369,N19381 N54536,"[0.0025621717686590273, 0.004183989018201828, ..."
2,U50236,N4020 N44292 N50292 N40772 N57737 N33969 N4054...,"[-0.010138329240492436, -0.01179651383115145, ..."
3,U77060,N23105 N41375,"[-0.005568941123783588, -0.025914330035448074,..."
4,U5596,N459 N56253 N62931 N55846 N29849 N45729 N62834...,"[-0.012533644353970886, -0.011675744312297967,..."


In [4]:
# Load the table of news articles, discarding the raw text and category columns
# so that only the remaining columns (shown below) are kept.
news = (
    pd.read_csv("embeddings/news_emb_final.csv")
    .drop(columns=['Content', 'Category', 'SubCategory'])
)
news.head()

Unnamed: 0,ID,Content_emb
0,N55528,"[0.005885085556656122, -0.007782096974551678, ..."
1,N19639,"[-0.004876355174928904, -0.007969613187015057,..."
2,N61837,"[-0.02760046347975731, -0.013719998300075531, ..."
3,N53526,"[-0.0297758337110281, -0.014837449416518211, 0..."
4,N38324,"[0.005073545966297388, 0.004160495940595865, 0..."


In [5]:
# Map every user to a ('Content', embedding) pair. The embedding is stored in
# the table as a bracketed, comma-separated string, so it is parsed into a list
# of floats here; blank pieces become 0.0.
user_dict = {}
for user, embeddings_str in zip(users['User'], users['Interactions_emb']):
    pieces = embeddings_str.strip('[]').split(',')
    user_dict[user] = (
        'Content',
        [float(piece) if piece.strip() else 0.0 for piece in pieces],
    )


In [6]:
# Map every news ID to a ('Content', embedding) pair. The embedding is stored in
# the table as a bracketed, comma-separated string, so it is parsed into a list
# of floats here; blank pieces become 0.0.
news_dict = {}
for news_id, embeddings_str in zip(news['ID'], news['Content_emb']):
    pieces = embeddings_str.strip('[]').split(',')
    news_dict[news_id] = (
        'Content',
        [float(piece) if piece.strip() else 0.0 for piece in pieces],
    )

In [7]:
#create list with interacted items for a particular user
def user_list(user_id, df):
    """Return the whitespace-separated tokens of a user's 'ID' cell.

    Args:
        user_id: Value to match against the 'User' column of ``df``.
        df: DataFrame with 'User' and 'ID' columns.

    Returns:
        The first matching row's 'ID' value, converted to a string and split
        on whitespace; an empty list when no row matches ``user_id``.
    """
    matching_cells = df.loc[df['User'] == user_id, 'ID'].tolist()
    if not matching_cells:
        return []
    # Only the first matching row is used, even if the user appears repeatedly.
    return str(matching_cells[0]).split()

In [8]:
#create content recommendations
def distances_based_on_content(user_id: str):
    """Rank the news articles a user has not interacted with by cosine distance.

    Reads the module-level ``user_dict``, ``news_dict`` and ``users`` objects.

    Args:
        user_id: Key into ``user_dict``; a missing key raises ``KeyError``.

    Returns:
        DataFrame with columns 'ID' and 'Distance', sorted by ascending
        distance, excluding every ID returned by ``user_list`` for this user,
        and carrying a fresh 0..n-1 index.
    """
    # Position 1 of each dictionary entry holds the embedding.
    query_embedding = user_dict[user_id][1]
    article_ids = list(news_dict.keys())
    article_embeddings = [news_dict[article_id][1] for article_id in article_ids]

    # One distance per article, in the same order as article_ids.
    cosine_distances = distances_from_embeddings(
        query_embedding, article_embeddings, distance_metric="cosine"
    )

    ranked = pd.DataFrame(
        {'ID': article_ids, 'Distance': cosine_distances}
    ).sort_values(by='Distance', ascending=True)

    # Drop the articles already listed in this user's interaction history.
    already_seen = user_list(user_id, users)
    unseen = ranked.loc[~ranked['ID'].isin(already_seen)]

    return unseen.reset_index(drop=True)

In [9]:
# Example: ranked, unseen articles for a single user.
distances_based_on_content(user_id='U68369')

Unnamed: 0,ID,Distance
0,N37526,0.120128
1,N372,0.124554
2,N47440,0.124870
3,N40767,0.125969
4,N2208,0.126796
...,...,...
51275,N10305,0.995382
51276,N63770,0.997056
51277,N6175,1.006691
51278,N9846,1.019749


Saving recommendations for each user

In [10]:
# Collect every value of the 'User' column and show how many there are.
users_list = users['User'].to_list()
len(users_list)

49945

In [11]:
# Split the user list into three consecutive slices. The first two have
# exactly part_size users; the last one also takes any remainder left by the
# integer division.
part_size = len(users_list) // 3
part1, part2, part3 = (
    users_list[:part_size],
    users_list[part_size:2 * part_size],
    users_list[2 * part_size:],
)

In [17]:
# Choose which slice of the user list the cells below work on (here: part3).
mypart = part3

In [18]:
def list_ids_in_folder(folder_path):
    """Collect the distinct IDs encoded in the .csv filenames of a folder.

    For each filename ending in ".csv", the ID is the text before the first
    underscore with its first character removed.

    Args:
        folder_path: Directory to scan (passed to ``os.listdir``).

    Returns:
        List of unique ID strings, in no particular order.
    """
    unique_ids = {
        name.split("_")[0][1:]
        for name in os.listdir(folder_path)
        if name.endswith(".csv")
    }
    return list(unique_ids)


In [27]:
# Rebuild the user IDs that already have a saved recommendation file: the
# helper returns each ID without its first character, so 'U' is put back.
folder_path = 'content_recommendations'
ids_list = [f'U{num}' for num in list_ids_in_folder(folder_path)]
len(ids_list)

23153

In [28]:
# Keep only the users of the chosen slice whose (whitespace-stripped) ID is not
# among the already saved ones; a set makes the membership test cheap.
set2 = set(ids_list)
rest_users = list(filter(lambda item: item.strip() not in set2, mypart))
len(rest_users)

10144

In [29]:
# Compute and save the recommendations of every user that has no file yet.
for i in rest_users:
    recommendations = distances_based_on_content(user_id=i)
    final_path = 'content_recommendations/' + i + '_content.csv'
    # Write under a temporary name and rename only once the file is complete.
    # An interrupted run then never leaves a truncated '<user>_content.csv'
    # that a later resume would mistake for a finished result (the resume
    # logic only looks at names ending in '.csv').
    tmp_path = final_path + '.tmp'
    recommendations.to_csv(tmp_path, index=False)
    os.replace(tmp_path, final_path)

KeyboardInterrupt: 