In [1]:
###example https://www.mlq.ai/fine-tuning-gpt-3-recommendations/###

In [1]:
# imports
import ast
from pathlib import Path
from typing import List

import numpy as np
import openai
import pandas as pd
from openai.embeddings_utils import distances_from_embeddings, indices_of_nearest_neighbors_from_distances


In [10]:
# Load the user-interactions file and give its three columns explicit names.
users = (
    pd.read_csv("embeddings/users_emb_extra.csv")
    .set_axis(['User', 'ID', 'Interactions_emb'], axis=1)
)
users.head()

Unnamed: 0,User,ID,Interactions_emb
0,U111,N61837 N39237 N9786 N43620 N61409,"[-0.011852237349376082, -0.015658087749034166,..."
1,U222,N13861 N41051 N40272 N24967 N14962,"[0.010383155662566423, -0.0077624950557947155,..."
2,U333,N9786 N47214 N24905 N56618 N34406,"[-0.021996299363672735, -0.010908919479697942,..."
3,U444,N20336 N30961 N61765 N40969 N11472,"[-0.014395372092258185, -0.007225491013377905,..."
4,U555,N59295 N9721 N3574 N43620 N22028,"[-0.006499886885285378, -0.010832596011459828,..."


In [6]:
news = pd.read_csv("embeddings/news_emb_final.csv") # news articles with category, content and content embedding
news.head()

Unnamed: 0,ID,Category,SubCategory,Content,Content_emb
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","[0.005885085556656122, -0.007782096974551678, ..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat These seemingly ...,"[-0.004876355174928904, -0.007969613187015057,..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,"[-0.02760046347975731, -0.013719998300075531, ..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"[-0.0297758337110281, -0.014837449416518211, 0..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","[0.005073545966297388, 0.004160495940595865, 0..."


In [7]:
# Name the columns explicitly (in place), then index the articles by ID so that
# each entry has the form {ID: {column name: value}}.
# NOTE(review): news_dict is not referenced again in the visible cells, while
# news_dictionary is built by the same expression further down; confirm whether
# this variable is still needed.
news.columns =['ID', 'Category', 'SubCategory', 'Content', 'Content_emb']
news_dict = news.set_index('ID').to_dict(orient='index')

In [11]:
# Build user_dict: {user: ('Content', [float, ...])}.
# The embedding is stored in the CSV as the text "[v1, v2, ...]", so it is
# turned back into floats by stripping the brackets and splitting on commas.

# Step 1: user -> raw embedding text (a later row wins if a user repeats).
raw_embeddings = dict(zip(users['User'], users['Interactions_emb']))

# Step 2: parse every embedding string into a list of floats.
user_dict = {
    uid: ('Content', [float(token) for token in emb_text.strip('[]').split(',')])
    for uid, emb_text in raw_embeddings.items()
}

print(user_dict)

{'U111': ('Content', [-0.011852237349376082, -0.015658087749034166, 0.0009249899536371231, -0.020731644239276648, -0.00658577773720026, 0.010585291578900069, -0.008148252475075423, -0.007516421377658844, -0.00329395430162549, -0.013264022208750248, 0.02573445439338684, 0.01817621849477291, 0.004265624145045876, 0.000487590953707695, -0.00709435926983133, -0.0044609214644879104, 0.021869755908846857, -0.009886214329162613, 0.009695398426265456, -0.022962742391973733, -0.01323599403258413, 0.011390022980049253, -0.012463520327582956, 0.00833500362932682, -0.01343333008699119, -0.00618530809879303, 0.010797980334609747, -0.018623103899881244, 0.010221999790519475, -0.009532964415848255, -0.007043654238805175, -0.009792708395980298, -0.022650618012994526, -0.013860295806080104, -0.011905784322880208, -0.01312517337501049, 0.004601524490863085, -0.01867465302348137, 0.012858768971636892, -0.007557340711355209, 0.01218207348138094, 0.00836142348125577, -0.007401139987632632, -0.0050501618534

In [12]:
# Create a dictionary with news articles, keyed by article ID; each value is a
# {column name: value} dict holding the remaining columns of that row.
news_dictionary = news.set_index('ID').to_dict(orient='index')
def transform_values_to_tuples(data_dict):
    """Convert every article record of ``data_dict`` into a tuple of pairs, in place.

    Each value of ``data_dict`` is expected to be a mapping that contains at
    least the keys 'Category', 'SubCategory' and 'Content_emb'. After the call
    each value has the form

        (('Category', ...), ('SubCategory', ...), ('Content_emb', ...))

    so the embedding is reachable as ``data_dict[key][2][1]``. Any other field
    of the record (for example 'Content') is dropped.

    Only 'Content_emb' is parsed from its string form. Category and
    SubCategory are free text and are kept exactly as given, so a label such
    as "2020" or "None" stays a string instead of being turned into an int or
    ``None``.

    Args:
        data_dict: dict mapping an article ID to its record; modified in place.

    Returns:
        None.

    Raises:
        KeyError: if a record lacks 'Category', 'SubCategory' or 'Content_emb'.
    """
    for key, record in data_dict.items():
        if isinstance(record, tuple):
            # Already converted (e.g. the cell was run twice): leave it alone.
            continue

        embedding = record['Content_emb']
        if isinstance(embedding, str):
            try:
                embedding = ast.literal_eval(embedding)
            except (ValueError, SyntaxError):
                # Not a valid Python literal: keep the raw string (best effort).
                pass

        # Re-assigning an existing key does not resize the dict, so this is
        # safe while iterating over .items().
        data_dict[key] = (
            ('Category', record['Category']),
            ('SubCategory', record['SubCategory']),
            ('Content_emb', embedding),
        )

# Transform values to tuples for the entire dictionary.
# The call works in place: afterwards every news_dictionary[ID] is a tuple of
# ('Category', ...), ('SubCategory', ...), ('Content_emb', ...) pairs.
transform_values_to_tuples(news_dictionary)

In [13]:
def user_list(user_id, df):
    """Return the items a user interacted with as a list of ID strings.

    Looks up the rows of ``df`` whose 'User' column equals ``user_id`` and
    splits the 'ID' cell of the first match on whitespace. An unknown user
    yields an empty list.
    """
    matches = df.loc[df['User'] == user_id, 'ID'].tolist()
    if not matches:
        return []
    return matches[0].split()

In [14]:
def print_recommendations_for_user(
    user_id: str,
    users_dict: dict,
    news_dictionary: dict
) -> pd.DataFrame:
    """Rank the articles a user has not interacted with by cosine distance.

    Despite its name, this function prints nothing; it returns the ranking.

    Args:
        user_id: key of the user in ``users_dict``.
        users_dict: maps a user ID to a pair whose second element is the
            user's embedding (a sequence of floats).
        news_dictionary: maps an article ID to a tuple whose third element is
            a ('Content_emb', embedding) pair.

    Returns:
        A DataFrame with columns 'ID' and 'distance', in the order produced by
        ``indices_of_nearest_neighbors_from_distances`` (nearest first), with
        the articles the user already interacted with removed.

    Raises:
        KeyError: if ``user_id`` is not a key of ``users_dict``.

    Note:
        The user's interaction history is read from the module-level ``users``
        DataFrame through ``user_list``.
    """
    # Normalise the stored embedding to a plain Python list.
    user_embedding = np.array(users_dict[user_id][1]).tolist()

    # Freeze the article order once so positions in `distances` map back to IDs.
    article_ids = list(news_dictionary.keys())
    article_embeddings = [news_dictionary[article_id][2][1] for article_id in article_ids]

    distances = distances_from_embeddings(user_embedding, article_embeddings, distance_metric="cosine")
    ranked_indices = indices_of_nearest_neighbors_from_distances(distances)

    # Look the history up once (not once per article) and use a set so each
    # membership test is O(1).
    already_seen = set(user_list(user_id, users))

    selected_keys = []
    selected_distances = []
    for i in ranked_indices:
        article_id = article_ids[i]
        if article_id not in already_seen:
            selected_keys.append(article_id)
            selected_distances.append(distances[i])

    return pd.DataFrame({'ID': selected_keys, 'distance': selected_distances})


In [16]:
# Example usage: compute the ranked recommendations for a single user.
user_id = "U222"  # Specify the user ID for which you want to generate recommendations

# Result is a DataFrame with 'ID' and 'distance' columns.
recommendations = print_recommendations_for_user(
    user_id=user_id,
    users_dict=user_dict,
    news_dictionary = news_dictionary
)

In [12]:
# Attach the article metadata to every recommended ID (left join keeps the
# recommendation order) and leave out the embedding column.
content_recommender = (
    recommendations
    .merge(news, on='ID', how='left')
    .drop(columns=['Content_emb'])
)

In [13]:
# Preview the first rows of the recommendations together with their metadata.
content_recommender.head()

Unnamed: 0,ID,distance,Category,SubCategory,Content
0,N51188,0.11612,sports,baseball_mlb_videos,ALCS Game 6 Highlights: Yankees vs. Astros Aft...
1,N62846,0.14174,sports,baseball_mlb,"Nationals win first World Series title, stormi..."
2,N12603,0.147599,sports,more_sports,"Astros beat Yankees, will face Nationals in Wo..."
3,N41777,0.157279,sports,baseball_mlb,Kate Upton was so fired up after the Astros' b...
4,N56541,0.159829,sports,baseball_mlb,"Soto, Nationals edge Astros in World Series op..."


In [14]:
#content_recommender.to_csv('content_test.csv', index= False)

Saving recommendations for each user

In [15]:
# Collect the distinct user IDs found in the 'User' column of `users`
users_list = users['User'].unique().tolist()

# Print the resulting list
print(users_list)

['U10045', 'U1111', 'U11306', 'U13000', 'U13740', 'U14000', 'U15000', 'U17841', 'U19739', 'U29155', 'U34670', 'U38627', 'U46596', 'U53231', 'U63162', 'U73700', 'U79199', 'U8125', 'U8312', 'U8355', 'U89744', 'U91836', 'U92486']


In [16]:
# Write one CSV of recommendations per user: content_recommendations/<user>_cont.csv
output_dir = Path('content_recommendations')
# DataFrame.to_csv does not create missing directories, so make sure it exists.
output_dir.mkdir(parents=True, exist_ok=True)

for user_id in users_list:
    # A separate name keeps the single-user `recommendations` computed in the
    # example cell from being overwritten by the last user of this loop.
    user_recommendations = print_recommendations_for_user(
        user_id=user_id,
        users_dict=user_dict,
        news_dictionary=news_dictionary
    )

    # The f-string also works when the user ID is not a str.
    user_recommendations.to_csv(output_dir / f'{user_id}_cont.csv', index=False)