In [6]:
import ast
import os

import openai
import pandas as pd
from openai.embeddings_utils import get_embedding

Calculate embeddings for news articles

In [26]:
# Read the API key from the environment so that no secret is stored in the notebook.
# Fail early with a clear message instead of letting the first embedding call fail.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this notebook.")

In [27]:
# Read the tab-separated news file (no header row) and name its eight columns
news = pd.read_csv("MIND/newsTEST.tsv", sep='\t', header=None)
news.columns = ['ID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'Title Entities', 'Abstract Entities']

# Join title and abstract into a single text field; missing parts count as empty strings
title_text = news['Title'].fillna('')
abstract_text = news['Abstract'].fillna('')
news['Content'] = title_text + ' ' + abstract_text

# Keep only the identifier, the two category levels and the combined text
news = news.drop(columns=['URL', 'Title Entities', 'Abstract Entities', 'Title', 'Abstract'])
news.head()

Unnamed: 0,ID,Category,SubCategory,Content
0,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O..."
1,N42782,sports,baseball_mlb,Three takeaways from Yankees' ALCS Game 5 vict...
2,N34694,tv,tvnews,Rosie O'Donnell: Barbara Walters Isn't 'Up to ...
3,N45794,news,newscrime,Four flight attendants were arrested in Miami'...
4,N18445,sports,football_ncaa,Michigan sends breakup tweet to Notre Dame as ...


In [28]:
def hashable_column(column_value):
    """Convert a column value into its embedded / tuple form.

    Non-dict input:
        * a string is sent to ``get_embedding`` and the embedding is returned;
        * any other value is returned unchanged.

    Dict input: every ``(key, value)`` pair is converted, then the pairs are
    sorted and returned as a tuple. The conversion per pair is:
        * key ``'Content'``  -> value replaced by its embedding;
        * dict value         -> converted recursively by this function;
        * list value         -> a message is printed and the list is kept as-is;
        * string value       -> newlines replaced by spaces;
        * anything else      -> kept as-is.
    """
    embedding_engine = "text-embedding-ada-002"

    # Plain (non-dict) values: only strings are embedded
    if not isinstance(column_value, dict):
        if isinstance(column_value, str):
            return get_embedding(column_value, engine=embedding_engine)
        return column_value

    def _convert(key, value):
        # The 'Content' check comes first, so it applies whatever the value's type is
        if key == 'Content':
            return get_embedding(value, engine=embedding_engine)
        if isinstance(value, dict):
            return hashable_column(value)
        if isinstance(value, list):
            print("Unhashable list value at key:", key, "with value:", value)
            return value
        if isinstance(value, str):
            return value.replace("\n", " ")
        return value

    pairs = [(key, _convert(key, value)) for key, value in column_value.items()]
    pairs.sort()
    return tuple(pairs)

In [29]:
# Run hashable_column on every article's combined text and store the results
# in a new 'Content_emb' column
content_embeddings = news['Content'].apply(hashable_column)
news['Content_emb'] = content_embeddings

In [30]:
# Save the news table with its embeddings. Create the output folder first:
# to_csv does not create missing directories and would otherwise raise.
os.makedirs('embeddings', exist_ok=True)
news.to_csv('embeddings/news_emb_TEST.csv', index=False)

In [31]:
# Reload the news table (including the embedding column) from the CSV written above
news_emb_path = "embeddings/news_emb_TEST.csv"
news = pd.read_csv(news_emb_path)
news.head()

Unnamed: 0,ID,Category,SubCategory,Content,Content_emb
0,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","[-0.020967688411474228, -0.020634232088923454,..."
1,N42782,sports,baseball_mlb,Three takeaways from Yankees' ALCS Game 5 vict...,"[-0.008319429121911526, 0.0013667173916473985,..."
2,N34694,tv,tvnews,Rosie O'Donnell: Barbara Walters Isn't 'Up to ...,"[-0.042908210307359695, -0.018993420526385307,..."
3,N45794,news,newscrime,Four flight attendants were arrested in Miami'...,"[-0.007930373772978783, -0.0221870094537735, 0..."
4,N18445,sports,football_ncaa,Michigan sends breakup tweet to Notre Dame as ...,"[-0.020153459161520004, -0.044797133654356, 0...."


Calculate embeddings for user profiles 

In [32]:
# Read the tab-separated user-behaviour file (no header row) and name its four columns
interactions = pd.read_csv("MIND/behaviorsTEST.tsv", sep='\t', header=None)
interactions.columns = ['User', 'Time', 'ID', 'Impressions']

# Only the user and the 'ID' column (space-separated article IDs) are used afterwards
interactions = interactions.drop(columns=['Time', 'Impressions'])
interactions.head()

Unnamed: 0,User,ID
0,U13740,N55189 N42782 N34694 N45794 N18445 N63302 N104...
1,U91836,N31739 N6072 N63045 N23979 N35656 N43353 N8129...
2,U73700,N10732 N25792 N7563 N21087 N41087 N5445 N60384...


In [33]:
# Map every user to the articles listed for them:
#   users_dict[user][article_id] -> {'ID', 'topic', 'subtopic', 'Content_emb'}

# Index the news table by article ID once, so each lookup is a dictionary access
# rather than a scan of the whole table. keep='first' mirrors taking the first
# matching row when an ID appears more than once.
news_by_id = (
    news.drop_duplicates(subset='ID', keep='first')
        .set_index('ID')[['Category', 'SubCategory', 'Content_emb']]
        .to_dict('index')
)

users_dict = {}

for user, article_id_text in zip(interactions['User'], interactions['ID']):
    articles_dict = {}

    # str() guards against non-string cells before splitting on whitespace
    for article_id in str(article_id_text).split():
        article = news_by_id.get(article_id)
        if article is None:
            # ID not present in the news table: nothing to record for it
            continue

        articles_dict[article_id] = {
            'ID': article_id,
            'topic': article['Category'],
            'subtopic': article['SubCategory'],
            'Content_emb': article['Content_emb'],
        }

    # If a user appears on several rows, the last row's articles win
    users_dict[user] = articles_dict

# Print the resulting dictionary (the full mapping, embeddings included)
print(users_dict)

{'U13740': {'N55189': {'ID': 'N55189', 'topic': 'tv', 'subtopic': 'tvnews', 'Content_emb': '[-0.020967688411474228, -0.020634232088923454, 0.0057854545302689075, -0.001869018655270338, -0.015779118984937668, 0.002360865706577897, 0.0010303778108209372, -0.007075927685946226, -0.03379905968904495, -0.005421988200396299, 0.024835772812366486, 0.03662676364183426, -0.007316015660762787, 0.013591649942100048, 0.005371969658881426, 0.03081129677593708, 0.019567174836993217, 0.010350462049245834, 0.003661342430859804, -0.05666077509522438, -0.03558638319373131, -0.014431958086788654, -0.01980726234614849, -0.016125913709402084, -0.014365267008543015, -0.015525693073868752, 0.014231884852051735, -0.010537196882069111, 0.005608723033219576, -0.0004922638181596994, -0.002364200307056308, -0.0073960451409220695, 0.0045650070533156395, -0.02734335884451866, -0.03550635278224945, -0.025769447907805443, 0.0016172596951946616, -0.008176331408321857, 0.03713361546397209, 0.0015647404361516237, 0.0176

In [34]:
# Build one profile vector per user: the element-wise mean of the embeddings of
# all articles recorded for that user.
dictionary_combined = {}

for user, content_dict in users_dict.items():
    # Embeddings loaded from CSV are strings such as "[0.1, -0.2, ...]".
    # ast.literal_eval parses literals only, so file content is never executed
    # as code (unlike eval). Embeddings that are already lists are used as-is.
    combined_content_list = [
        ast.literal_eval(embedding) if isinstance(embedding, str) else embedding
        for embedding in (sub_dict['Content_emb'] for sub_dict in content_dict.values())
    ]

    # zip(*...) groups the i-th component of every embedding together;
    # a user with no recorded articles gets an empty list.
    mean_content_emb = [sum(component) / len(component) for component in zip(*combined_content_list)]
    dictionary_combined[user] = {'Content_emb_mean': mean_content_emb}

print(dictionary_combined)

{'U13740': {'Content_emb_mean': [-0.015360403599010574, -0.022621901123784482, 0.015852123317826126, -0.015558802677939335, -0.01676577552118235, 0.007063333876430988, -0.0011156885852364616, -0.003619424780481495, -0.015871210366539244, -0.01899308054190543, 0.018678937314285174, 0.019860617247306638, -0.002366086985501978, -0.002793591986927721, -0.0026528702631670362, -0.0025197314615878793, 0.023922715129123792, -0.003032062823573748, 0.004128284028006924, -0.02313333237543702, -0.013002040692501597, 0.003901030014579495, -0.012302561313845217, 0.0034387879616891346, -0.005017741686768002, -0.0023684605645636716, 0.013780794597955214, -0.0149372399577664, 0.013720988606413206, -0.014220921516728898, 0.0033688475377857685, -0.0030466549651464447, -0.015585637205125144, -0.015562537478722839, -0.017534302806274757, -0.01894267001706693, -0.002584046089193887, -0.008324620163572641, 0.006375873535742155, 0.0018383328440702623, 0.012212160746760977, -0.010926020008304881, -0.0045482317

In [35]:
# One row per user, with that user's payload dict held in a 'Content' column
user_frame = pd.DataFrame(list(dictionary_combined.items()), columns=['User', 'Content'])

# Turn the payload dicts into columns: build a frame keyed by row label,
# then transpose so that each user is a row again
payload_by_row = user_frame['Content'].to_dict()
expanded_content = pd.DataFrame(payload_by_row).T

# Put the user column next to the expanded payload columns
df_combined = pd.concat([user_frame['User'], expanded_content], axis=1)

# Show the result as plain text
print(df_combined)

     User                                   Content_emb_mean
0  U13740  [-0.015360403599010574, -0.022621901123784482,...
1  U91836  [-0.005392600822233362, -0.004443325935426401,...
2  U73700  [-0.004993118400064608, -0.008147992255787054,...


In [36]:
# One row per user: (user, mean embedding list taken out of the payload dict)
user_rows = [(user, payload['Content_emb_mean']) for user, payload in dictionary_combined.items()]
df_combined = pd.DataFrame(user_rows, columns=['User', 'Content'])

# Bring in each user's 'ID' column from the interactions table
df_combined = df_combined.merge(interactions[['User', 'ID']], on='User')

# Column order: user, article IDs, embedding
df_combined = df_combined.loc[:, ['User', 'ID', 'Content']]

# Show the result as plain text
print(df_combined)

     User                                                 ID  \
0  U13740  N55189 N42782 N34694 N45794 N18445 N63302 N104...   
1  U91836  N31739 N6072 N63045 N23979 N35656 N43353 N8129...   
2  U73700  N10732 N25792 N7563 N21087 N41087 N5445 N60384...   

                                             Content  
0  [-0.015360403599010574, -0.022621901123784482,...  
1  [-0.005392600822233362, -0.004443325935426401,...  
2  [-0.004993118400064608, -0.008147992255787054,...  


In [37]:
# Write the per-user table to CSV without the row index
users_emb_path = 'embeddings/users_emb_TEST.csv'
df_combined.to_csv(users_emb_path, index=False)

In [40]:
# Reload the per-user table and rename its third column to 'Interactions_emb'
users_emb_file = "embeddings/users_emb_TEST.csv"
users = pd.read_csv(users_emb_file)
users.columns = ['User', 'ID', 'Interactions_emb']
users.head()

Unnamed: 0,User,ID,Interactions_emb
0,U13740,N55189 N42782 N34694 N45794 N18445 N63302 N104...,"[-0.015360403599010574, -0.022621901123784482,..."
1,U91836,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,"[-0.005392600822233362, -0.004443325935426401,..."
2,U73700,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,"[-0.004993118400064608, -0.008147992255787054,..."
