In [None]:
import ast
import os

import pandas as pd

In [None]:
# Load the news table. Later cells read its 'ID', 'Content_emb', 'Category'
# and 'SubCategory' columns.
news = pd.read_csv("embeddings/news_emb_final.csv")
news.head(5)

In [None]:
# Load the user-behaviour log: tab-separated, no header row, so the column
# names are assigned by hand.
# NOTE(review): the stock MIND behaviors.tsv has five columns (impression id
# first) - confirm this file really has four, otherwise the assignment below
# raises a length-mismatch error.
interactions = pd.read_csv("MIND/behaviors.tsv", sep='\t', header=None)
interactions.columns = ['User', 'Time', 'ID', 'Impressions']

# Only the user and the 'ID' column are needed downstream.
interactions = interactions.drop(columns=['Time', 'Impressions'])
interactions.head(5)

In [None]:
# Split `interactions` into `num_parts` consecutive row chunks and write each
# chunk to its own CSV file (part_1.csv ... part_<num_parts>.csv) so that the
# following cells can work on one manageable piece at a time.
num_parts = 500  # Adjust this number based on your system's capacity

# Base number of rows per part (floor division; the remainder is handled below)
total_rows = len(interactions)
part_size = total_rows // num_parts

# Create a folder to save the parts
output_folder = 'interactions_parts'
os.makedirs(output_folder, exist_ok=True)

# Divide the DataFrame into parts and save each part in the folder
for i in range(num_parts):
    start_index = i * part_size
    # The last part runs to the end of the frame. Without this, the trailing
    # `total_rows % num_parts` rows would never be written to any file (and
    # every part would be empty when the frame has fewer rows than parts).
    if i == num_parts - 1:
        end_index = total_rows
    else:
        end_index = (i + 1) * part_size

    # Extract the part of the DataFrame
    df_part = interactions.iloc[start_index:end_index]

    # Save the part to a CSV file in the output folder
    output_file = os.path.join(output_folder, f'part_{i+1}.csv')
    df_part.to_csv(output_file, index=False)

In [None]:
# From here on work with a single chunk: `interactions` is replaced by the
# first part file produced by the splitting cell.
first_part_path = "interactions_parts/part_1.csv"
interactions = pd.read_csv(first_part_path)
interactions

In [None]:
# Create a dictionary with users and related articles:
#   users_dict[user][article_id] = {'ID', 'topic', 'subtopic', 'Content_emb'}
users_dict = {}

# Index the news table by article ID once, instead of scanning the whole
# frame for every article of every user. Only the first row per ID is kept,
# which is the row a `.values[0]` lookup on the filtered frame would select.
news_first = news.drop_duplicates(subset='ID', keep='first')
news_by_id = {
    news_id: (content, topic, subtopic)
    for news_id, content, topic, subtopic in zip(
        news_first['ID'],
        news_first['Content_emb'],
        news_first['Category'],
        news_first['SubCategory'],
    )
}

# Iterate over each row of the interactions table
for user, history in zip(interactions['User'], interactions['ID']):
    # Convert to string before splitting: a missing value becomes the token
    # 'nan', which matches no article and therefore yields an empty dict.
    article_ids = str(history).split()
    articles_dict = {}

    # Iterate over each article ID
    for article_id in article_ids:
        # Look up content, topic and subtopic for this ID; unknown IDs are skipped
        article = news_by_id.get(article_id)
        if article is None:
            continue
        content, topic, subtopic = article

        # Store the ID, topic, subtopic, and content under the article ID
        articles_dict[article_id] = {
            'ID': article_id,
            'topic': topic,
            'subtopic': subtopic,
            'Content_emb': content,
        }

    # Add the user's articles dictionary to the result dictionary.
    # A user that appears in several rows keeps only the last row's articles.
    users_dict[user] = articles_dict

# Report a summary only: printing the whole dictionary would dump every
# embedding string into the cell output.
print(f"Built article dictionaries for {len(users_dict)} users")

In [None]:
# Create a dictionary with users and combined content:
#   dictionary_combined[user] = {'Content_emb_mean': [mean of each dimension]}
dictionary_combined = {}

for user, content_dict in users_dict.items():
    # 'Content_emb' values are parsed from text (presumably the string form of
    # a list of numbers - confirm against the news CSV). They were previously
    # passed to eval(), which would execute any code found in the file;
    # ast.literal_eval only accepts Python literals.
    combined_content_list = [
        ast.literal_eval(sub_dict['Content_emb'])
        for sub_dict in content_dict.values()
    ]

    # Element-wise mean across the user's articles. A user without any
    # matched article ends up with an empty list.
    mean_content_emb = [sum(i) / len(i) for i in zip(*combined_content_list)]
    dictionary_combined[user] = {'Content_emb_mean': mean_content_emb}

# Report a summary only: printing the whole dictionary would dump every
# mean embedding into the cell output.
print(f"Computed mean embeddings for {len(dictionary_combined)} users")

In [None]:
# Tabulate `dictionary_combined`: one row per user, with the user's value
# dict held in the 'Content' column.
user_items = list(dictionary_combined.items())
df_combined = pd.DataFrame(user_items, columns=['User', 'Content'])

# Each 'Content' cell is a dict; spread those dicts out into their own
# columns and place them next to the 'User' column.
content_by_row = df_combined['Content'].to_dict()
content_columns = pd.DataFrame(content_by_row).T
df_combined = pd.concat([df_combined['User'], content_columns], axis=1)

# Show the resulting DataFrame
print(df_combined)

In [None]:
# Build the final per-user table from `dictionary_combined`:
#   1. one row per user, value dict in 'Content'
#   2. replace each value dict by its 'Content_emb_mean' list
#   3. join the 'ID' column of `interactions` back on 'User'
#   4. fix the column order
df_combined = (
    pd.DataFrame(list(dictionary_combined.items()), columns=['User', 'Content'])
    .assign(Content=lambda frame: frame['Content'].apply(lambda entry: entry['Content_emb_mean']))
    .merge(interactions[['User', 'ID']], on='User')
    .loc[:, ['User', 'ID', 'Content']]
)

# Show the resulting DataFrame
print(df_combined)

In [None]:
# Write the per-user table to disk without the row index.
df_combined.to_csv(path_or_buf='embeddings/users_emb_extra.csv', index=False)