In [1]:
import pandas as pd
import os

In [None]:
news = pd.read_csv("embeddings/news_emb_final.csv") #document with news content
news.head()

In [None]:
#load the data with user behaviors
interactions = pd.read_csv("MIND/behaviors.tsv",sep='\t',  header=None)
interactions.columns =['User', 'Time', 'ID', 'Impressions'] 
interactions = interactions.drop(['Time', 'Impressions'], axis=1)
interactions.head()

In [None]:
# Determine the number of parts you want to divide it into
num_parts = 500  # Adjust this number based on your system's capacity

# Calculate the approximate size of each part
part_size = len(interactions) // num_parts


# Create a folder to save the parts
output_folder = 'interactions_parts'
os.makedirs(output_folder, exist_ok=True)

# Divide the DataFrame into parts and save each part in the folder
for i, part in enumerate(range(num_parts)):
    start_index = i * part_size
    end_index = (i + 1) * part_size

    # Extract the part of the DataFrame
    df_part = interactions.iloc[start_index:end_index]

    # Save the part to a CSV file in the output folder
    output_file = os.path.join(output_folder, f'part_{i+1}.csv')
    df_part.to_csv(output_file, index=False)

In [None]:
#load the data with user behaviors
interactions = pd.read_csv("interactions_parts/part_1.csv")
interactions

In [None]:
# Create a dictionary with users and related articles 
users_dict = {}

# Iterate over each row in df1
for _, row in interactions.iterrows():
    user = row['User']
    article_ids = str(row['ID']).split()  # Convert to string before splitting
    articles_dict = {}

    # Iterate over each article ID
    for article_id in article_ids:
        # Filter df2 to retrieve the content, topic, and subtopic based on the ID
        article_data = news[news['ID'] == article_id]

        if not article_data.empty:
            content = article_data['Content_emb'].values[0]
            topic = article_data['Category'].values[0]
            subtopic = article_data['SubCategory'].values[0]

            # Store the ID, topic, subtopic, and content in a dictionary
            article_dict = {'ID': article_id, 'topic': topic, 'subtopic': subtopic, 'Content_emb': content}

            # Add the article dictionary to the user's articles dictionary
            articles_dict[article_id] = article_dict

    # Add the user's articles dictionary to the result dictionary
    users_dict[user] = articles_dict

# Print the resulting dictionary
print(users_dict)

In [None]:
# Create a dictionary with users and combined content
dictionary_combined = {}

for user, content_dict in users_dict.items():
    combined_content_list = [eval(sub_dict['Content_emb']) for sub_dict in content_dict.values()]
    mean_content_emb = [sum(i) / len(i) for i in zip(*combined_content_list)]
    dictionary_combined[user] = {'Content_emb_mean': mean_content_emb}

print(dictionary_combined)

In [None]:
# Assuming 'dictionary_combined' is the dictionary you created
df_combined = pd.DataFrame(list(dictionary_combined.items()), columns=['User', 'Content'])

# Expand the 'Content' column into separate columns
df_combined = pd.concat([df_combined['User'], pd.DataFrame(df_combined['Content'].to_dict()).T], axis=1)

# Print the resulting DataFrame
print(df_combined)

In [None]:
# Assuming 'dictionary_combined' is the dictionary you created
df_combined = pd.DataFrame(list(dictionary_combined.items()), columns=['User', 'Content'])

# Convert the 'Content' column values to lists
df_combined['Content'] = df_combined['Content'].apply(lambda x: x['Content_emb_mean'])

# Merge with the original DataFrame to get the 'ID' column
df_combined = pd.merge(df_combined, interactions[['User', 'ID']], on='User')

# Reorder columns for better readability (if needed)
df_combined = df_combined[['User', 'ID', 'Content']]

# Print the resulting DataFrame
print(df_combined)

In [None]:
df_combined.to_csv('embeddings/users_emb_extra.csv', index= False)

In [2]:
users = pd.read_csv("embeddings/users_emb_final.csv") #document with user interactions
users.columns =['User', 'ID', 'Interactions_emb']
users.head()

Unnamed: 0,User,ID,Interactions_emb
0,U244,N17157 N38621 N35022 N50578 N264 N9120 N23907 ...,"[-0.005149974951877837, -0.013250857458654631,..."
1,U68369,N19381 N54536,"[0.0025621717686590273, 0.004183989018201828, ..."
2,U50236,N4020 N44292 N50292 N40772 N57737 N33969 N4054...,"[-0.010138329240492436, -0.01179651383115145, ..."
3,U77060,N23105 N41375,"[-0.005568941123783588, -0.025914330035448074,..."
4,U5596,N459 N56253 N62931 N55846 N29849 N45729 N62834...,"[-0.012533644353970886, -0.011675744312297967,..."


In [14]:
user_rows = users[users['User'] == 'U154']
print(user_rows)

Empty DataFrame
Columns: [User, ID, Interactions_emb]
Index: []


In [7]:
users_list = users['User'].tolist()

In [9]:
len(users_list)

156500

In [10]:
from collections import Counter

counter = Counter(users_list)

# Print the repeated values and their occurrences
for value, count in counter.items():
    if count > 1:
        print(f"{value} is repeated {count} times.")

U244 is repeated 5 times.
U68369 is repeated 2 times.
U50236 is repeated 13 times.
U77060 is repeated 2 times.
U5596 is repeated 9 times.
U85030 is repeated 2 times.
U11009 is repeated 2 times.
U87192 is repeated 3 times.
U15896 is repeated 3 times.
U49080 is repeated 16 times.
U22467 is repeated 4 times.
U84890 is repeated 6 times.
U70037 is repeated 5 times.
U16955 is repeated 4 times.
U64013 is repeated 2 times.
U47709 is repeated 6 times.
U39867 is repeated 14 times.
U82161 is repeated 8 times.
U4996 is repeated 3 times.
U78134 is repeated 3 times.
U17723 is repeated 12 times.
U47070 is repeated 5 times.
U47596 is repeated 5 times.
U70545 is repeated 9 times.
U33996 is repeated 6 times.
U65115 is repeated 2 times.
U63844 is repeated 7 times.
U46814 is repeated 3 times.
U93438 is repeated 9 times.
U74989 is repeated 9 times.
U33179 is repeated 2 times.
U37146 is repeated 3 times.
U8430 is repeated 6 times.
U39052 is repeated 3 times.
U54735 is repeated 19 times.
U39200 is repeated 8

In [18]:
filtered_df = users.drop_duplicates(subset='User')

In [19]:
filtered_df.to_csv ("embeddings/users_filtered_final.csv", index = None)

In [15]:
# Calculate the number of unique users in df
num_unique_users = users['User'].unique()

print("Number of unique users:", num_unique_users)

Number of unique users: 49945
