# Reading the data

In [14]:
import pandas as pd
import numpy as np

# 1. Read behaviors.tsv
behaviors_path = '/content/behaviors.tsv'
train_df = pd.read_csv(behaviors_path, sep='\t', header=None, names=[
    "Impression ID", "User ID", "Time", "History", "Impressions"
])
print("Behaviors Data:")
print(train_df.head())

# 2. Read news.tsv
news_path = '/content/news.tsv'
news_df = pd.read_csv(news_path, sep='\t', header=None, names=[
    "News ID", "Category", "SubCategory", "Title", "Abstract", "URL",
    "Title Entities", "Abstract Entities"
])
news_df = news_df.drop(columns=['URL'])
print("\nNews Data:")
print(news_df.head())

# 3. Read entity_embedding.vec
entity_embedding_path = '/content/entity_embedding.vec'
entity_embeddings = {}
with open(entity_embedding_path, 'r') as file:
    for line in file:
        parts = line.strip().split()
        entity_id = parts[0]
        embedding = np.array(parts[1:], dtype=float)
        entity_embeddings[entity_id] = embedding


# 4. Read relation_embedding.vec
relation_embedding_path = '/content/relation_embedding.vec'
relation_embeddings = {}
with open(relation_embedding_path, 'r') as file:
    for line in file:
        parts = line.strip().split()
        relation_id = parts[0]
        embedding = np.array(parts[1:], dtype=float)
        relation_embeddings[relation_id] = embedding



Behaviors Data:
   Impression ID  User ID                    Time  \
0              1   U87243  11/10/2019 11:30:54 AM   
1              2  U598644   11/12/2019 1:45:29 PM   
2              3  U532401  11/13/2019 11:23:03 AM   
3              4  U593596  11/12/2019 12:24:09 PM   
4              5  U239687   11/14/2019 8:03:01 PM   

                                             History  \
0  N8668 N39081 N65259 N79529 N73408 N43615 N2937...   
1  N56056 N8726 N70353 N67998 N83823 N111108 N107...   
2  N128643 N87446 N122948 N9375 N82348 N129412 N5...   
3  N31043 N39592 N4104 N8223 N114581 N92747 N1207...   
4  N65250 N122359 N71723 N53796 N41663 N41484 N11...   

                                         Impressions  
0  N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...  
1  N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...  
2              N103852-0 N53474-0 N127836-0 N47925-1  
3  N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...  
4  N76209-0 N48841-0 N67937-0 N62235-0 N6307-0 N3... 

In [15]:
train_df['Time'] = pd.to_datetime(train_df['Time'])  # Ensure the column is in datetime format
min_date = train_df['Time'].min()
max_date = train_df['Time'].max()

print("Minimum date:", min_date)
print("Maximum date:", max_date)


Minimum date: 2019-11-09 00:00:00
Maximum date: 2019-11-14 23:59:59


In [16]:
# Remove rows where 'History' is NaN to avoid errors during counting
print(train_df.shape)
train_df = train_df.dropna(subset=['History'])
print(train_df.shape)
# Count the number of history items for each User ID
train_df['History Count'] = train_df['History'].apply(lambda x: len(x.split(' ')))

# Display the User ID and corresponding history count
history_counts = train_df[['User ID', 'History Count']]
print(history_counts.head())
train_df.shape

(1968593, 5)
(1927969, 5)
   User ID  History Count
0   U87243             16
1  U598644             24
2  U532401             16
3  U593596             13
4  U239687            339


(1927969, 6)

In [17]:
import pandas as pd
from collections import defaultdict

# 3. Prepare data for collaborative filtering
interaction_data = defaultdict(lambda: defaultdict(int))  # Using defaultdict to store counts of each news_id for each user

# Iterate over each row in the DataFrame
for index, row in train_df.iterrows():
    user_id = row['User ID']
    impressions = row['Impressions']

    # Check if impressions is not NaN
    if pd.notna(impressions):
        impressions = impressions.split()  # Split the impressions string into a list
        for impression in impressions:
            news_id, clicked = impression.split('-')  # Separate news ID and click indicator
            clicked = int(clicked)  # Convert click indicator to integer
            interaction_data[user_id][news_id] += clicked  # Accumulate clicks for the same news_id

# Combine the impressions into one list for each user
combined_interactions = []

# Create the list for each user, including all impressions with their click counts
for user_id, news_dict in interaction_data.items():
    # Create a list of "news_id-click_count" for the user
    user_impressions = [f"{news_id}-{click_count}" for news_id, click_count in news_dict.items()]
    combined_interactions.append((user_id, " ".join(user_impressions)))

# Create a DataFrame from the combined interactions
interactions_df = pd.DataFrame(combined_interactions, columns=['User ID', 'Impressions_new'])

# Merge the new interactions back with the original train_df
train_df = pd.merge(train_df, interactions_df, on='User ID', how='left')

# Now, train_df will contain the 'Impressions_new' column with all impressions for each user
print(train_df.head())  # Display the first few rows of the updated train_df


   Impression ID  User ID                Time  \
0              1   U87243 2019-11-10 11:30:54   
1              2  U598644 2019-11-12 13:45:29   
2              3  U532401 2019-11-13 11:23:03   
3              4  U593596 2019-11-12 12:24:09   
4              5  U239687 2019-11-14 20:03:01   

                                             History  \
0  N8668 N39081 N65259 N79529 N73408 N43615 N2937...   
1  N56056 N8726 N70353 N67998 N83823 N111108 N107...   
2  N128643 N87446 N122948 N9375 N82348 N129412 N5...   
3  N31043 N39592 N4104 N8223 N114581 N92747 N1207...   
4  N65250 N122359 N71723 N53796 N41663 N41484 N11...   

                                         Impressions  History Count  \
0  N78206-0 N26368-0 N7578-0 N58592-0 N19858-0 N5...             16   
1  N47996-0 N82719-0 N117066-0 N8491-0 N123784-0 ...             24   
2              N103852-0 N53474-0 N127836-0 N47925-1             16   
3  N38902-0 N76434-0 N71593-0 N100073-0 N108736-0...             13   
4  N76209-0 N

# Keep unique users

In [18]:
# prompt: get the most recent history for each user and only retain that row

# Group by 'User ID' and get the row with the latest 'Time' for each user
train_df_latest = train_df.loc[train_df.groupby('User ID')['Time'].idxmax()]

train_df_latest.shape

# Remove rows where 'History' is NaN to avoid errors during counting
train_df_latest = train_df_latest.dropna(subset=['History'])

# Count the number of history items for each User ID
train_df_latest['History Count'] = train_df_latest['History'].apply(lambda x: len(x.split(' ')))

# Display the User ID and corresponding history count
history_counts = train_df_latest[['User ID', 'History Count']]
print(history_counts.head())

# Remove duplicate User IDs, keeping the first occurrence
train_df = train_df.drop_duplicates(subset='User ID', keep='first').reset_index(drop=True)


train_df_latest.shape

       User ID  History Count
901532      U0              8
571026      U1             72
613095     U10              3
631603    U100             43
611813   U1000              9


(668745, 7)

In [19]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer



# Convert to DataFrame
train_df_10000 = train_df[:1000]

# Step 1: Split each user's history into lists and handle non-string entries
train_df_10000['History'] = train_df_10000['History'].apply(lambda x: str(x).split() if isinstance(x, str) else [])

# Step 2: Use MultiLabelBinarizer to create the user-item matrix
mlb = MultiLabelBinarizer()
user_item_matrix = pd.DataFrame(mlb.fit_transform(train_df_10000['History']),
                                index=train_df_10000['User ID'],
                                columns=mlb.classes_)

print(user_item_matrix)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_10000['History'] = train_df_10000['History'].apply(lambda x: str(x).split() if isinstance(x, str) else [])


         N100  N100022  N100026  N100033  N100063  N100068  N100080  N100091  \
User ID                                                                        
U87243      0        0        0        0        0        0        0        0   
U598644     0        0        0        0        0        0        0        0   
U532401     0        0        0        0        0        0        0        0   
U593596     0        0        0        0        0        0        0        0   
U239687     0        0        0        1        0        0        0        0   
...       ...      ...      ...      ...      ...      ...      ...      ...   
U115243     0        0        0        0        0        0        0        0   
U311138     0        0        0        0        0        0        1        0   
U383721     0        0        0        0        0        0        0        0   
U307781     0        0        0        0        0        0        0        0   
U552836     0        0        0        0

# Multi Armed Bandit

In [20]:
import numpy as np
import pandas as pd
from ast import literal_eval
from collections import Counter

class PersonalizedNewsRecommender:
    def __init__(self, epsilon, n_recommendations, train_df, all_articles,
                 user_col_name='User ID', history_col_name='History', random_seed=2024):
        np.random.seed(random_seed)

        self.epsilon = epsilon
        self.train_df = train_df
        self.user_col_name = user_col_name
        self.history_col_name = history_col_name
        self.n_recommendations = n_recommendations
        self.all_articles = set(all_articles)  # Complete set of available articles

        # Convert history strings to lists if they're strings
        if self.train_df[self.history_col_name].dtype == 'object':
            self.train_df[self.history_col_name] = self.train_df[self.history_col_name].apply(
                lambda x: literal_eval(x) if isinstance(x, str) else x
            )

        self.users = self.train_df[self.user_col_name].unique()

        # Calculate global item popularity
        self.item_popularity = self._calculate_item_popularity()

    def _calculate_item_popularity(self):
        """Calculate how many times each item appears across all histories"""
        all_items = [item for history in self.train_df[self.history_col_name] for item in history]
        return dict(Counter(all_items))

    def get_recommendations_for_user(self, user_id):
        # Get user's history
        user_history = set(self.train_df[
            self.train_df[self.user_col_name] == user_id
        ][self.history_col_name].iloc[0])

        # Available items (all articles except ones user has already seen)
        available_items = list(self.all_articles - user_history)

        if not available_items:
            return []  # Return empty if no new items available

        recommendations = []
        for _ in range(min(self.n_recommendations, len(available_items))):
            # Exploration or exploitation choice
            if np.random.rand() < self.epsilon:  # Explore
                chosen_item = np.random.choice(available_items)
            else:  # Exploit
                # Choose based on global popularity (default to 0 for never-clicked articles)
                item_scores = {
                    item: self.item_popularity.get(item, 0)
                    for item in available_items
                }
                chosen_item = max(item_scores.items(), key=lambda x: x[1])[0]

            # Add to recommendations and remove from available items
            recommendations.append(chosen_item)
            available_items.remove(chosen_item)

        return recommendations

    def generate_all_recommendations(self):
        """Generate recommendations for all users"""
        all_recommendations = {}

        for i, user_id in enumerate(self.users):
            recommendations = self.get_recommendations_for_user(user_id)
            all_recommendations[user_id] = recommendations

            # Print progress every 1000 users
            if (i + 1) % 1000 == 0:
                print(f"Generated recommendations for {i + 1} users")

        # Convert to DataFrame
        recommendation_records = []
        for user_id, recs in all_recommendations.items():
            for rank, item_id in enumerate(recs, 1):
                recommendation_records.append({
                    'User ID': user_id,
                    'News ID': item_id,
                    'Rank': rank
                })

        recommendations_df = pd.DataFrame(recommendation_records)
        return recommendations_df


In [21]:
# Initialize the recommender
recommender = PersonalizedNewsRecommender(
    epsilon=0.3,  # 10% exploration rate (you can adjust this)
    n_recommendations=50,  # How many recommendations you want per user
    train_df=train_df_10000,  # Keyword argument for the training DataFrame
    all_articles=user_item_matrix.columns  # Keyword argument for all available articles
)

# Generate recommendations for all users
recommendations_df = recommender.generate_all_recommendations()

# Print the recommendations
print("\nRecommendations for all users:")
print(recommendations_df)

# If you want recommendations for a specific user:
specific_user = 'U87243'
user_recs = recommender.get_recommendations_for_user(specific_user)
print(f"\nRecommendations for user {specific_user}:")
print(user_recs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.train_df[self.history_col_name] = self.train_df[self.history_col_name].apply(


Generated recommendations for 1000 users

Recommendations for all users:
       User ID  News ID  Rank
0       U87243   N91597     1
1       U87243    N9375     2
2       U87243   N25725     3
3       U87243   N80126     4
4       U87243  N104737     5
...        ...      ...   ...
49995  U552836   N13604    46
49996  U552836   N50645    47
49997  U552836  N114787    48
49998  U552836    N3435    49
49999  U552836    N7553    50

[50000 rows x 3 columns]

Recommendations for user U87243:
['N91597', 'N9375', 'N80126', 'N74685', 'N104737', 'N88875', 'N45124', 'N128965', 'N17456', 'N71977', 'N128643', 'N72345', 'N54360', 'N1713', 'N82348', 'N79909', 'N96479', 'N72571', 'N79285', 'N86208', 'N18904', 'N22651', 'N73137', 'N90947', 'N93049', 'N72976', 'N18399', 'N58200', 'N454', 'N9740', 'N85544', 'N53360', 'N71643', 'N120031', 'N4289', 'N94737', 'N85484', 'N97355', 'N98095', 'N115167', 'N114571', 'N65119', 'N31043', 'N85056', 'N114848', 'N95066', 'N53933', 'N48876', 'N67369', 'N51166']


In [23]:
def count_correct_recommendations(recommended_news, impressions_new):

    # Split the impressions string into individual "news_id-click_count" pairs
    if isinstance(impressions_new, np.ndarray):
        impressions_new = impressions_new[0]  # Get the string from the array

    # Split the impressions string into individual "news_id-click_count" pairs
    impressions = impressions_new.split()

    # Create a dictionary of impressions where the key is the news_id and value is the click_count
    impressions_dict = {impression.split('-')[0]: int(impression.split('-')[1]) for impression in impressions}

    correct_count = 0
    total_count = 0

    # Iterate over each recommended news ID and check if it appears in the impressions
    for news_id in recommended_news:
        if news_id in impressions_dict:
            total_count += 1
            if impressions_dict[news_id] == 1:  # Check if the news item was clicked
                correct_count += 1

    return correct_count, total_count

In [25]:
mab_correct_count = 0
mab_total_count = 0



for uid in train_df_10000['User ID'].values:
    # Apply find_unseen_news_for_user_item_based (item-based method)
    user_recs = recommender.get_recommendations_for_user(uid)

    impressions_new = train_df_10000[train_df_10000['User ID'] == uid]["Impressions_new"].values
    correct_count_item_based, total_count_item_based = count_correct_recommendations(user_recs, impressions_new)
    mab_correct_count += correct_count_item_based
    mab_total_count += total_count_item_based

print(f"MAB (Method 4):")
print(f"  Correct Count: {mab_correct_count}")
print(f"  Total Count: {mab_total_count}")
print(f"  Accuracy: {mab_correct_count / mab_total_count * 100:.2f}%")


MAB (Method 4):
  Correct Count: 23
  Total Count: 568
  Accuracy: 4.05%
