# Initialization

In [None]:
# Initialize Libraries
import numpy as np
import pandas as pd
import seaborn
import matplotlib.pyplot as plt
import os
!pip install -U sentence_transformers
import gc

In [None]:
# Mount Google Drive inside the Colab runtime so the data files and cached
# artifacts used below can be read from and written to it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Load and explore the data set

In [None]:
# Load the news catalogue. The file is read with header=None, so the column
# labels are attached explicitly after loading.
path = '/content/drive/MyDrive/2024 Spring/Text Mining/Projects/Project4/'
NEWS_COLUMNS = [
    'News_ID', 'Category', 'Subcategory', 'Title',
    'Abstract', 'URL', 'Title_entities', 'Abstract_entities',
]
news_file = path + 'news.tsv'
news_df = pd.read_csv(news_file, sep='\t', header=None)
news_df.columns = NEWS_COLUMNS
news_df.info()
news_df.head()

In [None]:
# Persist the labelled table to a separate file. Writing it back to 'news.tsv'
# would add a header row to the raw input, and because that file is loaded with
# header=None the header would be ingested as a data record on the next
# Restart & Run All.
news_df.to_csv(path + 'news_with_header.tsv', sep="\t", index=False)

In [None]:
# Load the impression log from behaviors.tsv. The file is read with
# header=None, so the column labels are attached explicitly after loading.
BEHAVIOR_COLUMNS = ['Impression_ID', 'User_ID', 'Time', 'History', 'Impressions']
behaviors_file = path + 'behaviors.tsv'
behaviors_df = pd.read_csv(behaviors_file, sep='\t', header=None)
behaviors_df.columns = BEHAVIOR_COLUMNS
behaviors_df.info()
behaviors_df.head()

# Feature Engineering

In [None]:
# Map each News_ID to the text that will be embedded: title, abstract,
# category and subcategory joined by single spaces.
# Missing fields are replaced by empty strings and skipped, so a NaN abstract
# no longer ends up in the text as the literal word "nan".
text_columns = ['Title', 'Abstract', 'Category', 'Subcategory']
news_text_parts = news_df[text_columns].fillna('').astype(str)

# Iterating plain tuples is much cheaper than DataFrame.iterrows(), which
# builds a Series for every row.
news_text_dict = {
    news_id: ' '.join(part for part in parts if part)
    for news_id, parts in zip(
        news_df['News_ID'],
        news_text_parts.itertuples(index=False, name=None),
    )
}

In [None]:
# For the behaviors file

# Drop every row that has no history or no impressions.
behaviors_df = behaviors_df.dropna(subset=["History", "Impressions"])

# Build a long table with one row per (user, candidate) pair:
#   id             - the user
#   last_news      - the last news ID in the user's history
#   recommendation - the candidate news ID shown in the impression
#   clicked        - the label following the final '-' of the impression token
behavior_table_rows = []

for user_id, history, impressions in zip(
    behaviors_df['User_ID'], behaviors_df['History'], behaviors_df['Impressions']
):
    history_items = history.split()
    if not history_items:
        # A whitespace-only history has no "last news" to compare against.
        continue
    # Computed once per impression log row rather than once per candidate.
    last_news = history_items[-1]

    for impression in impressions.split():
        # Split on the LAST dash so a news ID containing '-' stays intact.
        recommendation, clicked = impression.rsplit('-', 1)
        behavior_table_rows.append([user_id, last_news, recommendation, clicked])

behavior_table = pd.DataFrame(
    behavior_table_rows,
    columns=['id', 'last_news', 'recommendation', 'clicked'],
)
behavior_table.head()

In [None]:
# Attach the embedding text for both sides of every pair by looking the news
# IDs up in news_text_dict (IDs missing from the dictionary map to NaN).
for id_column, text_column in (
    ("last_news", "last_news_text"),
    ("recommendation", "recommendation_text"),
):
    behavior_table[text_column] = behavior_table[id_column].map(news_text_dict)
behavior_table.head()

# Create Embeddings

In [None]:
# Load the pretrained sentence-embedding model used to encode the news texts.
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# For the behavior table: embed the text of the last-read article and of each
# candidate recommendation, one embedding row per behavior_table row.
news_embeddings_path = path + 'last_news_embeddings.npy'
recommend_embeddings_path = path + 'recommend_embeddings.npy'


def _load_or_encode(cache_path, texts):
    """Return one embedding row per entry of `texts`, backed by an .npy cache.

    Parameters
    ----------
    cache_path : str
        Location of the cached array. If the file exists it is loaded and
        returned; otherwise the embeddings are computed and saved there.
    texts : pandas.Series of str
        Texts to embed, in table row order.

    Returns
    -------
    numpy.ndarray
        Array whose first dimension lines up with `texts` when freshly
        computed. A cached array is returned as stored; a row-count mismatch
        is reported but not treated as fatal.
    """
    if os.path.exists(cache_path):
        cached = np.load(cache_path)
        if cached.shape[0] != len(texts):
            # A cache built from a different table cannot be aligned row by row.
            print(
                f"WARNING: {cache_path} holds {cached.shape[0]} rows but the table has "
                f"{len(texts)}; delete the file to rebuild the embeddings."
            )
        return cached

    # The same article text appears in many rows, so encode each distinct text
    # once and fan the result back out by row. Values may differ from a
    # row-by-row encode only by floating-point noise from different batching.
    codes, unique_texts = pd.factorize(texts)
    unique_embeddings = model.encode(
        unique_texts.tolist(), batch_size=32, show_progress_bar=True
    )
    embeddings = unique_embeddings[codes]
    np.save(cache_path, embeddings)
    return embeddings


last_news_embeddings = _load_or_encode(
    news_embeddings_path, behavior_table['last_news_text'].astype(str)
)
recommend_embeddings = _load_or_encode(
    recommend_embeddings_path, behavior_table['recommendation_text'].astype(str)
)

# Show both shapes; only the final expression of a cell is displayed.
last_news_embeddings.shape, recommend_embeddings.shape

# Determine Scores!

In [None]:
from numpy.linalg import norm

behavior_scored_path = path + 'behavior_scored.pkl'

# Row-wise cosine similarity between the last-read article and each candidate.
# The scored table is cached as a pickle: compute and save it when the cache is
# missing, otherwise load the cached table in place of the current one.
if not os.path.exists(behavior_scored_path):
    pairwise_dot = (last_news_embeddings * recommend_embeddings).sum(axis=1)
    norm_product = norm(last_news_embeddings, axis=1) * norm(recommend_embeddings, axis=1)

    behavior_table["cosine_score"] = pairwise_dot / norm_product
    behavior_table.to_pickle(behavior_scored_path)
else:
    behavior_table = pd.read_pickle(behavior_scored_path)

# The click labels were parsed from text, so convert them to integers.
behavior_table["clicked"] = behavior_table["clicked"].astype(int)

In [None]:
# Display the behavior table after scoring.
behavior_table

In [None]:
# Precision @ n and MRR function
def precision_and_mrr(df, n):
    """Score a ranking per user and average the results over users.

    Each user's rows are ranked by ``cosine_score`` in descending order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id`` (user), ``cosine_score`` and
        ``clicked`` (1 for a clicked item, 0 otherwise).
    n : int
        Cutoff for the top-n check.

    Returns
    -------
    tuple
        ``(precision, mrr)`` where

        * ``precision`` is the fraction of users with at least one clicked item
          among their top ``n`` rows (a per-user hit rate, despite the name);
        * ``mrr`` is the mean over users of ``1 / rank`` of the first clicked
          item, counting 0 for a user with no clicked item.

        Both values are 0 when ``df`` contains no users.
    """
    users_with_hit = 0
    users_total = 0
    reciprocal_ranks = []

    for _, group in df.groupby("id"):  # id = user_id
        # Rank once and reuse the ordering for both metrics.
        ranked_clicks = group.sort_values("cosine_score", ascending=False)["clicked"].tolist()

        users_total += 1
        if sum(ranked_clicks[:n]) > 0:
            users_with_hit += 1

        # 1-based rank of the first clicked item, or None when nothing was clicked.
        first_hit_rank = next(
            (rank for rank, label in enumerate(ranked_clicks, start=1) if label == 1),
            None,
        )
        reciprocal_ranks.append(1 / first_hit_rank if first_hit_rank is not None else 0)

    if users_total == 0:
        return 0, 0
    return users_with_hit / users_total, sum(reciprocal_ranks) / len(reciprocal_ranks)


# Evaluate the cosine-score ranking with a cutoff of the top 10 candidates per user.
n = 10
precision, mrr = precision_and_mrr(behavior_table, n)
print(f"Precision @ {n} : {precision:.4f}")
print(f"MRR : {mrr:.4f}")