Precision and recall
https://insidelearningmachines.com/precisionk_and_recallk/#:~:text=Precision%40k%20and%20Recall%40k%20are%20metrics%20used%20to%20evaluate,end%20user%20by%20the%20model.

In [1]:
# imports
import pandas as pd
import numpy as np
from typing import List
import os
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import warnings


In [17]:
# Impression log: tab-separated with no header row, so column names are set by hand.
interactions = pd.read_csv(
    "MIND/behaviors_extra.tsv",
    sep='\t',
    header=None,
)
interactions.columns = ['User', 'Time', 'ID', 'Impressions']
# Drop the 'Time' column before any further processing.
interactions = interactions.drop(columns=['Time'])
interactions.head()

Unnamed: 0,User,ID,Impressions
0,U111,N61837 N39237 N9786 N43620 N61409,N7482-1 N6379-0
1,U222,N13861 N41051 N40272 N24967 N14962,N55689-1 N35729-0
2,U333,N9786 N47214 N24905 N56618 N34406,N20678-0 N39317-0 N58114-0 N20495-0
3,U444,N20336 N30961 N61765 N40969 N11472,N50014-0
4,U555,N59295 N9721 N3574 N43620 N22028,N35729-0 N33632-0 N49685-1 N27581-0


In [16]:
# Load the user-embedding table (columns shown in the preview: User, ID, Content).
interactions_emb = pd.read_csv(
    "embeddings/users_emb_extra.csv"
)
interactions_emb.head()

Unnamed: 0,User,ID,Content
0,U111,N61837 N39237 N9786 N43620 N61409,"[-0.011852237349376082, -0.015658087749034166,..."
1,U222,N13861 N41051 N40272 N24967 N14962,"[0.010383155662566423, -0.0077624950557947155,..."
2,U333,N9786 N47214 N24905 N56618 N34406,"[-0.021996299363672735, -0.010908919479697942,..."
3,U444,N20336 N30961 N61765 N40969 N11472,"[-0.014395372092258185, -0.007225491013377905,..."
4,U555,N59295 N9721 N3574 N43620 N22028,"[-0.006499886885285378, -0.010832596011459828,..."


In [8]:
# Load the news-article table together with its content embeddings.
news = pd.read_csv(
    "embeddings/news_emb_final.csv"
)
news.head()

Unnamed: 0,ID,Category,SubCategory,Content,Content_emb
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","[0.005885085556656122, -0.007782096974551678, ..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat These seemingly ...,"[-0.004876355174928904, -0.007969613187015057,..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,"[-0.02760046347975731, -0.013719998300075531, ..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"[-0.0297758337110281, -0.014837449416518211, 0..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","[0.005073545966297388, 0.004160495940595865, 0..."


Count the impressions labelled -1 and -0 in 'Impressions' and compute their ratio

In [14]:
# Function to count the number of suffixes
def count_suffixes(row, suffix):
    """Return how many whitespace-separated tokens of row['Impressions'] end with `suffix`."""
    tokens = row['Impressions'].split()
    matching = [token for token in tokens if token.endswith(suffix)]
    return len(matching)

In [19]:
# Per-row tallies of impressions ending in "-1" and in "-0".
# The suffix is handed to count_suffixes as its second positional argument.
interactions['-1 Count'] = interactions.apply(count_suffixes, axis=1, args=('-1',))
interactions['-0 Count'] = interactions.apply(count_suffixes, axis=1, args=('-0',))

# Grand totals over all rows.
total_minus_1 = interactions['-1 Count'].sum()
total_minus_0 = interactions['-0 Count'].sum()

print("Total -1 count:", total_minus_1)
print("Total -0 count:", total_minus_0)

Total -1 count: 3
Total -0 count: 11


In [20]:
# Ratio of "-1" impressions to "-0" impressions across all rows.
total_minus_1 / total_minus_0

0.2727272727272727

Calculate precision and recall

In [21]:
def create_user_df(input_df, user):
    """Build a per-impression label table for one user.

    Takes the first row of `input_df` whose 'User' equals `user`, splits its
    'Impressions' string on whitespace and parses every token of the form
    '<news id>-<label>'.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must provide the columns 'User' and 'Impressions'.
    user :
        Value compared against the 'User' column.

    Returns
    -------
    pandas.DataFrame or None
        Columns 'ID' (news id) and 'true_value' (label as int), one row per
        impression token in its original order. None when no row matches
        `user`.

    Raises
    ------
    ValueError
        If a token contains no '-' separator or its label is not an integer.
    """
    user_rows = input_df[input_df['User'] == user]
    if user_rows.empty:
        return None

    tokens = user_rows['Impressions'].values[0].split()

    news_ids = []
    true_values = []
    for token in tokens:
        # Split on the LAST hyphen only, so an ID that itself contains '-'
        # stays intact instead of breaking the two-value unpacking.
        news_id, label = token.rsplit('-', 1)
        news_ids.append(news_id)
        true_values.append(int(label))

    return pd.DataFrame({'ID': news_ids, 'true_value': true_values})

In [22]:
# Every distinct user in the user-embedding table, in first-seen order.
users_list = interactions_emb['User'].drop_duplicates().tolist()

# Show the resulting list of users.
print(users_list)

['U111', 'U222', 'U333', 'U444', 'U555', 'U666']


In [28]:
# recommender_type must be one of: 'content', 'collab', 'hybrid'
def precision_and_recall(list_name, recommender_type, percentile=3):
    """Print the mean weighted precision and recall of a recommender over users.

    For every user in `list_name`, the impression labels taken from the
    notebook-level `interactions` frame are compared with binary predictions
    derived from that user's file
    "content_recommendations/<user>_V2_cont.csv". An impression is predicted
    as 1 when its score is strictly below the `percentile`-th percentile of
    all scores in that file, and as 0 otherwise.

    Parameters
    ----------
    list_name : iterable of str
        Users to evaluate. Users for which `create_user_df` returns None
        (no row in `interactions`) are skipped.
    recommender_type : {'content', 'collab', 'hybrid'}
        Selects the score column: 'distance', 'collaborative_rec' or 'mean'.
    percentile : float, default 3
        Percentile of the score column used as the decision threshold.

    Returns
    -------
    None
        The mean precision and the mean recall are printed, one per line.

    Raises
    ------
    ValueError
        If `recommender_type` is not one of the supported values.

    Notes
    -----
    NOTE(review): the same file name and the same "below the percentile" rule
    are used for all three recommender types. Confirm that the file carries
    all three score columns and that lower values mean a stronger
    recommendation for 'collaborative_rec' and 'mean' as well.
    """
    score_columns = {
        'content': 'distance',
        'collab': 'collaborative_rec',
        'hybrid': 'mean',
    }
    if recommender_type not in score_columns:
        # Fail early with a clear message instead of hitting an unbound
        # `column` variable further down.
        raise ValueError(
            "recommender_type must be one of " + str(sorted(score_columns))
            + ", got " + repr(recommender_type)
        )
    column = score_columns[recommender_type]

    all_precision = []
    all_recall = []

    # Iterate over the argument, not over the notebook-level `users_list`.
    for user in list_name:
        user_df = create_user_df(interactions, user)
        if user_df is None:
            # No impression row for this user: nothing to score.
            continue

        # Named `recommendations` so the notebook-level `news` name is not shadowed.
        recommendations = pd.read_csv("content_recommendations/" + user + "_V2_cont.csv")

        score_by_id = recommendations.set_index('ID')[column].to_dict()
        user_df[column] = user_df['ID'].map(score_by_id)

        threshold = np.percentile(recommendations[column], percentile)

        # A comparison with NaN is False, so an impression without a score in
        # the file is predicted 0 rather than being counted as recommended.
        user_df['predicted_value'] = (user_df[column] < threshold).astype(int)

        true_values = user_df['true_value']
        predicted_values = user_df['predicted_value']

        # Support-weighted average over the label classes.
        precision = precision_score(true_values, predicted_values, average='weighted')
        recall = recall_score(true_values, predicted_values, average='weighted')

        all_precision.append(precision)
        all_recall.append(recall)

    print(np.mean(all_precision))
    print(np.mean(all_recall))

In [29]:
# Suppress every warning from here on: no category or module filter is given,
# so all warning categories are ignored for the rest of the session.
# NOTE(review): presumably meant to hide scikit-learn's undefined-metric
# warnings from the precision/recall calls below — confirm, and consider
# narrowing the filter so unrelated warnings stay visible.
warnings.filterwarnings("ignore")

In [30]:
# Evaluate using the 'distance' score column.
precision_and_recall(list_name=users_list, recommender_type='content')

0.6770833333333334
0.7916666666666666


In [None]:
# Evaluate using the 'collaborative_rec' score column.
precision_and_recall(list_name=users_list, recommender_type='collab')