Precision and recall
Reference (Precision@k and Recall@k): https://insidelearningmachines.com/precisionk_and_recallk/

In [1]:
# imports
import pandas as pd
import numpy as np
from typing import List
import os
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import warnings


In [8]:
# Read the tab-separated behaviors file; it has no header row, so name the columns by hand.
interactions = pd.read_csv("MIND/behaviorsTEST.tsv", sep='\t', header=None)
interactions.columns = ['User', 'Time', 'ID', 'Impressions']
# 'Time' is not used anywhere below, so drop it right away.
interactions = interactions.drop(columns=['Time'])
interactions.head()

Unnamed: 0,User,ID,Impressions
0,U13740,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,U91836,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,U73700,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...


In [5]:
# Per-user interaction embeddings.
# Bound to its own name: the later cells read interactions['Impressions'] from the
# behaviors frame, and this file has no 'Impressions' column, so rebinding
# `interactions` here would break them when the notebook is run top to bottom.
interactions_emb = pd.read_csv("embeddings/users_emb_TEST.csv")
interactions_emb.columns = ['User', 'ID', 'Interactions_emb']
interactions_emb.head()

Unnamed: 0,User,ID,Interactions_emb
0,U13740,N55189 N42782 N34694 N45794 N18445 N63302 N104...,"[-0.015360403599010574, -0.022621901123784482,..."
1,U91836,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,"[-0.005392600822233362, -0.004443325935426401,..."
2,U73700,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,"[-0.004993118400064608, -0.008147992255787054,..."


In [4]:
# Load the news articles table (one row per article, including its content embedding)
news = pd.read_csv("embeddings/news_emb_TEST.csv") # document with news content
news.head()

Unnamed: 0,ID,Category,SubCategory,Content,Content_emb
0,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","[-0.020967688411474228, -0.020634232088923454,..."
1,N42782,sports,baseball_mlb,Three takeaways from Yankees' ALCS Game 5 vict...,"[-0.008319429121911526, 0.0013667173916473985,..."
2,N34694,tv,tvnews,Rosie O'Donnell: Barbara Walters Isn't 'Up to ...,"[-0.042908210307359695, -0.018993420526385307,..."
3,N45794,news,newscrime,Four flight attendants were arrested in Miami'...,"[-0.007930373772978783, -0.0221870094537735, 0..."
4,N18445,sports,football_ncaa,Michigan sends breakup tweet to Notre Dame as ...,"[-0.020153459161520004, -0.044797133654356, 0...."


Count the clicked (`-1`) and non-clicked (`-0`) entries in 'Impressions' and compute their ratio

In [6]:
# Function to count the number of suffixes
def count_suffixes(row, suffix):
    """Count the impression tokens of one row that end with ``suffix``.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'Impressions'`` (e.g. a DataFrame row) whose
        value is a whitespace-separated string of tokens.
    suffix : str
        Ending to look for, e.g. ``'-1'`` or ``'-0'``.

    Returns
    -------
    int
        Number of tokens in ``row['Impressions']`` ending with ``suffix``.
    """
    tokens = row['Impressions'].split()
    matching = [token for token in tokens if token.endswith(suffix)]
    return len(matching)

In [10]:
# Per-row tallies of impressions ending in "-1" and in "-0"
interactions['-1 Count'] = interactions.apply(count_suffixes, axis=1, args=('-1',))
interactions['-0 Count'] = interactions.apply(count_suffixes, axis=1, args=('-0',))

# Sum the per-row tallies over every user
total_minus_1 = interactions['-1 Count'].sum()
total_minus_0 = interactions['-0 Count'].sum()

print("Total -1 count:", total_minus_1)
print("Total -0 count:", total_minus_0)

Total -1 count: 3
Total -0 count: 46


In [11]:
# Ratio of "-1" impressions to "-0" impressions over all users (a ratio, not a share of the total)
total_minus_1/total_minus_0

0.06521739130434782

Calculate precision and recall

In [12]:
def create_user_df(input_df, user):
    """Build a frame of one user's impressed article IDs and their labels.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame with a ``'User'`` column and an ``'Impressions'`` column holding
        whitespace-separated ``<article id>-<label>`` tokens.
    user : str
        Value to look up in the ``'User'`` column.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``'ID'`` (article id, str) and ``'true_value'`` (label, int),
        one row per impression token. ``None`` when ``user`` has no row in
        ``input_df``.

    Raises
    ------
    ValueError
        If a token has no hyphen or its label part is not an integer.
    """
    user_row = input_df[input_df['User'] == user]

    if user_row.empty:
        return None

    # Only the first matching row is read; any further rows for the same user
    # are ignored. NOTE(review): confirm that is intended if users can repeat.
    impressions = user_row['Impressions'].values[0].split()

    news_ids = []
    true_values = []

    for impression in impressions:
        # Split on the LAST hyphen only, so an article id that itself contains
        # a hyphen still parses instead of failing the two-name unpacking.
        news_id, true_value = impression.rsplit('-', 1)
        news_ids.append(news_id)
        true_values.append(int(true_value))

    return pd.DataFrame({'ID': news_ids, 'true_value': true_values})

In [13]:
def list_ids_in_folder(folder_path):
    """Return the distinct IDs encoded in the ``.csv`` file names of a folder.

    For every file whose name ends in ``.csv``, the ID is the text before the
    first underscore with its first character removed (``'U123_hybrid.csv'``
    gives ``'123'``). Other files are ignored.

    Parameters
    ----------
    folder_path : str
        Directory to scan (not recursive).

    Returns
    -------
    list of str
        Unique IDs in ascending string order.
    """
    ids = set()

    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):
            # Text before the first '_' minus its leading character
            ids.add(filename.split("_")[0][1:])

    # Sorted so that callers slicing the result (e.g. "first N users") get the
    # same users on every run; iteration order of a set of strings is not
    # stable across interpreter sessions.
    return sorted(ids)

In [14]:
folder_path = 'hybrid_recommendations'
# list_ids_in_folder drops the first character of each file-name prefix; put the 'U' back
users_list = ['U' + num for num in list_ids_in_folder(folder_path)]
len(users_list)

3

In [16]:
def precision_and_recall(list_name, recommender_type, percentile=4, max_users=10):
    """Print the mean macro precision and mean macro recall over a set of users.

    For each user the impression labels are parsed from the module-level
    ``interactions`` frame with ``create_user_df``, the user's recommendation
    file is read, and each impressed article is given the score found for its
    ID in that file. An article is predicted as clicked (1) when its score is
    strictly below the ``percentile``-th percentile of the file's score column,
    otherwise as not clicked (0).
    NOTE(review): this treats a lower score as a better match, which fits the
    'Distance' / 'article_distance' column names -- confirm for 'hybrid'.

    Parameters
    ----------
    list_name : sequence of str
        User IDs to evaluate, e.g. ``['U13740', ...]``.
    recommender_type : {'content', 'collab', 'hybrid'}
        Selects the recommendation folder, file suffix and score column.
    percentile : float, default 4
        Percentile of the score column used as the decision threshold.
    max_users : int or None, default 10
        Evaluate at most this many users from the start of ``list_name``;
        ``None`` evaluates all of them.

    Returns
    -------
    None
        The mean precision and then the mean recall are printed.

    Raises
    ------
    ValueError
        If ``recommender_type`` is not one of the supported values.
    """
    # recommender type -> (score column, folder prefix, file suffix)
    sources = {
        'content': ('Distance', "content_recommendations/", "_content.csv"),
        'collab': ('article_distance', "collaborative_recommendations/", "_collab.csv"),
        'hybrid': ('hybrid', "hybrid_recommendations/", "_hybrid.csv"),
    }
    if recommender_type not in sources:
        raise ValueError(
            "recommender_type must be one of %s, got %r"
            % (sorted(sources), recommender_type)
        )
    column, folder, suffix = sources[recommender_type]

    # Evaluate the users that were passed in, not a module-level list.
    users = list(list_name)
    if max_users is not None:
        users = users[:max_users]

    all_precision = []
    all_recall = []

    for user in users:
        user_df = create_user_df(interactions, user)
        if user_df is None or user_df.empty:
            # No impressions for this user, so there is nothing to score.
            continue

        # Sort by the column that is actually being evaluated.
        recommendations = pd.read_csv(folder + user + suffix).sort_values(by=column)

        score_by_id = recommendations.set_index('ID')[column].to_dict()
        user_df[column] = user_df['ID'].map(score_by_id)

        threshold = np.percentile(recommendations[column], percentile)

        # Articles missing from the recommendation file have a NaN score; the
        # comparison is False for NaN, so they are predicted as not clicked.
        user_df['predicted_value'] = (user_df[column] < threshold).astype(int)

        true_values = user_df['true_value']
        predicted_values = user_df['predicted_value']

        # zero_division=0 yields the same value as the default but without
        # emitting a warning when a class has no predicted or true samples.
        precision = precision_score(true_values, predicted_values, average='macro', zero_division=0)
        recall = recall_score(true_values, predicted_values, average='macro', zero_division=0)

        all_precision.append(precision)
        all_recall.append(recall)

    print(np.mean(all_precision))
    print(np.mean(all_recall))

In [17]:
# Suppress all warnings from here on (no category or module restriction is given)
warnings.filterwarnings("ignore")

In [18]:
# Evaluate the hybrid recommender; prints the mean precision, then the mean recall
precision_and_recall(users_list, "hybrid")

0.3967532467532468
0.49523809523809526
