Precision and recall
https://insidelearningmachines.com/precisionk_and_recallk/#:~:text=Precision%40k%20and%20Recall%40k%20are%20metrics%20used%20to%20evaluate,end%20user%20by%20the%20model.

In [47]:
# imports
import pandas as pd
import numpy as np
from typing import List
import os

In [48]:
behaviors = pd.read_csv("MIND/behaviors_test.csv") 
behaviors = behaviors.drop(['0', '2', '3'], axis=1)
behaviors.columns =['User', 'Impressions']

In [49]:
users = pd.read_csv("data_embeddings/users_emb_test.csv") #document with user interactions
users.head()

Unnamed: 0,User,ID,Interactions_emb
0,U10045,N52865 N22570 N37481 N55189 N16158 N63276 N150...,"[-0.016643907874822617, -0.011585889384150505,..."
1,U10585,N59873 N60894 N19760 N44018 N29499 N24002 N350...,"[-0.008049571886658669, -0.016986606642603874,..."
2,U11306,N18064 N38868 N60340 N31801 N39778 N54842 N110...,"[-0.03472215309739113, -0.024884670972824097, ..."
3,U12328,N1150 N13008 N12833 N1570 N48914 N15254 N27448...,"[-0.015663105994462967, -0.009945932775735855,..."
4,U12957,N41308 N12900 N45020 N22544 N61106 N44940 N520...,"[0.0014848418068140745, -0.02082444727420807, ..."


In [55]:
news = pd.read_csv("hybrid_test.csv") 
news.head()

Unnamed: 0,ID,category,sub_category,content,collaborative_rec,content_rec,mean,two_one
0,N17463,news,newspolitics,Democrats unveil procedures for Trump's impeac...,0.163646,0.263923,0.213784,0.230497
1,N57364,news,newspolitics,Facebook's Zuckerberg grilled by AOC on fact-c...,0.163646,0.308835,0.236241,0.260439
2,N45824,sports,football_nfl_videos,Ian Rapoport: MRI results are good news for Pa...,0.163646,0.22388,0.193763,0.203802
3,N5211,sports,football_nfl,Jets trade DL Leonard Williams to Giants The U...,0.163646,0.228177,0.195912,0.206667
4,N4408,sports,football_nfl,Patrick Mahomes already back at practice for C...,0.163646,0.227315,0.195481,0.206092


In [56]:
# Function to count the number of suffixes
def count_suffixes(row, suffix):
    impressions = row['Impressions'].split()
    count = sum(1 for imp in impressions if imp.endswith(suffix))
    return count

In [58]:
# Counting "-1" and "-0" suffixes
behaviors['-1 Count'] = behaviors.apply(lambda row: count_suffixes(row, '-1'), axis=1)
behaviors['-0 Count'] = behaviors.apply(lambda row: count_suffixes(row, '-0'), axis=1)

# Total count across all users
total_minus_1 = behaviors['-1 Count'].sum()
total_minus_0 = behaviors['-0 Count'].sum()

print("Total -1 count:", total_minus_1)
print("Total -0 count:", total_minus_0)

Total -1 count: 137
Total -0 count: 3670


In [59]:
total_minus_1/total_minus_0

0.03732970027247957

In [51]:
def create_user_df(input_df, user):
    user_row = input_df[input_df['User'] == user]

    if user_row.empty:
        return None

    impressions = user_row['Impressions'].values[0].split()

    news_ids = []
    true_values = []

    for impression in impressions:
        news_id, true_value = impression.split('-')
        news_ids.append(news_id)
        true_values.append(int(true_value))

    user_df = pd.DataFrame({'ID': news_ids, 'true_value': true_values})
    return user_df

In [52]:
# Assuming you have a DataFrame named 'users' with a 'User' column
users_list = users['User'].unique().tolist()

# Print the resulting list
print(users_list)

['U10045', 'U10585', 'U11306', 'U12328', 'U12957', 'U13000', 'U13227', 'U13740', 'U1376', 'U15141', 'U15363', 'U16402', 'U1700', 'U17841', 'U19722', 'U19739', 'U22930', 'U23485', 'U24775', 'U2579', 'U27024', 'U27804', 'U29155', 'U2935', 'U34670', 'U36009', 'U3616', 'U37127', 'U37844', 'U38627', 'U38865', 'U39029', 'U39222', 'U39643', 'U39703', 'U39868', 'U40466', 'U40937', 'U44808', 'U45798', 'U46596', 'U47477', 'U47654', 'U47761', 'U47892', 'U49572', 'U50562', 'U50695', 'U5286', 'U52914', 'U53159', 'U53231', 'U53276', 'U54128', 'U5627', 'U60663', 'U61371', 'U61875', 'U63162', 'U63808', 'U66486', 'U67119', 'U67565', 'U69950', 'U700', 'U70879', 'U73329', 'U73700', 'U7471', 'U78244', 'U78765', 'U78954', 'U79199', 'U7932', 'U80709', 'U80798', 'U8125', 'U81585', 'U8312', 'U8355', 'U83994', 'U85394', 'U86017', 'U89744', 'U91389', 'U91678', 'U91836', 'U92093', 'U92183', 'U92486', 'U9306', 'U9312']


In [53]:
four_perc = []
for i in users_list:
    # Example usage
    user_df = create_user_df(behaviors, i)
    news = pd.read_csv("hybrid_recommendations/" + i + "_hybr.csv") 
    
    # Sorting the 'content_rec' column in ascending order
    sorted_content_rec = news['content_rec'].sort_values(ascending=True)

    # Calculating the index for the value representing the maximum of 4% of the sorted data
    index_4_percent = int(len(sorted_content_rec) * 0.04)  # 4% of the data

    # Getting the value at the calculated index
    value_at_4_percent = sorted_content_rec.iloc[index_4_percent]
    four_perc.append(value_at_4_percent)
    
print (np.mean(four_perc))
    
    

0.21719577695429057


In [54]:
for i in users_list:
    # Example usage
    user_df = create_user_df(behaviors, i)
    news = pd.read_csv("hybrid_recommendations/" + i + "_hybr.csv") 
    
    id_to_new_column = news.set_index('ID')['content_rec'].to_dict()
    user_df['content_rec'] = user_df['ID'].map(id_to_new_column)
    
    print(user_df)

        ID  true_value  content_rec
0   N59673           0          NaN
1   N34876           0          NaN
2   N14592           0          NaN
3   N53470           0          NaN
4   N39010           0          NaN
5   N51048           0          NaN
6   N47061           0          NaN
7   N64094           0          NaN
8    N4913           0          NaN
9   N35233           0          NaN
10  N13907           0          NaN
11  N26795           0          NaN
12  N28213           0          NaN
13    N287           0          NaN
14  N21420           1          NaN
15  N22417           0          NaN
        ID  true_value  content_rec
0   N14592           0          NaN
1   N12029           0          NaN
2   N45704           0          NaN
3    N9806           0          NaN
4   N37497           0          NaN
5   N58114           0          NaN
6   N33677           0          NaN
7   N39317           0          NaN
8   N42977           0          NaN
9   N22407           0      

In [18]:
news = pd.read_csv("hybrid_test.csv") 
news.head()

Unnamed: 0,ID,category,sub_category,content,collaborative_rec,content_rec,mean,two_one
0,N10414,movies,movienews,"Robert Evans, 'Chinatown' Producer and Paramou...",0.246655,0.273961,0.260308,0.264859
1,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,0.256467,0.26315,0.259809,0.260922
2,N63302,lifestyle,lifestylebuzz,This Wedding Photo of a Canine Best Man Captur...,0.259734,0.281452,0.270593,0.274213
3,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...",0.263554,0.271511,0.267533,0.268859
4,N45794,news,newscrime,Four flight attendants were arrested in Miami'...,0.266187,0.272487,0.269337,0.270387


In [60]:
behaviors = pd.read_csv("MIND/behaviors_test.csv") 
behaviors = behaviors.drop(['0', '2', '3'], axis=1)
behaviors.columns =['User', 'Impressions']
behaviors.head()

Unnamed: 0,User,Impressions
0,U13000,N7482-1 N6379-0
1,U13740,N55689-1 N35729-0
2,U91836,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
3,U73700,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
4,U34670,N35729-0 N33632-0 N49685-1 N27581-0


In [20]:
# Calculating the mean of 'content_rec' column
mean_content_rec = news['content_rec'].mean() 

print("Mean of 'content_rec' column:", mean_content_rec)

Mean of 'content_rec' column: 0.2556758361506847


In [47]:
# Sorting the 'content_rec' column in ascending order
sorted_content_rec = news['content_rec'].sort_values(ascending=True)

# Calculating the index for the value representing the maximum of 4% of the sorted data
index_4_percent = int(len(sorted_content_rec) * 0.04)  # 4% of the data

# Getting the value at the calculated index
value_at_4_percent = sorted_content_rec.iloc[index_4_percent]

print("Value at 4% of the sorted 'content_rec' data:", value_at_4_percent)


Value at 4% of the sorted 'content_rec' data: 0.2474820252069627


In [22]:
# Function to count the number of suffixes
def count_suffixes(row, suffix):
    impressions = row['Impressions'].split()
    count = sum(1 for imp in impressions if imp.endswith(suffix))
    return count

In [23]:
# Counting "-1" and "-0" suffixes
behaviors['-1 Count'] = behaviors.apply(lambda row: count_suffixes(row, '-1'), axis=1)
behaviors['-0 Count'] = behaviors.apply(lambda row: count_suffixes(row, '-0'), axis=1)

# Total count across all users
total_minus_1 = behaviors['-1 Count'].sum()
total_minus_0 = behaviors['-0 Count'].sum()

print("Total -1 count:", total_minus_1)
print("Total -0 count:", total_minus_0)

Total -1 count: 137
Total -0 count: 3670


In [24]:
total_minus_1/total_minus_0

0.03732970027247957

In [63]:
def create_user_df(input_df, user):
    user_row = input_df[input_df['User'] == user]

    if user_row.empty:
        return None

    impressions = user_row['Impressions'].values[0].split()

    news_ids = []
    true_values = []

    for impression in impressions:
        news_id, true_value = impression.split('-')
        news_ids.append(news_id)
        true_values.append(int(true_value))

    user_df = pd.DataFrame({'ID': news_ids, 'true_value': true_values})
    return user_df

In [64]:
user_input = 'U13000'
user_df = create_user_df(behaviors, user_input)
print(user_df)


      ID  true_value
0  N7482           1
1  N6379           0


In [65]:
# Create a dictionary mapping IDs to new column values from df2
id_to_new_column = news.set_index('ID')['content_rec'].to_dict()

# Add the new column to df1 based on the mapping
user_df['content_rec'] = user_df['ID'].map(id_to_new_column)

print(user_df)

      ID  true_value  content_rec
0  N7482           1          NaN
1  N6379           0          NaN


In [19]:
# Assuming df is your DataFrame
user_df['predicted_value'] = user_df['content_rec'].apply(lambda x: 0 if x >= 0.04 else 1)

# Display the modified DataFrame
print(user_df)


      ID  true_value  content_rec  predicted_value
0  N7482           1     0.224888                0
1  N6379           0     0.261376                0


In [29]:
###precision###

In [26]:
import pandas as pd
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Assuming df is your DataFrame
true_values = user_df['true_value']
predicted_values = user_df['predicted_value']

# Calculate precision
precision = precision_score(true_values, predicted_values, average='binary')
recall =recall_score(true_values, predicted_values, average='macro')
# Display the mean precision
print(f"Mean Precision: {precision}")

Mean Precision: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
print(f"Mean Recall: {recall}")

Mean Recall: 0.5


In [31]:
###recall###