In [1]:
# imports
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import os

In [2]:
#users = interactions_emb
interactions = pd.read_csv("embeddings/users_filtered_final.csv") #document with user interactions
interactions.head()

Unnamed: 0,User,ID,Interactions_emb
0,U244,N17157 N38621 N35022 N50578 N264 N9120 N23907 ...,"[-0.005149974951877837, -0.013250857458654631,..."
1,U68369,N19381 N54536,"[0.0025621717686590273, 0.004183989018201828, ..."
2,U50236,N4020 N44292 N50292 N40772 N57737 N33969 N4054...,"[-0.010138329240492436, -0.01179651383115145, ..."
3,U77060,N23105 N41375,"[-0.005568941123783588, -0.025914330035448074,..."
4,U5596,N459 N56253 N62931 N55846 N29849 N45729 N62834...,"[-0.012533644353970886, -0.011675744312297967,..."


In [3]:
#load the data with news articles
news = pd.read_csv("sentiment_analysis/sentiment_final.csv") #document with user interactions
news = news.iloc[:, 1:] 
news = news.drop(['Content_emb'], axis=1)
news.head()

Unnamed: 0,ID,Category,SubCategory,Content,Sentiment
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...",Positive
1,N19639,health,weightloss,50 Worst Habits For Belly Fat These seemingly ...,Negative
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Negative
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,Neutral
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...",Positive


In [4]:
def list_ids_in_folder(folder_path):
    ids = set()
    
    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):
            # Extracting the ID from the filename
            file_id = filename.split("_")[0][1:]
            ids.add(file_id)
    
    ids = ['U' + num for num in ids]       
    
    return list(ids)

In [5]:
folder_path = 'content_recommendations'
users_list = list_ids_in_folder(folder_path)
len(users_list)

49053

Category correlation

In [5]:
def create_category_df(specific_category, n_rec, rec_type):
    category_dfs = pd.DataFrame(columns=['User ID', 'Recommend', 'Profiles'])
    if rec_type == "collaborative":
        users_list = list_ids_in_folder('collaborative_recommendations')
    
    if rec_type == "content": 
        users_list = list_ids_in_folder('content_recommendations')
    
    #for user_id in users_list[2500:5000]:
    for user_id in users_list:
        if rec_type == "collaborative":
            item_path = "collaborative_recommendations/" + user_id + "_collab.csv"
            
        if rec_type == "content":
            item_path = "content_recommendations/" + user_id + "_content.csv"      
        
        read_articles = interactions[interactions['User'] == user_id]['ID'].str.split().explode().tolist()
        user_news_df = news[news['ID'].isin(read_articles)]

        #ratio for articles, user interacted with
        user_spec_df = user_news_df[user_news_df['Category'] == specific_category]
        
        categ_counts_user = user_spec_df['Category'].value_counts()
        total_counts = user_news_df['Category'].value_counts().sum()
        user_cat_ratio = categ_counts_user/total_counts
        
        #ratio for recommended articles
        recommendation_df = pd.read_csv(item_path)
        recommendations = recommendation_df.head(n_rec)
        rec_list = recommendations['ID'].tolist() 
        user_news_rec = news[news['ID'].isin(rec_list)]      
        user_news_rec = user_news_rec[user_news_rec['Category'] == specific_category]

        rec_counts_user = user_news_rec['Category'].value_counts()
        rec_ratio = rec_counts_user/n_rec
        
        merged_df = pd.concat([user_cat_ratio, rec_ratio], axis=1, sort=False).fillna(0)
        merged_df.columns = ['Recommend', 'Profiles']
        merged_df['User ID'] = user_id
        
        category_dfs = pd.concat([category_dfs, merged_df], ignore_index=True)
        
    return category_dfs

In [6]:
def calculate_category_correlation(specific_category, n_rec, rec_type):
    category_df = create_category_df(specific_category, n_rec, rec_type)
    recommend = category_df['Recommend']
    profiles = category_df['Profiles']
    
    correlation, p_value = pearsonr(recommend, profiles)

    print("Pearson correlation coefficient:", correlation)
    print("P-value:", p_value)
    
    return correlation, p_value
    

In [9]:
unique_categories = news['Category'].unique()

In [10]:
unique_categories

array(['lifestyle', 'health', 'news', 'sports', 'weather',
       'entertainment', 'autos', 'travel', 'foodanddrink', 'tv',
       'finance', 'movies', 'video', 'music', 'kids', 'middleeast',
       'northamerica'], dtype=object)

Subcategory correlation 

In [10]:
def create_subcategory_df(specific_category, n_rec, rec_type):
    category_dfs = pd.DataFrame(columns=['User ID', 'Recommend', 'Profiles'])
    if rec_type == "collaborative":
        users_list = list_ids_in_folder('collaborative_recommendations')
    
    if rec_type == "content": 
        users_list = list_ids_in_folder('content_recommendations')
    
    for user_id in users_list[:100]:
    # for user_id in users_list:
        if rec_type == "collaborative":
            item_path = "collaborative_recommendations/" + user_id + "_collab.csv"
            
        if rec_type == "content":
            item_path = "content_recommendations/" + user_id + "_content.csv"      
        
        read_articles = interactions[interactions['User'] == user_id]['ID'].str.split().explode().tolist()
        user_news_df = news[news['ID'].isin(read_articles)]

        #ratio for articles, user interacted with
        user_spec_df = user_news_df[user_news_df['SubCategory'] == specific_category]
        
        categ_counts_user = user_spec_df['SubCategory'].value_counts()
        total_counts = user_news_df['SubCategory'].value_counts().sum()
        user_cat_ratio = categ_counts_user/total_counts
        
        #ratio for recommended articles
        recommendation_df = pd.read_csv(item_path)
        recommendations = recommendation_df.head(n_rec)
        rec_list = recommendations['ID'].tolist() 
        user_news_rec = news[news['ID'].isin(rec_list)]      
        user_news_rec = user_news_rec[user_news_rec['SubCategory'] == specific_category]

        rec_counts_user = user_news_rec['SubCategory'].value_counts()
        rec_ratio = rec_counts_user/n_rec
        
        merged_df = pd.concat([user_cat_ratio, rec_ratio], axis=1, sort=False).fillna(0)
        merged_df.columns = ['Recommend', 'Profiles']
        merged_df['User ID'] = user_id
        
        category_dfs = pd.concat([category_dfs, merged_df], ignore_index=True)
        
    return category_dfs

In [6]:
def calculate_subcategory_correlation(specific_category, n_rec, rec_type):
    category_df = create_subcategory_df(specific_category, n_rec, rec_type)
    recommend = category_df['Recommend']
    profiles = category_df['Profiles']
    
    correlation, p_value = pearsonr(recommend, profiles)

    print("Pearson correlation coefficient:", correlation)
    print("P-value:", p_value)
    
    return correlation, p_value

In [13]:
category_counts = news['SubCategory'].value_counts()

# Sort the categories by their counts in descending order
sorted_categories = category_counts.sort_values(ascending=False)

# Print the n most frequent categories
n = 10  # Change this to your desired number of most frequent categories
top_n_subcategories = sorted_categories.head(n)
print("Top", n, "most frequent categories:")
print(top_n_subcategories)

Top 10 most frequent categories:
newsus                      6564
football_nfl                5420
newspolitics                2826
newscrime                   2254
weathertopstories           2047
newsworld                   1720
football_ncaa               1665
baseball_mlb                1661
basketball_nba              1555
newsscienceandtechnology    1210
Name: SubCategory, dtype: int64


Sentiment correlation

In [28]:
def create_sentiment_df(specific_category, n_rec, rec_type):
    category_dfs = pd.DataFrame(columns=['User ID', 'Recommend', 'Profiles'])
    if rec_type == "collaborative":
        users_list = list_ids_in_folder('collaborative_recommendations')
    
    if rec_type == "content": 
        users_list = list_ids_in_folder('content_recommendations')
    
    for user_id in users_list[5000:6000]:
    #for user_id in users_list:
        if rec_type == "collaborative":
            item_path = "collaborative_recommendations/" + user_id + "_collab.csv"
            
        if rec_type == "content":
            item_path = "content_recommendations/" + user_id + "_content.csv"      
        
        read_articles = interactions[interactions['User'] == user_id]['ID'].str.split().explode().tolist()
        user_news_df = news[news['ID'].isin(read_articles)]

        #ratio for articles, user interacted with
        user_spec_df = user_news_df[user_news_df['Sentiment'] == specific_category]
        
        categ_counts_user = user_spec_df['Sentiment'].value_counts()
        total_counts = user_news_df['Sentiment'].value_counts().sum()
        user_cat_ratio = categ_counts_user/total_counts
        
        #ratio for recommended articles
        recommendation_df = pd.read_csv(item_path)
        recommendations = recommendation_df.head(n_rec)
        rec_list = recommendations['ID'].tolist() 
        user_news_rec = news[news['ID'].isin(rec_list)]      
        user_news_rec = user_news_rec[user_news_rec['Sentiment'] == specific_category]

        rec_counts_user = user_news_rec['Sentiment'].value_counts()
        rec_ratio = rec_counts_user/n_rec
        
        merged_df = pd.concat([user_cat_ratio, rec_ratio], axis=1, sort=False).fillna(0)
        merged_df.columns = ['Recommend', 'Profiles']
        merged_df['User ID'] = user_id
        
        category_dfs = pd.concat([category_dfs, merged_df], ignore_index=True)
        
    return category_dfs

In [11]:
def calculate_sentiment_correlation(specific_category, n_rec, rec_type):
    category_df = create_sentiment_df(specific_category, n_rec, rec_type)
    recommend = category_df['Recommend']
    profiles = category_df['Profiles']
    
    correlation, p_value = pearsonr(recommend, profiles)

    print("Pearson correlation coefficient:", correlation)
    print("P-value:", p_value)
    
    return correlation, p_value

In [8]:
unique_sentiments = news['Sentiment'].unique()

In [9]:
unique_sentiments

array(['Positive', 'Negative', 'Neutral'], dtype=object)

In [25]:
calculate_sentiment_correlation("Neutral", 20, "collaborative")

Pearson correlation coefficient: -0.03102053269937157
P-value: 0.3300307435412626


(-0.03102053269937157, 0.3300307435412626)

In [7]:
#users = interactions_emb
recommendations = pd.read_csv("recommendations_20/collaborative_rec_20.csv") #document with user interactions
recommendations.head()

Unnamed: 0,User_ID,ID,article_distance
0,U32888,N21114,0.00509
1,U32888,N23192,0.00509
2,U32888,N40186,0.005099
3,U32888,N16658,0.005217
4,U32888,N29769,0.005217


In [8]:
def create_category_df(specific_category, rec_type):
    recommendations_df = recommendations
    category_dfs = pd.DataFrame(columns=['User ID', 'Recommend', 'Profiles'])
    if rec_type == "collaborative":
        users_list = list_ids_in_folder('collaborative_recommendations')
    if rec_type == "content": 
        users_list = list_ids_in_folder('content_recommendations')
    
    total_users = len(users_list)
    processed_users = 0
    
    for user_id in users_list:
    #for user_id in users_list[:100]:  # Process only the first 100 users for demonstration
        processed_users += 1
        print(f"Processing user {processed_users} of {total_users}")

        user_recommendations = recommendations_df[recommendations_df['User_ID'] == user_id]
        
        read_articles = interactions[interactions['User'] == user_id]['ID'].str.split().explode().tolist()
        user_news_df = news[news['ID'].isin(read_articles)]

        # Ratio for articles user interacted with
        user_spec_df = user_news_df[user_news_df['Category'] == specific_category]
        categ_counts_user = user_spec_df['Category'].value_counts()
        total_counts = user_news_df['Category'].value_counts().sum()
        user_cat_ratio = categ_counts_user / total_counts
        
        # Ratio for recommended articles
        rec_list = user_recommendations['ID'].tolist() 
        user_news_rec = news[news['ID'].isin(rec_list)]      
        user_news_rec = user_news_rec[user_news_rec['Category'] == specific_category]
        rec_counts_user = user_news_rec['Category'].value_counts()
        rec_ratio = rec_counts_user / 20
        
        merged_df = pd.concat([user_cat_ratio, rec_ratio], axis=1, sort=False).fillna(0)
        merged_df.columns = ['Recommend', 'Profiles']
        merged_df['User ID'] = user_id
        
        category_dfs = pd.concat([category_dfs, merged_df], ignore_index=True)
        
    return category_dfs


In [9]:
def calculate_category_correlation(specific_category, rec_type):
    category_df = create_category_df(specific_category, rec_type)
    recommend = category_df['Recommend']
    profiles = category_df['Profiles']
    
    correlation, p_value = pearsonr(recommend, profiles)

    print("Pearson correlation coefficient:", correlation)
    print("P-value:", p_value)
    
    return correlation, p_value
    

In [13]:
calculate_category_correlation("weather", "collaborative")

Processing user 1 of 49053
Processing user 2 of 49053
Processing user 3 of 49053
Processing user 4 of 49053
Processing user 5 of 49053
Processing user 6 of 49053
Processing user 7 of 49053
Processing user 8 of 49053
Processing user 9 of 49053
Processing user 10 of 49053
Processing user 11 of 49053
Processing user 12 of 49053
Processing user 13 of 49053
Processing user 14 of 49053
Processing user 15 of 49053
Processing user 16 of 49053
Processing user 17 of 49053
Processing user 18 of 49053
Processing user 19 of 49053
Processing user 20 of 49053
Processing user 21 of 49053
Processing user 22 of 49053
Processing user 23 of 49053
Processing user 24 of 49053
Processing user 25 of 49053
Processing user 26 of 49053
Processing user 27 of 49053
Processing user 28 of 49053
Processing user 29 of 49053
Processing user 30 of 49053
Processing user 31 of 49053
Processing user 32 of 49053
Processing user 33 of 49053
Processing user 34 of 49053
Processing user 35 of 49053
Processing user 36 of 49053
P

(0.3042571573343274, 0.0)

In [10]:
def create_sentiment_df(specific_category, rec_type):
    recommendations_df = recommendations
    category_dfs = pd.DataFrame(columns=['User ID', 'Recommend', 'Profiles'])
    if rec_type == "collaborative":
        users_list = list_ids_in_folder('collaborative_recommendations')
    if rec_type == "content": 
        users_list = list_ids_in_folder('content_recommendations')
    
    total_users = len(users_list)
    processed_users = 0
    
    for user_id in users_list:
    #for user_id in users_list[:100]:  # Process only the first 100 users for demonstration
        processed_users += 1
        print(f"Processing user {processed_users} of {total_users}")

        user_recommendations = recommendations_df[recommendations_df['User_ID'] == user_id]
        
        read_articles = interactions[interactions['User'] == user_id]['ID'].str.split().explode().tolist()
        user_news_df = news[news['ID'].isin(read_articles)]

        # Ratio for articles user interacted with
        user_spec_df = user_news_df[user_news_df['Sentiment'] == specific_category]
        categ_counts_user = user_spec_df['Sentiment'].value_counts()
        total_counts = user_news_df['Sentiment'].value_counts().sum()
        user_cat_ratio = categ_counts_user / total_counts
        
        # Ratio for recommended articles
        rec_list = user_recommendations['ID'].tolist() 
        user_news_rec = news[news['ID'].isin(rec_list)]      
        user_news_rec = user_news_rec[user_news_rec['Sentiment'] == specific_category]
        rec_counts_user = user_news_rec['Sentiment'].value_counts()
        rec_ratio = rec_counts_user / 20
        
        merged_df = pd.concat([user_cat_ratio, rec_ratio], axis=1, sort=False).fillna(0)
        merged_df.columns = ['Recommend', 'Profiles']
        merged_df['User ID'] = user_id
        
        category_dfs = pd.concat([category_dfs, merged_df], ignore_index=True)
        
    return category_dfs


In [11]:
def calculate_sentiment_correlation(specific_category, rec_type):
    category_df = create_sentiment_df(specific_category, rec_type)
    recommend = category_df['Recommend']
    profiles = category_df['Profiles']
    
    correlation, p_value = pearsonr(recommend, profiles)

    print("Pearson correlation coefficient:", correlation)
    print("P-value:", p_value)
    
    return correlation, p_value

In [12]:
calculate_sentiment_correlation("Positive", "content")

Processing user 1 of 49053
Processing user 2 of 49053
Processing user 3 of 49053
Processing user 4 of 49053
Processing user 5 of 49053
Processing user 6 of 49053
Processing user 7 of 49053
Processing user 8 of 49053
Processing user 9 of 49053
Processing user 10 of 49053
Processing user 11 of 49053
Processing user 12 of 49053
Processing user 13 of 49053
Processing user 14 of 49053
Processing user 15 of 49053
Processing user 16 of 49053
Processing user 17 of 49053
Processing user 18 of 49053
Processing user 19 of 49053
Processing user 20 of 49053
Processing user 21 of 49053
Processing user 22 of 49053
Processing user 23 of 49053
Processing user 24 of 49053
Processing user 25 of 49053
Processing user 26 of 49053
Processing user 27 of 49053
Processing user 28 of 49053
Processing user 29 of 49053
Processing user 30 of 49053
Processing user 31 of 49053
Processing user 32 of 49053
Processing user 33 of 49053
Processing user 34 of 49053
Processing user 35 of 49053
Processing user 36 of 49053
P

(0.29679913349610465, 0.0)

In [13]:
calculate_sentiment_correlation("Negative", "content")

Processing user 1 of 49053
Processing user 2 of 49053
Processing user 3 of 49053
Processing user 4 of 49053
Processing user 5 of 49053
Processing user 6 of 49053
Processing user 7 of 49053
Processing user 8 of 49053
Processing user 9 of 49053
Processing user 10 of 49053
Processing user 11 of 49053
Processing user 12 of 49053
Processing user 13 of 49053
Processing user 14 of 49053
Processing user 15 of 49053
Processing user 16 of 49053
Processing user 17 of 49053
Processing user 18 of 49053
Processing user 19 of 49053
Processing user 20 of 49053
Processing user 21 of 49053
Processing user 22 of 49053
Processing user 23 of 49053
Processing user 24 of 49053
Processing user 25 of 49053
Processing user 26 of 49053
Processing user 27 of 49053
Processing user 28 of 49053
Processing user 29 of 49053
Processing user 30 of 49053
Processing user 31 of 49053
Processing user 32 of 49053
Processing user 33 of 49053
Processing user 34 of 49053
Processing user 35 of 49053
Processing user 36 of 49053
P

(0.2991333524375468, 0.0)

In [14]:
calculate_sentiment_correlation("Neutral", "content")

Processing user 1 of 49053
Processing user 2 of 49053
Processing user 3 of 49053
Processing user 4 of 49053
Processing user 5 of 49053
Processing user 6 of 49053
Processing user 7 of 49053
Processing user 8 of 49053
Processing user 9 of 49053
Processing user 10 of 49053
Processing user 11 of 49053
Processing user 12 of 49053
Processing user 13 of 49053
Processing user 14 of 49053
Processing user 15 of 49053
Processing user 16 of 49053
Processing user 17 of 49053
Processing user 18 of 49053
Processing user 19 of 49053
Processing user 20 of 49053
Processing user 21 of 49053
Processing user 22 of 49053
Processing user 23 of 49053
Processing user 24 of 49053
Processing user 25 of 49053
Processing user 26 of 49053
Processing user 27 of 49053
Processing user 28 of 49053
Processing user 29 of 49053
Processing user 30 of 49053
Processing user 31 of 49053
Processing user 32 of 49053
Processing user 33 of 49053
Processing user 34 of 49053
Processing user 35 of 49053
Processing user 36 of 49053
P

(0.14549855462945266, 6.695898038847733e-229)