In [1]:
import pandas as pd
import random
import openai 
import ast
from scipy.stats import pearsonr
import numpy as np

In [2]:
#load the data with user behaviors
behaviors = pd.read_csv("MIND/behaviors.tsv",sep='\t',  header=None)
behaviors.columns =['User', 'Time', 'ID', 'Impressions'] 
behaviors = behaviors.drop(['Time', 'Impressions'], axis=1)
behaviors.head()

Unnamed: 0,User,ID
0,U13740,N55189 N42782 N34694 N45794 N18445 N63302 N104...
1,U91836,N31739 N6072 N63045 N23979 N35656 N43353 N8129...
2,U73700,N10732 N25792 N7563 N21087 N41087 N5445 N60384...
3,U34670,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...
4,U8125,N10078 N56514 N14904 N33740


In [3]:
news = pd.read_csv("sentiment_analysis/sentiment_final.csv") #document with user interactions
news = news.iloc[:, 1:] 
news = news.drop(['Content_emb'], axis=1)
news.head()

Unnamed: 0,ID,Category,SubCategory,Content,Sentiment
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...",Positive
1,N19639,health,weightloss,50 Worst Habits For Belly Fat These seemingly ...,Negative
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Negative
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,Neutral
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...",Positive


In [4]:
def get_user_related_content(user):
    user_row = behaviors[behaviors['User'] == user]
    
    if user_row.empty:
        return [("No data found for user", user)]
    
    user_ids = user_row['ID'].iloc[0].split()
    
    result = []
    
    for user_id in user_ids:
        id_row = news[news['ID'] == user_id]
        
        if not id_row.empty:
            content = id_row['Content'].iloc[0]
            result.append((user_id, content))
    
    return result

In [14]:
def get_random_news(n):
    random_news = random.sample(list(news.itertuples(index=False, name=None)), n)
    return [(news[0], news[3]) for news in random_news]


Testing with ChatGPT

In [6]:
# Set your OpenAI API key
openai.api_key = "your key"

In [7]:
# Function to chat with GPT
def chat_with_chatgpt(prompt, model="gpt-3.5-turbo"):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    message = response['choices'][0]['message']['content'].strip()
    return message

In [104]:
def gpt_task1 (user, n_news):
    user_output = get_user_related_content(user)
    random_news = get_random_news (n_news)
    formatted_random_news = [f'{item[0]}: {item[1]}' for item in random_news]
    formatted_user_output = [f'{item[0]}: {item[1]}' for item in user_output]
    user_prompt = (f"The user has interacted with the following items (in no particular order): {formatted_user_output}. Basing on user interactions, sort items from this list {formatted_random_news} in order of priority, from highest to lowest. Output format: a python list with news index (e.g., N12345). Do not explain the reason or include any other words.")
    
    chatbot_response = chat_with_chatgpt(user_prompt)
    
    # Remove leading and trailing whitespaces and newline characters
    cleaned_string = chatbot_response.strip()

    # Use ast.literal_eval to safely evaluate the string as a literal expression
    result_list = ast.literal_eval(cleaned_string)
    set1 = set(formatted_user_output)
    set2 = set(result_list[:10])
    if not set1.intersection(set2):
        print("The lists do not intercross (have no common elements).")
    else:
        print("The lists intercross (have common elements).")
    return (result_list[:10])

In [177]:
# n is the number of random user IDs you want to select
n = 15
# Select n random user IDs from the DataFrame
random_user_ids = behaviors['User'].sample(n).tolist()
print(random_user_ids)

['U77998', 'U65373', 'U56177', 'U92420', 'U33354', 'U82167', 'U742', 'U25153', 'U67080', 'U52004', 'U29672', 'U28036', 'U12491', 'U36887', 'U7042']


In [182]:
user_input = 'U65373'
n=20

In [179]:
gpt_task1(user_input, 20)

The lists do not intercross (have no common elements).


['N11177',
 'N49365',
 'N18109',
 'N995',
 'N63906',
 'N16606',
 'N60615',
 'N21242',
 'N36751',
 'N24298']

Category correlation

In [16]:
def user_news_ids(user_id,):
    # Find the row corresponding to the user ID
    user_row = behaviors[behaviors['User'] == user_id]

    # If the user ID exists in the DataFrame
    if not user_row.empty:
        # Split the IDs from the 'ID' column
        news_ids = user_row['ID'].iloc[0].split()
        return news_ids
    else:
        print(f"User ID {user_id} not found in the DataFrame.")
        return []

In [49]:
def category_correlation(user, n_news):
    user_output = user_news_ids(user)
    user_news_df = news[news['ID'].isin(user_output)]
    categ_counts_user = user_news_df['Category'].value_counts()
    total_counts = user_news_df['Category'].value_counts().sum()
    user_ration = categ_counts_user/total_counts
    
    list_of_recommebdations_t1 = gpt_task1(user, n_news)
    user_recom_df = news[news['ID'].isin(list_of_recommebdations_t1)]
    categ_counts_recom = user_recom_df['Category'].value_counts()
    ratio_recom = categ_counts_recom/10
    
    merged_df = pd.concat([ratio_recom, user_ration], axis=1, sort=False).fillna(0)
    merged_df.columns = ['recommend', 'profiles']
    
    try:
        correlation, p_value = pearsonr(merged_df['recommend'], merged_df['profiles'])
    except ValueError as e:
         print("An error occurred:", e)
         print("Skipping correlation calculation due to insufficient data.")
    
    #print(categ_counts_user)
    print(merged_df)
    print(correlation, p_value )
    
    return correlation, p_value

In [183]:
category_correlation(user_input, n)

The lists do not intercross (have no common elements).
              recommend  profiles
news                0.6  0.500000
sports              0.2  0.166667
travel              0.1  0.083333
foodanddrink        0.0  0.083333
finance             0.0  0.083333
lifestyle           0.0  0.083333
0.9807232952358079 0.0005538054915083992


(0.9807232952358079, 0.0005538054915083992)

Subcategory correlation 

In [73]:
def subcategory_correlation(user, n_news):
    user_output = user_news_ids(user)
    user_news_df = news[news['ID'].isin(user_output)]
    categ_counts_user = user_news_df['SubCategory'].value_counts()
    total_counts = user_news_df['SubCategory'].value_counts().sum()
    user_ratio = categ_counts_user/total_counts
    
    list_of_recommebdations_t1 = gpt_task1(user, n_news)
    user_recom_df = news[news['ID'].isin(list_of_recommebdations_t1)]
    categ_counts_recom = user_recom_df['SubCategory'].value_counts()
    ratio_recom = categ_counts_recom/10
    
    merged_df = pd.concat([ratio_recom, user_ratio], axis=1, sort=False).fillna(0)
    merged_df.columns = ['recommend', 'profiles']
    
    try:
        correlation, p_value = pearsonr(merged_df['recommend'], merged_df['profiles'])
    except ValueError as e:
         print("An error occurred:", e)
         print("Skipping correlation calculation due to insufficient data.")
    
    print(merged_df)
    print(correlation, p_value )
    

In [184]:
subcategory_correlation(user_input, n)

The lists do not intercross (have no common elements).
                   recommend  profiles
newsus                   0.3  0.250000
tipsandtricks            0.1  0.083333
football_nfl             0.1  0.000000
finance-education        0.1  0.083333
lifestyleroyals          0.1  0.083333
newscrime                0.1  0.250000
football_ncaa            0.1  0.166667
weathertopstories        0.1  0.000000
travelnews               0.0  0.083333
0.5244044240850756 0.14723645680908504


Sentiment correlation

In [76]:
def sentiment_correlation(user, n_news):
    user_output = user_news_ids(user)
    user_news_df = news[news['ID'].isin(user_output)]
    categ_counts_user = user_news_df['Sentiment'].value_counts()
    total_counts = user_news_df['Sentiment'].value_counts().sum()
    user_ratio = categ_counts_user/total_counts
    
    list_of_recommebdations_t1 = gpt_task1(user, n_news)
    user_recom_df = news[news['ID'].isin(list_of_recommebdations_t1)]
    categ_counts_recom = user_recom_df['Sentiment'].value_counts()
    ratio_recom = categ_counts_recom/10
    
    merged_df = pd.concat([ratio_recom, user_ratio], axis=1, sort=False).fillna(0)
    merged_df.columns = ['recommend', 'profiles']
    
    try:
        correlation, p_value = pearsonr(merged_df['recommend'], merged_df['profiles'])
    except ValueError as e:
         print("An error occurred:", e)
         print("Skipping correlation calculation due to insufficient data.")
    
    print(merged_df)
    print(correlation, p_value )
    

In [None]:
sentiment_correlation(user_input, n)

Results: all

In [3]:
chatGPT_task1 = pd.read_csv("ChatGPT_results/ChatGPT_task1.csv", sep=';') #document with user interactions

In [4]:
chatGPT_task1

Unnamed: 0,User,Category (cor),Category (p-value),Subcategory (cor),Subcategory (p-value),Sentiment (cor),Sentiment (p-value)
0,U32982,0.647,0.043,0.404,0.049,0.998,0.029
1,U73479,0.746,0.021,0.336,0.187,0.866,0.333
2,U22148,-0.09,0.83,-0.001,0.999,0.628,0.567
3,U31366,0.731,0.0393,0.684,0.009,0.755,0.454
4,U39842,0.404,0.152,0.816,0.001,0.992,0.08
5,U25752,-0.311,0.323,-0.345,0.061,0.861,0.338
6,U71024,0.494,0.102,0.145,0.478,0.755,0.454
7,U41501,-0.359,0.307,0.024,0.917,0.59,0.598
8,U62086,0.989,0.0003,0.476,0.118,0.826,0.381
9,U33947,-0.171,0.745,-0.745,0.008,0.944,0.212


In [5]:
mean_category_cor = chatGPT_task1['Category (cor)'].mean()
mean_category_cor

0.31666666666666665

In [6]:
mean_category_p = chatGPT_task1['Category (p-value)'].mean()
mean_category_p 

0.29741333333333336