In [1]:
import ast
import os
import random

import numpy as np
import openai
import pandas as pd
from scipy.stats import pearsonr

In [2]:
# Load the per-user behaviour log (tab-separated, no header row).
behaviors = pd.read_csv("MIND/behaviors.tsv", sep="\t", header=None)
behaviors.columns = ["User", "Time", "ID", "Impressions"]
# Only 'User' and the whitespace-separated news ids in 'ID' are used below.
behaviors = behaviors.drop(columns=["Time", "Impressions"])
behaviors.head()

Unnamed: 0,User,ID
0,U13740,N55189 N42782 N34694 N45794 N18445 N63302 N104...
1,U91836,N31739 N6072 N63045 N23979 N35656 N43353 N8129...
2,U73700,N10732 N25792 N7563 N21087 N41087 N5445 N60384...
3,U34670,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...
4,U8125,N10078 N56514 N14904 N33740


In [3]:
# Load the news table (ID, Category, SubCategory, Content, Sentiment, ...).
news = pd.read_csv("sentiment_analysis/sentiment_final.csv")
# Skip the first column positionally, then discard the 'Content_emb' column.
news = news.iloc[:, 1:].drop(columns=["Content_emb"])
news.head()

Unnamed: 0,ID,Category,SubCategory,Content,Sentiment
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...",Positive
1,N19639,health,weightloss,50 Worst Habits For Belly Fat These seemingly ...,Negative
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Negative
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,Neutral
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...",Positive


In [4]:
def get_user_related_content(user):
    """Return ``(news_id, content)`` pairs for the news ids stored for a user.

    Takes the first row of the global ``behaviors`` frame whose 'User' equals
    ``user``, splits its whitespace-separated 'ID' string and pairs every id
    with the 'Content' of the first matching row in the global ``news`` frame.
    Ids that do not occur in ``news`` are skipped; order and repeats of the
    ids are preserved.

    Parameters
    ----------
    user : str
        User identifier as stored in ``behaviors['User']``.

    Returns
    -------
    list of tuple
        ``[(news_id, content), ...]``. For an unknown user the sentinel
        ``[("No data found for user", user)]`` is returned (unchanged legacy
        behaviour). For a user whose 'ID' cell is missing, ``[]`` is returned.
    """
    user_row = behaviors[behaviors['User'] == user]

    if user_row.empty:
        return [("No data found for user", user)]

    history = user_row['ID'].iloc[0]
    # A missing cell is read by pandas as NaN (a float), which has no .split();
    # treat it as "no ids" instead of crashing with AttributeError.
    if not isinstance(history, str):
        return []
    history_ids = history.split()

    # Build the id -> content lookup once instead of scanning `news` per id.
    # keep='first' mirrors the previous "first matching row wins" behaviour.
    wanted = news[news['ID'].isin(history_ids)].drop_duplicates('ID', keep='first')
    content_by_id = dict(zip(wanted['ID'], wanted['Content']))

    return [(news_id, content_by_id[news_id])
            for news_id in history_ids
            if news_id in content_by_id]

In [5]:
def get_random_news(n):
    """Draw ``n`` distinct random rows of ``news`` as ``(ID, Content)`` pairs.

    Row positions are sampled with ``random.sample`` (so ``random.seed``
    controls the draw) and only the selected rows are read. Columns are
    addressed by name rather than by position, so the result does not depend
    on the column order of the global ``news`` frame.

    Parameters
    ----------
    n : int
        Number of rows to draw.

    Returns
    -------
    list of tuple
        ``[(news_id, content), ...]`` in sampling order.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when ``n`` is negative or larger
        than the number of rows in ``news``.
    """
    # Sample positions, not rows: avoids converting the whole frame to tuples.
    positions = random.sample(range(len(news)), n)
    picked = news.iloc[positions]
    return list(zip(picked['ID'], picked['Content']))


Testing with ChatGPT

In [6]:
# Read the OpenAI API key from the OPENAI_API_KEY environment variable.
# SECURITY: a literal key used to be hardcoded in this cell. Treat that key as
# leaked and revoke it; never store secrets in a notebook.
# A missing variable fails fast with KeyError('OPENAI_API_KEY').
openai.api_key = os.environ["OPENAI_API_KEY"]

In [7]:
# Send one prompt to the chat model and hand back the reply text
def chat_with_chatgpt(prompt, model="gpt-3.5-turbo"):
    """Ask the chat-completion endpoint a single question.

    The request consists of a fixed system message followed by ``prompt`` as
    the user message.

    Parameters
    ----------
    prompt : str
        Text sent as the user message.
    model : str, optional
        Model name forwarded to ``openai.ChatCompletion.create``.

    Returns
    -------
    str
        Content of the first choice, stripped of surrounding whitespace.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(model=model, messages=conversation)
    first_choice = completion['choices'][0]
    return first_choice['message']['content'].strip()

In [14]:
def _parse_id_list(reply):
    """Extract the Python list literal contained in a model reply.

    Text before the first '[' and after the last ']' (code fences, stray
    words) is ignored. Raises ValueError, including the raw reply, when no
    list/tuple/set literal can be parsed.
    """
    cleaned = reply.strip()
    start, end = cleaned.find('['), cleaned.rfind(']')
    if start != -1 and end > start:
        cleaned = cleaned[start:end + 1]
    try:
        # literal_eval only evaluates literals, so model output is never executed.
        parsed = ast.literal_eval(cleaned)
    except (ValueError, SyntaxError) as exc:
        raise ValueError(
            f"Could not parse a list of news ids from the model reply: {reply!r}"
        ) from exc
    if not isinstance(parsed, (list, tuple, set)):
        raise ValueError(
            f"Expected a list of news ids in the model reply, got: {reply!r}"
        )
    return list(parsed)


def gpt_task2(user, n_news, n_recommend=10):
    """Ask the chat model to pick news for ``user`` from random candidates.

    The prompt lists the user's ``(id, content)`` pairs from
    ``get_user_related_content`` and ``n_news`` random candidates from
    ``get_random_news``; the reply is parsed into a list of ids. A message is
    printed telling whether any returned id also appears among the user's ids.

    Parameters
    ----------
    user : str
        User identifier passed to ``get_user_related_content``.
    n_news : int
        Number of random candidate items put in the prompt.
    n_recommend : int, optional
        Number of items the model is asked to return (default 10, the value
        that was previously hard-coded in the prompt).

    Returns
    -------
    list
        The ids returned by the model, as parsed from its reply.

    Raises
    ------
    ValueError
        If the reply does not contain a parsable list literal.
    """
    user_output = get_user_related_content(user)
    random_news = get_random_news(n_news)
    formatted_random_news = [f'{item[0]}: {item[1]}' for item in random_news]
    formatted_user_output = [f'{item[0]}: {item[1]}' for item in user_output]
    user_prompt = (f"The user has interacted with the following items (in no particular order): {formatted_user_output}. Please recommend any {n_recommend} out of {n_news} items from the list: {formatted_random_news} that the user might interact with. Output format: a python list with news index (e.g., N12345). Do not explain the reason or include any other words.")
    chatbot_response = chat_with_chatgpt(user_prompt)
    result_list = _parse_id_list(chatbot_response)

    # Compare bare ids with bare ids. The previous check intersected the
    # formatted "id: content" strings with ids, so it could never find overlap.
    history_ids = {item[0] for item in user_output}
    if history_ids.isdisjoint(result_list):
        print("The lists do not intercross (have no common elements).")
    else:
        print("The lists intercross (have common elements).")
    return result_list

In [60]:
# n is the number of random user IDs you want to select
n = 15
# Sample from the distinct user IDs so that a user occurring in several rows
# of `behaviors` cannot be drawn twice; the fixed random_state makes the
# selection reproducible on Restart & Run All.
random_user_ids = behaviors['User'].drop_duplicates().sample(n, random_state=42).tolist()
print(random_user_ids)

['U93908', 'U38619', 'U81902', 'U73646', 'U31619', 'U51081', 'U11176', 'U30183', 'U20611', 'U54619', 'U19825', 'U23376', 'U38330', 'U76452', 'U69701']


In [85]:
# User under study, and the number of random candidate news items (n_news)
# passed to the analysis functions below.
user_input = "U73646"
n = 20

In [145]:
# Ask the model for recommendations out of 20 random candidates.
gpt_task2(user_input, n_news=20)

The lists do not intercross (have no common elements).


['N14509',
 'N24593',
 'N47508',
 'N4250',
 'N7393',
 'N2527',
 'N35472',
 'N28697',
 'N8384',
 'N49109']

Category correlation

In [11]:
def user_news_ids(user_id):
    """Return the news ids stored for ``user_id`` in the global ``behaviors``.

    The ids come from splitting the whitespace-separated 'ID' string of the
    first row whose 'User' equals ``user_id``.

    Parameters
    ----------
    user_id : str
        User identifier as stored in ``behaviors['User']``.

    Returns
    -------
    list of str
        The ids in stored order. An empty list is returned when the user is
        not present (a message is printed) or when the 'ID' cell is missing.
    """
    user_row = behaviors[behaviors['User'] == user_id]

    if user_row.empty:
        print(f"User ID {user_id} not found in the DataFrame.")
        return []

    history = user_row['ID'].iloc[0]
    # A missing cell is read by pandas as NaN (a float), which has no .split().
    if not isinstance(history, str):
        return []
    return history.split()

In [15]:
def category_correlation(user, n_news):
    """Correlate the 'Category' mix of a user's news with the recommended news.

    Computes the share of each 'Category' among the user's news (ids from
    ``user_news_ids``) and among the news returned by ``gpt_task2``, aligns
    both on the union of categories (missing share = 0) and runs a Pearson
    correlation on the two share vectors. The aligned table and the result
    are printed.

    Parameters
    ----------
    user : str
        User identifier.
    n_news : int
        Number of random candidates forwarded to ``gpt_task2``.

    Returns
    -------
    tuple
        ``(correlation, p_value)``; both are NaN when ``pearsonr`` rejects the
        input with ValueError (e.g. fewer than two categories).
    """
    user_output = user_news_ids(user)
    user_news_df = news[news['ID'].isin(user_output)]
    user_ratio = user_news_df['Category'].value_counts(normalize=True)

    recommended_ids = gpt_task2(user, n_news)
    user_recom_df = news[news['ID'].isin(recommended_ids)]
    # Normalise by the number of recommended items actually found in `news`
    # rather than a hard-coded 10, so the shares always sum to 1.
    ratio_recom = user_recom_df['Category'].value_counts(normalize=True)

    merged_df = pd.concat(
        [ratio_recom.rename('recommend'), user_ratio.rename('profiles')],
        axis=1, sort=False,
    ).fillna(0)

    # Defaults keep the names bound when pearsonr raises; previously the
    # print below failed with NameError in that case.
    correlation = p_value = float('nan')
    try:
        correlation, p_value = pearsonr(merged_df['recommend'], merged_df['profiles'])
    except ValueError as e:
        print("An error occurred:", e)
        print("Skipping correlation calculation due to insufficient data.")

    print(merged_df)
    print(correlation, p_value)

    return correlation, p_value

In [86]:
# Category-level agreement between the user's news and the recommendations.
category_correlation(user=user_input, n_news=n)

The lists do not intercross (have no common elements).
         recommend  profiles
sports         0.4       0.5
weather        0.2       0.5
tv             0.1       0.0
news           0.1       0.0
travel         0.1       0.0
autos          0.1       0.0
0.8528028654224418 0.030905834747225285


(0.8528028654224418, 0.030905834747225285)

Subcategory correlation 

In [17]:
def subcategory_correlation(user, n_news):
    """Correlate the 'SubCategory' mix of a user's news with the recommended news.

    Computes the share of each 'SubCategory' among the user's news (ids from
    ``user_news_ids``) and among the news returned by ``gpt_task2``, aligns
    both on the union of subcategories (missing share = 0) and runs a Pearson
    correlation on the two share vectors. The aligned table and the result
    are printed.

    Parameters
    ----------
    user : str
        User identifier.
    n_news : int
        Number of random candidates forwarded to ``gpt_task2``.

    Returns
    -------
    tuple
        ``(correlation, p_value)``, matching ``category_correlation``; both
        are NaN when ``pearsonr`` rejects the input with ValueError.
    """
    user_output = user_news_ids(user)
    user_news_df = news[news['ID'].isin(user_output)]
    user_ratio = user_news_df['SubCategory'].value_counts(normalize=True)

    recommended_ids = gpt_task2(user, n_news)
    user_recom_df = news[news['ID'].isin(recommended_ids)]
    # Normalise by the number of recommended items actually found in `news`
    # rather than a hard-coded 10, so the shares always sum to 1.
    ratio_recom = user_recom_df['SubCategory'].value_counts(normalize=True)

    merged_df = pd.concat(
        [ratio_recom.rename('recommend'), user_ratio.rename('profiles')],
        axis=1, sort=False,
    ).fillna(0)

    # Defaults keep the names bound when pearsonr raises; previously the
    # print below failed with NameError in that case.
    correlation = p_value = float('nan')
    try:
        correlation, p_value = pearsonr(merged_df['recommend'], merged_df['profiles'])
    except ValueError as e:
        print("An error occurred:", e)
        print("Skipping correlation calculation due to insufficient data.")

    print(merged_df)
    print(correlation, p_value)

    return correlation, p_value
    

In [87]:
# Subcategory-level agreement between the user's news and the recommendations.
subcategory_correlation(user=user_input, n_news=n)

The lists do not intercross (have no common elements).
                          recommend  profiles
newsus                          0.2       0.0
lifestyledidyouknow             0.1       0.0
tv-celebrity                    0.1       0.0
newsscienceandtechnology        0.1       0.0
football_ncaa                   0.1       0.0
finance-companies               0.1       0.0
newspolitics                    0.1       0.0
tvnews                          0.1       0.0
weathertopstories               0.1       0.5
boxing                          0.0       0.5
-0.5590169943749473 0.09297070164703322


Sentiment correlation

In [19]:
def sentiment_correlation(user, n_news):
    """Correlate the 'Sentiment' mix of a user's news with the recommended news.

    Computes the share of each 'Sentiment' label among the user's news (ids
    from ``user_news_ids``) and among the news returned by ``gpt_task2``,
    aligns both on the union of labels (missing share = 0) and runs a Pearson
    correlation on the two share vectors. The aligned table and the result
    are printed.

    Parameters
    ----------
    user : str
        User identifier.
    n_news : int
        Number of random candidates forwarded to ``gpt_task2``.

    Returns
    -------
    tuple
        ``(correlation, p_value)``, matching ``category_correlation``; both
        are NaN when ``pearsonr`` rejects the input with ValueError.
    """
    user_output = user_news_ids(user)
    user_news_df = news[news['ID'].isin(user_output)]
    user_ratio = user_news_df['Sentiment'].value_counts(normalize=True)

    recommended_ids = gpt_task2(user, n_news)
    user_recom_df = news[news['ID'].isin(recommended_ids)]
    # Normalise by the number of recommended items actually found in `news`
    # rather than a hard-coded 10, so the shares always sum to 1.
    ratio_recom = user_recom_df['Sentiment'].value_counts(normalize=True)

    merged_df = pd.concat(
        [ratio_recom.rename('recommend'), user_ratio.rename('profiles')],
        axis=1, sort=False,
    ).fillna(0)

    # Defaults keep the names bound when pearsonr raises; previously the
    # print below failed with NameError in that case.
    correlation = p_value = float('nan')
    try:
        correlation, p_value = pearsonr(merged_df['recommend'], merged_df['profiles'])
    except ValueError as e:
        print("An error occurred:", e)
        print("Skipping correlation calculation due to insufficient data.")

    print(merged_df)
    print(correlation, p_value)

    return correlation, p_value
    

In [88]:
# Sentiment-level agreement between the user's news and the recommendations.
sentiment_correlation(user=user_input, n_news=n)

The lists do not intercross (have no common elements).
          recommend  profiles
Positive        0.5       0.5
Neutral         0.3       0.0
Negative        0.2       0.5
0.18898223650461368 0.8789622816763234


Results: all

In [89]:
# Load the saved per-user correlation / p-value table (semicolon-separated).
chatGPT_task2 = pd.read_csv("ChatGPT_results/ChatGPT_task2.csv", sep=";")
chatGPT_task2

Unnamed: 0,User,Category (cor),Category (p-value),Subcategory (cor),Subcategory (p-value),Sentiment (cor),Sentiment (p-value)
0,U44960,-0.086,0.872,-0.645,0.043,0.5,0.666
1,U85741,-0.538,0.135,0.142,0.715,1.0,0.001
2,U57744,-0.255,0.423,-0.801,0.0001,0.93,0.238
3,U6392,0.829,0.021,-0.189,0.652,0.359,0.766
4,U18478,-0.146,0.754,-0.828,0.0003,0.573,0.612
5,U77839,-0.299,0.513,-0.833,0.001,0.5,0.666
6,U44139,0.192,0.679,-0.781,0.004,0.982,0.121
7,U26671,0.258,0.742,-0.833,0.001,0.933,0.234
8,U93908,-0.161,0.76,-0.522,0.045,0.945,0.212
9,U38619,0.813,0.008,-0.282,0.308,0.996,0.059


In [90]:
# Mean of the 'Sentiment (cor)' column across users.
# The result used to be stored only as `mean_category_cor`, although no
# category column is read here; that name is kept as an alias so any cell
# still referring to it keeps working.
mean_sentiment_cor = chatGPT_task2['Sentiment (cor)'].mean()
mean_category_cor = mean_sentiment_cor
mean_sentiment_cor

0.6420666666666667

In [91]:
# Mean of the 'Sentiment (p-value)' column across users.
# The result used to be stored only as `mean_category_p`, although no
# category column is read here; that name is kept as an alias so any cell
# still referring to it keeps working.
mean_sentiment_p = chatGPT_task2['Sentiment (p-value)'].mean()
mean_category_p = mean_sentiment_p
mean_sentiment_p

0.4550666666666667