In [41]:
import pandas as pd
import random
import openai 
import ast
from scipy.stats import pearsonr

In [42]:
#load the data with user behaviors
behaviors = pd.read_csv("MIND/behaviorsTEST.tsv",sep='\t',  header=None)
behaviors.columns =['User', 'Time', 'ID', 'Impressions'] 
behaviors = behaviors.drop(['Time', 'Impressions'], axis=1)
behaviors.head()

Unnamed: 0,User,ID
0,U13740,N55189 N42782 N34694 N45794 N18445 N63302 N104...
1,U91836,N31739 N6072 N63045 N23979 N35656 N43353 N8129...
2,U73700,N10732 N25792 N7563 N21087 N41087 N5445 N60384...


In [43]:
news = pd.read_csv("sentiment_analysis/sentiment_TEST.csv") #document with user interactions
news = news.iloc[:, 1:] 
news = news.drop(['Content_emb'], axis=1)
news.head()

Unnamed: 0,ID,Category,SubCategory,Content,Sentiment
0,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...",Positive
1,N42782,sports,baseball_mlb,Three takeaways from Yankees' ALCS Game 5 vict...,Positive
2,N34694,tv,tvnews,Rosie O'Donnell: Barbara Walters Isn't 'Up to ...,Positive
3,N45794,news,newscrime,Four flight attendants were arrested in Miami'...,Positive
4,N18445,sports,football_ncaa,Michigan sends breakup tweet to Notre Dame as ...,Positive


In [44]:
def get_user_related_content(user):
    user_row = behaviors[behaviors['User'] == user]
    
    if user_row.empty:
        return [("No data found for user", user)]
    
    user_ids = user_row['ID'].iloc[0].split()
    
    result = []
    
    for user_id in user_ids:
        id_row = news[news['ID'] == user_id]
        
        if not id_row.empty:
            content = id_row['Content'].iloc[0]
            result.append((user_id, content))
    
    return result

In [45]:
def get_random_news(n):
    random_news = random.sample(list(news.itertuples(index=False, name=None)), n)
    return [(news[0], news[3]) for news in random_news]


Testing with ChatGPT

In [46]:
# Set your OpenAI API key
openai.api_key = "your_api_key"

In [47]:
# Function to chat with GPT
def chat_with_chatgpt(prompt, model="gpt-3.5-turbo"):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ],
    )
    message = response['choices'][0]['message']['content'].strip()
    return message

In [48]:
def gpt_task1 (user, n_news):
    user_output = get_user_related_content(user)
    random_news = get_random_news (n_news)
    formatted_random_news = [f'{item[0]}: {item[1]}' for item in random_news]
    formatted_user_output = [f'{item[0]}: {item[1]}' for item in user_output]
    user_prompt = (f"The user has interacted with the following items (in no particular order): {formatted_user_output}. Basing on user interactions, sort items from this list {formatted_random_news} in order of priority, from highest to lowest. Output format: a python list with news index (e.g., N12345). Do not explain the reason or include any other words.")
    
    chatbot_response = chat_with_chatgpt(user_prompt)
    
    # Remove leading and trailing whitespaces and newline characters
    cleaned_string = chatbot_response.strip()

    # Use ast.literal_eval to safely evaluate the string as a literal expression
    result_list = ast.literal_eval(cleaned_string)
    set1 = set(formatted_user_output)
    set2 = set(result_list[:10])
    if not set1.intersection(set2):
        print("The lists do not intercross (have no common elements).")
    else:
        print("The lists intercross (have common elements).")
    return (result_list[:10])

In [49]:
# n is the number of random user IDs you want to select
n = 1
# Select n random user IDs from the DataFrame
random_user_ids = behaviors['User'].sample(n).tolist()
print(random_user_ids)

['U13740']


In [50]:
user_input = 'U13740'
n=20

In [51]:
gpt_task1(user_input, 20)

The lists do not intercross (have no common elements).


['N13131',
 'N58715',
 'N47346',
 'N48722',
 'N29739',
 'N3142',
 'N59685',
 'N7023',
 'N51570',
 'N21623']

Category correlation

In [52]:
def user_news_ids(user_id,):
    # Find the row corresponding to the user ID
    user_row = behaviors[behaviors['User'] == user_id]

    # If the user ID exists in the DataFrame
    if not user_row.empty:
        # Split the IDs from the 'ID' column
        news_ids = user_row['ID'].iloc[0].split()
        return news_ids
    else:
        print(f"User ID {user_id} not found in the DataFrame.")
        return []

In [53]:
def category_correlation(user, n_news):
    user_output = user_news_ids(user)
    user_news_df = news[news['ID'].isin(user_output)]
    categ_counts_user = user_news_df['Category'].value_counts()
    total_counts = user_news_df['Category'].value_counts().sum()
    user_ration = categ_counts_user/total_counts
    
    list_of_recommebdations_t1 = gpt_task1(user, n_news)
    user_recom_df = news[news['ID'].isin(list_of_recommebdations_t1)]
    categ_counts_recom = user_recom_df['Category'].value_counts()
    ratio_recom = categ_counts_recom/10
    
    merged_df = pd.concat([ratio_recom, user_ration], axis=1, sort=False).fillna(0)
    merged_df.columns = ['recommend', 'profiles']
    
    try:
        correlation, p_value = pearsonr(merged_df['recommend'], merged_df['profiles'])
    except ValueError as e:
         print("An error occurred:", e)
         print("Skipping correlation calculation due to insufficient data.")
    
    #print(categ_counts_user)
    print(merged_df)
    print(correlation, p_value )
    
    return correlation, p_value

In [54]:
category_correlation(user_input, n)

The lists do not intercross (have no common elements).
              recommend  profiles
Category                         
news                0.4  0.333333
sports              0.3  0.222222
lifestyle           0.1  0.111111
health              0.1  0.000000
foodanddrink        0.1  0.000000
tv                  0.0  0.222222
movies              0.0  0.111111
0.6085989058527204 0.1469936618489937


(0.6085989058527204, 0.1469936618489937)

Subcategory correlation 

In [55]:
def subcategory_correlation(user, n_news):
    user_output = user_news_ids(user)
    user_news_df = news[news['ID'].isin(user_output)]
    categ_counts_user = user_news_df['SubCategory'].value_counts()
    total_counts = user_news_df['SubCategory'].value_counts().sum()
    user_ratio = categ_counts_user/total_counts
    
    list_of_recommebdations_t1 = gpt_task1(user, n_news)
    user_recom_df = news[news['ID'].isin(list_of_recommebdations_t1)]
    categ_counts_recom = user_recom_df['SubCategory'].value_counts()
    ratio_recom = categ_counts_recom/10
    
    merged_df = pd.concat([ratio_recom, user_ratio], axis=1, sort=False).fillna(0)
    merged_df.columns = ['recommend', 'profiles']
    
    try:
        correlation, p_value = pearsonr(merged_df['recommend'], merged_df['profiles'])
    except ValueError as e:
         print("An error occurred:", e)
         print("Skipping correlation calculation due to insufficient data.")
    
    print(merged_df)
    print(correlation, p_value )
    

In [56]:
subcategory_correlation(user_input, n)

The lists do not intercross (have no common elements).
                        recommend  profiles
SubCategory                                
newsus                        0.3  0.000000
markets                       0.1  0.000000
lifestyle-news-feature        0.1  0.000000
travelnews                    0.1  0.000000
newscrime                     0.1  0.111111
football_nfl                  0.1  0.000000
finance-real-estate           0.1  0.000000
lifestylehomeandgarden        0.1  0.000000
tvnews                        0.0  0.222222
newspolitics                  0.0  0.222222
baseball_mlb                  0.0  0.111111
football_ncaa                 0.0  0.111111
lifestylebuzz                 0.0  0.111111
movienews                     0.0  0.111111
-0.6791130292870468 0.007563221953400732


Sentiment correlation

In [57]:
def sentiment_correlation(user, n_news):
    user_output = user_news_ids(user)
    user_news_df = news[news['ID'].isin(user_output)]
    categ_counts_user = user_news_df['Sentiment'].value_counts()
    total_counts = user_news_df['Sentiment'].value_counts().sum()
    user_ratio = categ_counts_user/total_counts
    
    list_of_recommebdations_t1 = gpt_task1(user, n_news)
    user_recom_df = news[news['ID'].isin(list_of_recommebdations_t1)]
    categ_counts_recom = user_recom_df['Sentiment'].value_counts()
    ratio_recom = categ_counts_recom/10
    
    merged_df = pd.concat([ratio_recom, user_ratio], axis=1, sort=False).fillna(0)
    merged_df.columns = ['recommend', 'profiles']
    
    try:
        correlation, p_value = pearsonr(merged_df['recommend'], merged_df['profiles'])
    except ValueError as e:
         print("An error occurred:", e)
         print("Skipping correlation calculation due to insufficient data.")
    
    print(merged_df)
    print(correlation, p_value )
    

In [59]:
sentiment_correlation(user_input, n)

The lists do not intercross (have no common elements).
           recommend  profiles
Sentiment                     
Positive         0.7  0.888889
Negative         0.1  0.111111
1.0 1.0
