---Sentiment analysis---
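The script conducts sentiment analysis of news articles in the MIND dataset. It takes the PerSenT dataset for training, where articles are marked as positive, negative, or neutral, and applies this classification to the MIND articles. The classification is based on OpenAI's embeddings (text-embedding-ada-002). Note that the cells shown here assign the sentiment labels with TextBlob's polarity score; no PerSenT training or embedding-based classifier appears in them.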


In [23]:
# imports
import pandas as pd
from textblob import TextBlob
import matplotlib.pyplot as plt
import os

In [24]:
# Load the news articles together with their precomputed content embeddings
news = pd.read_csv("embeddings/news_emb_TEST.csv") # news articles (not user interactions)
news.head()

Unnamed: 0,ID,Category,SubCategory,Content,Content_emb
0,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","[-0.020967688411474228, -0.020634232088923454,..."
1,N42782,sports,baseball_mlb,Three takeaways from Yankees' ALCS Game 5 vict...,"[-0.008319429121911526, 0.0013667173916473985,..."
2,N34694,tv,tvnews,Rosie O'Donnell: Barbara Walters Isn't 'Up to ...,"[-0.042908210307359695, -0.018993420526385307,..."
3,N45794,news,newscrime,Four flight attendants were arrested in Miami'...,"[-0.007930373772978783, -0.0221870094537735, 0..."
4,N18445,sports,football_ncaa,Michigan sends breakup tweet to Notre Dame as ...,"[-0.020153459161520004, -0.044797133654356, 0...."


In [6]:
# Function to perform sentiment analysis using TextBlob
def analyze_sentiment(text):
    """Classify a text as 'Positive', 'Negative' or 'Neutral' from its TextBlob polarity.

    Parameters
    ----------
    text : str
        Text to classify. Non-string values (e.g. ``None`` or the ``NaN`` that
        pandas produces for empty CSV cells) are labelled 'Neutral' instead of
        being handed to TextBlob, which only accepts strings.

    Returns
    -------
    str
        'Positive' if the polarity is > 0, 'Negative' if it is < 0,
        'Neutral' otherwise.
    """
    # Guard first: TextBlob rejects anything that is not a string.
    if not isinstance(text, str):
        return 'Neutral'

    # Compute the polarity once and branch on its sign.
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'Positive'
    if polarity < 0:
        return 'Negative'
    return 'Neutral'

# Label every article by running the classifier over its text content
news = news.assign(Sentiment=news['Content'].map(analyze_sentiment))

# Preview the frame, which now carries the 'Sentiment' column
news.head()


Unnamed: 0,ID,Category,SubCategory,Content,Content_emb,Sentiment
0,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...","[-0.020967688411474228, -0.020634232088923454,...",Positive
1,N42782,sports,baseball_mlb,Three takeaways from Yankees' ALCS Game 5 vict...,"[-0.008319429121911526, 0.0013667173916473985,...",Positive
2,N34694,tv,tvnews,Rosie O'Donnell: Barbara Walters Isn't 'Up to ...,"[-0.042908210307359695, -0.018993420526385307,...",Positive
3,N45794,news,newscrime,Four flight attendants were arrested in Miami'...,"[-0.007930373772978783, -0.0221870094537735, 0...",Positive
4,N18445,sports,football_ncaa,Michigan sends breakup tweet to Notre Dame as ...,"[-0.020153459161520004, -0.044797133654356, 0....",Positive


In [7]:
# Persist the labelled articles. With pandas' default index=True the row index
# is written as an extra first column of the CSV.
news.to_csv("sentiment_analysis/sentiment_TEST.csv")

Evaluation

In [25]:
# Reload the labelled articles, discard the leading column (presumably the row
# index written by to_csv) and the embedding column, which is not used below
news = (
    pd.read_csv("sentiment_analysis/sentiment_TEST.csv")
    .iloc[:, 1:]
    .drop(columns=['Content_emb'])
)
news.head()

Unnamed: 0,ID,Category,SubCategory,Content,Sentiment
0,N55189,tv,tvnews,"'Wheel Of Fortune' Guest Delivers Hilarious, O...",Positive
1,N42782,sports,baseball_mlb,Three takeaways from Yankees' ALCS Game 5 vict...,Positive
2,N34694,tv,tvnews,Rosie O'Donnell: Barbara Walters Isn't 'Up to ...,Positive
3,N45794,news,newscrime,Four flight attendants were arrested in Miami'...,Positive
4,N18445,sports,football_ncaa,Michigan sends breakup tweet to Notre Dame as ...,Positive


In [26]:
# Load the per-user lists of news IDs; the 'Content' column is not needed here
behaviors = pd.read_csv("embeddings/users_emb_TEST.csv").drop(columns=['Content'])
behaviors.head()

Unnamed: 0,User,ID
0,U13740,N55189 N42782 N34694 N45794 N18445 N63302 N104...
1,U91836,N31739 N6072 N63045 N23979 N35656 N43353 N8129...
2,U73700,N10732 N25792 N7563 N21087 N41087 N5445 N60384...


In [27]:
def analyze_sentiments(news_df, behaviors_df):
    """Count, for every row of ``behaviors_df``, the sentiments of the news IDs it lists.

    Parameters
    ----------
    news_df : pandas.DataFrame
        Must provide an 'ID' column (whitespace-separated news IDs, normally a
        single one) and a 'Sentiment' column holding 'Negative', 'Positive'
        or 'Neutral'. Rows with a missing or empty 'ID' are skipped.
    behaviors_df : pandas.DataFrame
        Must provide an 'ID' column with whitespace-separated news IDs.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``behaviors_df`` with integer columns 'Negative', 'Positive'
        and 'Neutral' (added in that order). IDs that do not occur in
        ``news_df`` are counted as 'Neutral'.

    Raises
    ------
    KeyError
        If a looked-up sentiment label is not one of the three expected values.
    """
    labels = ('Negative', 'Positive', 'Neutral')

    # Map each news ID to its sentiment label; later rows win on duplicates.
    id_sentiments = {}
    for id_cell, sentiment in zip(news_df['ID'], news_df['Sentiment']):
        if pd.notnull(id_cell) and id_cell:
            for news_id in id_cell.split():
                id_sentiments[news_id] = sentiment

    result_df = behaviors_df.copy()

    # Accumulate the counts by row *position*. Label-based scalar updates
    # cannot address a single row when the index is not unique, and
    # per-cell DataFrame writes are slow on large frames.
    tallies = {label: [0] * len(result_df) for label in labels}
    for position, id_cell in enumerate(result_df['ID']):
        if pd.notnull(id_cell) and id_cell:
            for behavior_id in id_cell.split():
                sentiment = id_sentiments.get(behavior_id, 'Neutral')  # unknown IDs count as 'Neutral'
                tallies[sentiment][position] += 1

    # Attach the finished counts in one assignment per column.
    for label in labels:
        result_df[label] = tallies[label]

    return result_df


In [28]:
# Count, per user, how many of the articles listed in 'ID' carry each sentiment label
result_df1 = analyze_sentiments(news, behaviors)
result_df1.head()
# Optional export of the counts, left disabled:
#result_df1.to_csv('sentiment_counted.csv', index=None)

Unnamed: 0,User,ID,Negative,Positive,Neutral
0,U13740,N55189 N42782 N34694 N45794 N18445 N63302 N104...,1,8,0
1,U91836,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,27,27,28
2,U73700,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,5,6,5


In [29]:
def analyze_sentiments_recommended(news_df, behaviors_df, n, recommendations_dir='collaborative_recommendations'):
    """Count the sentiments of the first ``n`` recommendations stored for each user.

    Parameters
    ----------
    news_df : pandas.DataFrame
        Must provide an 'ID' column (whitespace-separated news IDs, normally a
        single one) and a 'Sentiment' column holding 'Negative', 'Positive'
        or 'Neutral'. Rows with a missing or empty 'ID' are skipped.
    behaviors_df : pandas.DataFrame
        Must provide a 'User' column. The frame is not modified.
    n : int
        Number of leading rows taken from each user's recommendation file.
    recommendations_dir : str or os.PathLike, optional
        Directory holding one ``<User>_collab.csv`` file per user, each with
        an 'ID' column. Defaults to 'collaborative_recommendations'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``behaviors_df`` with integer columns 'Negative_recommended',
        'Positive_recommended' and 'Neutral_recommended' (added in that order).
        Recommended IDs that do not occur in ``news_df`` are counted as
        'Neutral'; users without a recommendation file keep zero counts.

    Raises
    ------
    KeyError
        If a looked-up sentiment label is not one of the three expected values.
    """
    labels = ('Negative', 'Positive', 'Neutral')

    # Map each news ID (taken from the 'ID' column) to its sentiment label.
    id_sentiments = {}
    for id_cell, sentiment in zip(news_df['ID'], news_df['Sentiment']):
        # Skip missing/empty IDs instead of failing on NaN.split().
        if pd.notnull(id_cell) and id_cell:
            for news_id in id_cell.split():
                id_sentiments[news_id] = sentiment

    result_df = behaviors_df.copy()

    # Accumulate the counts by row *position*, so a non-unique index on
    # behaviors_df cannot mix up the counts of different rows.
    tallies = {label: [0] * len(result_df) for label in labels}
    for position, user_id in enumerate(result_df['User']):
        recommendations_file = os.path.join(recommendations_dir, f'{user_id}_collab.csv')

        # Best effort: a user without a recommendation file keeps zero counts.
        if not os.path.exists(recommendations_file):
            continue

        top_n_recommendations = pd.read_csv(recommendations_file).head(n)
        for recommended_id in top_n_recommendations['ID']:
            sentiment = id_sentiments.get(recommended_id, 'Neutral')  # unknown IDs count as 'Neutral'
            tallies[sentiment][position] += 1

    # Attach the finished counts in one assignment per column.
    for label in labels:
        result_df[f'{label}_recommended'] = tallies[label]

    return result_df



In [30]:
# Number of leading rows taken from each user's recommendation file (currently 10)
n_recommendations = 10

# Add the *_recommended count columns next to the per-user history counts
result_df = analyze_sentiments_recommended(news, result_df1, n_recommendations)
result_df.head()

Unnamed: 0,User,ID,Negative,Positive,Neutral,Negative_recommended,Positive_recommended,Neutral_recommended
0,U13740,N55189 N42782 N34694 N45794 N18445 N63302 N104...,1,8,0,3,5,2
1,U91836,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,27,27,28,2,5,3
2,U73700,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,5,6,5,4,6,0


In [31]:
# Pairwise correlations between the history counts and the recommendation counts
history_columns = ['Negative', 'Positive', 'Neutral']
recommended_columns = [f'{label}_recommended' for label in history_columns]
correlation_matrix = result_df[history_columns + recommended_columns].corr()

# Show the full matrix
print("Correlation Matrix:")
print(correlation_matrix)

# Pull out the like-for-like pairs (history label vs. the same label among recommendations)
(
    correlation_negative_negative_recommended,
    correlation_positive_positive_recommended,
    correlation_neutral_neutral_recommended,
) = (
    correlation_matrix.at[history, recommended]
    for history, recommended in zip(history_columns, recommended_columns)
)

print(f"\nCorrelation (Negative - Negative_recommended): {correlation_negative_negative_recommended}")
print(f"Correlation (Positive - Positive_recommended): {correlation_positive_positive_recommended}")
print(f"Correlation (Neutral - Neutral_recommended): {correlation_neutral_neutral_recommended}")


Correlation Matrix:
                      Negative  Positive   Neutral  Negative_recommended  \
Negative              1.000000  0.973727  0.999691             -0.785714   
Positive              0.973727  1.000000  0.967766             -0.905936   
Neutral               0.999691  0.967766  1.000000             -0.770097   
Negative_recommended -0.785714 -0.905936 -0.770097              1.000000   
Positive_recommended -0.371154 -0.572856 -0.347960              0.866025   
Neutral_recommended   0.654654  0.809593  0.635663             -0.981981   

                      Positive_recommended  Neutral_recommended  
Negative                         -0.371154             0.654654  
Positive                         -0.572856             0.809593  
Neutral                          -0.347960             0.635663  
Negative_recommended              0.866025            -0.981981  
Positive_recommended              1.000000            -0.944911  
Neutral_recommended              -0.944911         