## Data import and preprocessing

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

import warnings

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# NOTE(review): the blanket 'ignore' filter above already covers all categories,
# so the two category-specific filters below add nothing while it is in place.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

import ast  # local to this cell: only needed to parse the serialized dict columns

# Load the raw responses and repair a misspelled topic label so that both
# spellings are counted as the same topic.
all_data = pd.read_csv("../data/Data_final.csv")
all_data.loc[all_data["Topic"] == "health and human sevices", "Topic"] = "health and human services"

# The questionnaire columns are parsed from text into dicts (they are expanded
# via `.tolist()` below). ast.literal_eval only accepts Python literals, so a
# malformed or malicious CSV cell cannot execute code the way eval() could.
all_data['big5'] = all_data['big5'].apply(ast.literal_eval)
all_data['pvq21'] = all_data['pvq21'].apply(ast.literal_eval)
all_data['mfq'] = all_data['mfq'].apply(ast.literal_eval)

# Expand each dict column into one column per key, prefixed with the
# questionnaire name (e.g. 'big5_<key>').
big5_df = pd.DataFrame(all_data['big5'].tolist()).add_prefix('big5_')
pvq21_df = pd.DataFrame(all_data['pvq21'].tolist()).add_prefix('pvq21_')
mfq30_df = pd.DataFrame(all_data['mfq'].tolist()).add_prefix('mfq30_')

# Put the three expanded questionnaires side by side ...
result_df = pd.concat([big5_df, pvq21_df, mfq30_df], axis=1)

# ... and append them to the original table (row order is unchanged, so the
# default positional index lines up).
all_data = pd.concat([all_data, result_df], axis=1)


# Types of Implementations

### 1. Overall Data Visualization
   - General score statistics
   - Message types by topic
   - Gender ratio and age distribution
   - Average scores for each value system
   ...

### 2. Score Comparison Based on Messages:
   - Top 5 and bottom 5 messages by average score 
   
### 3. Score Distribution for 20 Main Topics:
   - Plot the score distribution for each main topic, then compare the topics with high and low scores
   - Average topic scores, ranked from highest to lowest

### 4. Persuasiveness for People with or without Habits:
   - Analyze if messages are more persuasive to people with established habits or not

### 5. Positive vs. Negative Messages Effectiveness:
   - Overall scores comparison
   - For people without habits, determine whether positive or negative messages are more effective

### 6. Effective Strategy and Topic Selection Based on Values
---

## General score statistics

Overall, the score distribution is roughly bell-shaped, but there is a pronounced spike at 0: the proportion of 0 scores is high. This suggests that a standard linear model may fit the scores poorly.

Upon calculating the quartiles, the 1st quartile was 2, the median was 5, and the 3rd quartile was 7.

# How many responses received each score value, ordered by score.
score_counts = all_data['Score'].value_counts().sort_index()

# Bar chart of the score frequencies (one bar per score, ticks at 0..10).
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(score_counts.index, score_counts.values, color='#FF8C00')
ax.set_xlabel('Scores')
ax.set_ylabel('Count')
ax.set_title('Distribution of Scores')
ax.set_xticks(range(11))
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# 25th, 50th and 75th percentiles of the score column.
quartile_points = [0.25, 0.5, 0.75]
quartiles = all_data['Score'].quantile(quartile_points)
print(quartiles)

## Ratio of gender

The ratio of men to women was almost 1:1, showing a similar distribution.

In [None]:
# Number of rows per gender value, ordered by the gender value itself.
gender_counts = all_data['Gender'].value_counts().sort_index()

# Two grey shades, one per wedge.
colors = ['#555555', '#999999']

# NOTE(review): the wedge labels are hard-coded, so they are only correct if
# the sorted 'Gender' values come out as male first, female second -- confirm
# against the encoding used in the CSV.
fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(gender_counts, labels=['Male', 'Female'], autopct='%1.1f%%', colors=colors)
ax.set_title('Distribution of Gender')
plt.show()

## Ratio of age

There was a slight shortage of people in their 20s, but the overall proportions were mostly similar.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


# Convert ages to decade groups using left-closed bins: [20, 30), [30, 40),
# [40, 50), [50, 60). The previous right-closed edges [20, 29, 39, 49, 59]
# made the first interval (20, 29], which silently dropped respondents aged
# exactly 20 (they received NaN and vanished from the counts).
# Ages below 20 or at/above 60 still fall outside every bin, as before.
bins = [20, 30, 40, 50, 60]
labels = ['20', '30', '40', '50']
all_data['Age Group'] = pd.cut(all_data['Age'], bins=bins, labels=labels, right=False)

# Count the occurrences of each age group, in decade order
age_group_counts = all_data['Age Group'].value_counts().sort_index()

# Pie chart of the age-group shares, one colour per decade
plt.figure(figsize=(8, 8))
colors = ['#cc6666', '#3366cc', '#66cc66', '#cc9966']  # Darker color palette
plt.pie(age_group_counts, labels=age_group_counts.index, autopct='%1.1f%%', colors=colors)
plt.title('Distribution of Age Groups')
plt.show()

## Value Distribution

### Big Five
The scores for **Neuroticism** and **Openness** were relatively high, while the scores for **Extraversion** were relatively low.

### PVQ21
**Hedonism** and **Security** had higher score distributions, whereas **Stimulation** had a lower score distribution and the lowest average score (the only one with an average below 4).

### MFQ30
**Harm/Care** had a relatively high number of higher scores, with the highest average. On the other hand, **Ingroup/Loyalty**, **Authority/Respect**, and **Purity/Sanctity** showed similar distributions and relatively lower scores.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Column names of the three questionnaires, one list per questionnaire.
big_5 = [
    "big5_Extraversion",
    "big5_Agreeableness",
    "big5_Conscientiousness",
    'big5_Neuroticism',
    'big5_Openness',
]

pvq_21 = [
    'pvq21_Conformity',
    'pvq21_Tradition',
    'pvq21_Benevolence',
    'pvq21_Universalism',
    'pvq21_Self-Direction',
    'pvq21_Stimulation',
    'pvq21_Hedonism',
    'pvq21_Achievement',
    'pvq21_Power',
    'pvq21_Security',
]

mfq_30 = [
    'mfq30_Harm/Care',
    'mfq30_Fairness/Reciprocity',
    'mfq30_In-group/Loyalty',
    'mfq30_Authority/Respect',
    'mfq30_Purity/Sanctity',
]

# All value dimensions, in questionnaire order (Big 5, PVQ-21, MFQ-30).
value_dimensions = big_5 + pvq_21 + mfq_30

# Use one large font size for every text element of the figures that follow.
plt.rcParams.update({'font.size': 20, 'axes.titlesize': 20, 'axes.labelsize': 20, 'xtick.labelsize': 20, 'ytick.labelsize': 20})

# One sub-frame per questionnaire.
target_big_5 = all_data[big_5]
target_pvq_21 = all_data[pvq_21]
target_mfq_30 = all_data[mfq_30]

# Three side-by-side panels, each holding one box per value dimension.
fig, axes = plt.subplots(1, 3, figsize=(25, 10))
panels = (
    (target_big_5, 'Distribution of Scores for Big 5'),
    (target_pvq_21, 'Distribution of Scores for PVQ-21'),
    (target_mfq_30, 'Distribution of Scores for MFQ-30'),
)
for panel_ax, (panel_frame, panel_title) in zip(axes, panels):
    sns.boxplot(data=panel_frame, palette="pastel", ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Values')
    panel_ax.set_ylabel('Scores')
    # Column names are long, so print them vertically.
    panel_ax.tick_params(axis='x', rotation=90)

plt.tight_layout()
plt.show()

## Score Comparison Based on Messages:

Looking at the entire dataset, the top 5 messages with the highest scores are:
1. **reduce_your_speed_when_there_are_many_pedestrians** (8.187500)
2. **Go_indoors_during_lightning_storms** (7.795000)
3. **Clean_up_trash_after_fishing** (7.745370)
4. **Do_not_get_into_a_stranger_s_car** (7.647959)
5. **Do_not_cut_trees_carelessly** (7.430556)

- **Safety**: Messages that promote personal safety, such as slowing down for pedestrians, going indoors during lightning storms, or avoiding getting into a stranger's car.
- **Environmental Protection**: Messages encouraging environmental preservation, such as cleaning up trash after fishing or avoiding careless tree cutting.
- **Social Responsibility**: Messages that consider the impact of personal actions on others and the environment.

These messages tend to receive high scores because they raise awareness of situations people frequently encounter in daily life and emphasize social and environmental responsibility. People are more likely to relate to and recognize the importance of these topics. Additionally, all of these messages involve simple actions that can be implemented immediately.

For the entire dataset, the bottom 5 messages with the lowest scores are:
1. **Do_squats** (1.697674)
2. **Read_books_on_defense** (1.829268)
3. **Look_up_videos_of_military_dog_training** (1.870968)
4. **Learn_the_differences_between_the_Army_Navy_and_Air_Force** (1.991379)
5. **Avoid_using_earphones_while_walking** (2.000000)

- **Personal Interests**: Messages such as doing squats, reading books on defense, or watching military dog training videos are focused on specific personal hobbies or niche interests.
- **Relative Importance**: Messages like learning the differences between military branches or avoiding earphones while walking may be perceived as less important in everyday life.
- **Specific Context**: These messages focus on particular situations or activities, making them harder for the general public to relate to.
- **Difficulty in Implementation**: The requirement to adopt new behaviors or learn new information may be seen as demanding.
- **Lack of Urgency**: The perceived necessity or urgency of these actions is low.
- **Lack of Relevance**: They may be viewed as having little relevance to general day-to-day life.

Unlike messages related to public safety or environmental protection, these tend to focus on individual interests or actions, making it more difficult for people to resonate with them. Furthermore, since they don't address issues people commonly face in daily life, their importance may be rated lower.

### Conclusion:
Messages that received high scores are typically related to public safety and environmental protection, values that people consider important. In contrast, messages that received low scores tend to address specific personal interests or topics perceived as less significant. This suggests that people place greater importance on messages connected to the public good, while topics tied to individual preferences or niche areas are seen as less compelling.

Messages that provide clear, actionable guidance tend to score higher, as they offer immediate and straightforward instructions. On the other hand, messages requiring additional time, effort, or the development of new habits appear to be less persuasive, likely due to the extra effort involved.

In [None]:
# Separate the rows by image source (the "Method" column: 'dalle' vs 'google').
df_value_dalle = all_data[all_data["Method"] == "dalle"]
df_value_google = all_data[all_data["Method"] == "google"]

# Mean persuasiveness score per message (grouped by 'Message' only).
grouped_df_Message = all_data.groupby(['Message']).agg({
    'Score': 'mean',
}).reset_index()

# Top 5 messages. A notebook cell only renders its final expression, so this
# table used to be computed and thrown away; print it explicitly.
top5_messages = grouped_df_Message.sort_values(by="Score", ascending=False)[:5]
print("Top 5 messages by mean score:")
print(top5_messages)

# Bottom 5 messages (rendered as the cell's final expression).
grouped_df_Message.sort_values(by="Score", ascending=True)[:5]

## Score Distribution and Comparison Across 20 Main Topics

### High Scores:

- **Transportation**: Traffic safety topic: Reduce your speed at crosswalks.
- **Interior**: Environmental protection: Do not break tree branches.
- **Homeland Security**: Diplomatic safety issue: Do not cross borders.
- **Careful**: Safety issue: Wear insulated gloves when handling electricity.
- **Agriculture**: Agricultural production: Eat organic foods.

### Low Scores:

- **Defense**: National defense topic: Read books on defense, stay updated on the latest military technology trends.
- **Treasury**: Saving money topic: Buy clothes during sales, keep a budget, find cheap restaurants.
- **Cyber Etiquette**: Cyber etiquette topic: Don’t post hate comments, compliment others online.
- **Exercise**: Fitness-related topic: Do boxing, do squats, go running every morning.
- **Advertising**: Advertising topic: Use an iPhone instead of a Galaxy, wear a leather jacket to look cool.

### Conclusion:

Messages that received high scores tend to be actions that are easy to implement and immediately convey a sense of importance. On the other hand, messages with low scores often require more time, complexity, or involve activities where the urgency or importance is less apparent. Additionally, messages related to personal preferences or those requiring long-term effort tend to receive lower scores.

In [None]:
# Mean persuasiveness score per topic (grouped by 'Topic' only).
grouped_df_Topic = all_data.groupby(['Topic']).agg({
    'Score': 'mean',
}).reset_index()

# Top 5 topics. A notebook cell only renders its final expression, so this
# table used to be computed and thrown away; print it explicitly.
top5_topics = grouped_df_Topic.sort_values(by="Score", ascending=False)[:5]
print("Top 5 topics by mean score:")
print(top5_topics)

# Bottom 5 topics (rendered as the cell's final expression).
grouped_df_Topic.sort_values(by="Score", ascending=True)[:5]

In [None]:
# Full topic ranking, highest mean score first.
grouped_df_Topic.sort_values("Score", ascending=False)

## Average Scores by Strategy

In [None]:
# Mean score for every (strategy, positive/negative framing) combination,
# shown from the highest to the lowest mean.
grouped_df_Strategy = (
    all_data
    .groupby(['Strategy', "Pos_Neg"])['Score']
    .mean()
    .reset_index()
)
grouped_df_Strategy.sort_values("Score", ascending=False)


## 4. Social Strategies vs Physical Strategies
To assess the effectiveness of social strategies, we compared the score distributions of social and physical strategies.

Since there appeared to be little difference, we conducted a statistical test to examine whether there is a significant distribution difference between the two strategies.

**K-S Test Results**: The K-S test results showed a statistically significant difference between the two distributions, but the practical difference is not substantial. This suggests that even small differences can appear statistically significant due to the large sample size. While there is a statistically significant difference, the actual difference is minimal.

In [None]:
import pandas as pd

# Numeric columns that are averaged within each group.
average_columns = [
    'Score', 'big5_Extraversion', 'big5_Agreeableness',
    'big5_Conscientiousness', 'big5_Neuroticism', 'big5_Openness',
    'pvq21_Conformity', 'pvq21_Tradition', 'pvq21_Benevolence',
    'pvq21_Universalism', 'pvq21_Self-Direction', 'pvq21_Stimulation',
    'pvq21_Hedonism', 'pvq21_Achievement', 'pvq21_Power', 'pvq21_Security',
    'mfq30_Harm/Care', 'mfq30_Fairness/Reciprocity', 'mfq30_In-group/Loyalty',
    'mfq30_Authority/Respect', 'mfq30_Purity/Sanctity'
]

# Columns that identify one group.
image_group_keys = ['Batch', 'Gubun', 'Image_Num']

# Aggregation spec: 'mean' for the numeric columns above, and the group's
# first value for every remaining non-key column. Insertion order is kept
# (averaged columns first, then the rest in table order) because it
# determines the column order of the result.
agg_dict = dict.fromkeys(average_columns, 'mean')
for column_name in all_data.columns:
    if column_name not in average_columns + image_group_keys:
        agg_dict[column_name] = 'first'

# One row per (Batch, Gubun, Image_Num) combination.
aggregated_data = all_data.groupby(image_group_keys).agg(agg_dict).reset_index()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Works directly on aggregated_data (no copy), so the new column is added to it.
data = aggregated_data

# Tag each row as social / non-social according to its strategy name.
social_strategy_names = ['xAttr', 'xReact', 'Argument from Popular Opinion', 'oReact']
data['Strategy_type'] = data['Strategy'].isin(social_strategy_names).map(
    {True: 'Social Strategy', False: 'Non-Social Strategy'}
)

scores_Social = data.loc[data['Strategy_type'] == 'Social Strategy', 'Score']
scores_Non_Social = data.loc[data['Strategy_type'] == 'Non-Social Strategy', 'Score']

# Overlaid density histograms (with KDE curves) of the two score samples.
plt.figure(figsize=(12, 8))
for sample, colour, legend_label in (
    (scores_Social, 'blue', 'Social'),
    (scores_Non_Social, 'orange', 'Non-Social Strategy'),
):
    sns.histplot(sample, bins=10, kde=True, stat='density', color=colour, label=legend_label)
plt.title('Score Distribution by Social/Non-Social')
plt.xlabel('Score')
plt.ylabel('density')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import pandas as pd
from scipy import stats


# Works directly on all_data (no copy), so the new column is added to it.
data = all_data

# Tag each row as social / non-social according to its strategy name.
social_strategy_names = ['xAttr', 'xReact', 'Argument from Popular Opinion', 'oReact']
data['Strategy_type'] = data['Strategy'].isin(social_strategy_names).map(
    {True: 'Social Strategy', False: 'Non-Social Strategy'}
)

# Score samples of the two strategy types.
social_scores = data.loc[data['Strategy_type'] == 'Social Strategy', 'Score']
non_social_scores = data.loc[data['Strategy_type'] == 'Non-Social Strategy', 'Score']

# Two-sample Kolmogorov-Smirnov test on the two score samples.
ks_stat, p_value = stats.ks_2samp(social_scores, non_social_scores)

print(f"K-S Statistic: {ks_stat}")
print(f"P-Value: {p_value}")

# Decision at the 5% significance level.
alpha = 0.05
verdict = (
    "Fail to reject the null hypothesis: There is no difference between the two distributions."
    if p_value > alpha
    else "Reject the null hypothesis: There is a difference between the two distributions."
)
print(verdict)
    


## With Habits vs Without Habits

In [None]:
# Select only the messages that contain all habits, and visualize their average scores in a graph.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_habit_scores(df: pd.DataFrame):
    """Plot per-message mean scores for 'Yes' vs 'No' values of Habit_convert.

    Only messages that have rows for more than one distinct 'Habit_convert'
    value are considered. For those messages the mean 'Score' is computed
    separately for the 'Yes' rows and the 'No' rows, and the two sets of
    per-message means are drawn as overlaid density histograms.

    Args:
        df: Table with 'Message', 'Habit_convert' and 'Score' columns.

    Returns:
        None. The figure is shown with plt.show().
    """
    # Find messages that contain more than one Habit_convert value
    df_grouped = df.groupby('Message')['Habit_convert'].nunique().reset_index()
    df_grouped = df_grouped[df_grouped['Habit_convert'] > 1]

    # Filter only those messages
    df_filtered = df[df['Message'].isin(df_grouped['Message'])]

    # Calculate the average score for 'Yes' and 'No' for each message
    df_yes = df_filtered[df_filtered['Habit_convert'] == 'Yes']
    df_no = df_filtered[df_filtered['Habit_convert'] == 'No']

    avg_yes = df_yes.groupby('Message')['Score'].mean().reset_index().rename(columns={'Score': 'Avg_Yes_Score'})
    avg_no = df_no.groupby('Message')['Score'].mean().reset_index().rename(columns={'Score': 'Avg_No_Score'})

    # Inner merge: keep only messages that have both a 'Yes' and a 'No' mean
    df_avg_scores = pd.merge(avg_yes, avg_no, on='Message')

    # Visualize the two distributions of per-message means
    plt.figure(figsize=(14, 7))
    sns.histplot(df_avg_scores['Avg_Yes_Score'], color='blue', stat='density', label='Yes', kde=True, bins=15)
    sns.histplot(df_avg_scores['Avg_No_Score'], color='orange', stat='density', label='No', kde=True, bins=15)

    plt.xlabel('Average Score', fontsize=15)
    # The histograms are drawn with stat='density', so the axis shows a
    # density, not a probability; label it accordingly.
    plt.ylabel('Density', fontsize=20)
    plt.legend(title='Habit', fontsize=15, title_fontsize=13)
    plt.grid(True)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.tight_layout()

    plt.show()

analyze_habit_scores(all_data)

## Positive Strategy vs Negative Strategy

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Work on a copy so the float cast does not touch all_data.
data = all_data.copy()
data['Score'] = data['Score'].astype(float)

# Rows for positively and negatively framed strategies.
df_pos = data.loc[data['Pos_Neg'] == 'pos']
df_neg = data.loc[data['Pos_Neg'] == 'neg']

# Mean score per message within each framing.
avg_pos = df_pos.groupby('Message')['Score'].mean().reset_index()
avg_pos = avg_pos.rename(columns={'Score': 'pos_Score'})
avg_neg = df_neg.groupby('Message')['Score'].mean().reset_index()
avg_neg = avg_neg.rename(columns={'Score': 'neg_Score'})

# Keep only the messages that appear under both framings.
df_avg_scores = pd.merge(avg_pos, avg_neg, on='Message')

# Overlaid density histograms of the per-message means.
plt.figure(figsize=(18, 12))
for column, colour, legend_label in (
    ('pos_Score', 'blue', 'Positive Strategy'),
    ('neg_Score', 'orange', 'Negative Strategy'),
):
    sns.histplot(df_avg_scores[column], color=colour, bins=15, kde=True, stat='density', label=legend_label)
plt.xlabel('Average Score')
plt.ylabel('Density')
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(title='Strategy type', fontsize=15, title_fontsize=15)
plt.grid(True)
plt.show()

##  Which Strategy, Positive or Negative, Influences the Attitude of Inactive Individuals?

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_strategy_effect_on_non_habitual(df: pd.DataFrame):
    """Compare positive vs negative strategies on rows with Habit_convert == 'No'.

    For the 'No' rows, the mean 'Score' per message is computed separately for
    'pos' and 'neg' values of 'Pos_Neg'. Messages present under both framings
    are kept, and the two sets of means are drawn as overlaid density
    histograms.

    Args:
        df: Table with 'Habit_convert', 'Pos_Neg', 'Message' and 'Score' columns.

    Returns:
        None. The figure is shown with plt.show().
    """
    non_habit_rows = df.loc[df['Habit_convert'] == 'No']

    def mean_score_per_message(framing, out_column):
        # Mean score per message for one framing, stored under `out_column`.
        framed = non_habit_rows[non_habit_rows['Pos_Neg'] == framing]
        means = framed.groupby('Message')['Score'].mean().reset_index()
        return means.rename(columns={'Score': out_column})

    # Inner join: only messages that have both a positive and a negative mean.
    per_message = pd.merge(
        mean_score_per_message('pos', 'pos_Score'),
        mean_score_per_message('neg', 'neg_Score'),
        on='Message',
        how='inner',
    )

    plt.figure(figsize=(14, 7))
    for column, colour, legend_label in (
        ('pos_Score', 'blue', 'Positive Strategy'),
        ('neg_Score', 'orange', 'Negative Strategy'),
    ):
        sns.histplot(per_message[column], color=colour, stat='density', label=legend_label, kde=True, bins=15)

    plt.xlabel('Average Score', fontsize=20)
    plt.ylabel('Density', fontsize=20)
    plt.legend(title='Strategy Type', fontsize=15, title_fontsize=15)
    plt.grid(True)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.tight_layout()

    plt.show()

analyze_strategy_effect_on_non_habitual(all_data)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_strategy_effect_on_habitual(df: pd.DataFrame):
    """Compare positive vs negative strategies on rows with Habit_convert == 'Yes'.

    For the 'Yes' rows, the mean 'Score' per message is computed separately for
    'pos' and 'neg' values of 'Pos_Neg'. Messages present under both framings
    are kept, and the two sets of means are drawn as overlaid density
    histograms.

    Args:
        df: Table with 'Habit_convert', 'Pos_Neg', 'Message' and 'Score' columns.

    Returns:
        None. The figure is shown with plt.show().
    """
    habit_rows = df.loc[df['Habit_convert'] == 'Yes']

    def mean_score_per_message(framing, out_column):
        # Mean score per message for one framing, stored under `out_column`.
        framed = habit_rows[habit_rows['Pos_Neg'] == framing]
        means = framed.groupby('Message')['Score'].mean().reset_index()
        return means.rename(columns={'Score': out_column})

    # Inner join: only messages that have both a positive and a negative mean.
    per_message = pd.merge(
        mean_score_per_message('pos', 'pos_Score'),
        mean_score_per_message('neg', 'neg_Score'),
        on='Message',
        how='inner',
    )

    plt.figure(figsize=(14, 7))
    for column, colour, legend_label in (
        ('pos_Score', 'blue', 'Positive Strategy'),
        ('neg_Score', 'orange', 'Negative Strategy'),
    ):
        sns.histplot(per_message[column], color=colour, stat='density', label=legend_label, kde=True, bins=15)

    plt.xlabel('Average Score', fontsize=14)
    plt.ylabel('Density', fontsize=14)
    plt.legend(title='Strategy Type', fontsize=15)
    plt.grid(True)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.tight_layout()

    plt.show()

analyze_strategy_effect_on_habitual(all_data)

## Score Difference Between Google and DALL·E

We compared the score distributions of images sourced from Google with those of images generated by DALL·E. As the plot below shows, images collected through Google tended to receive lower persuasiveness scores, while images generated by DALL·E tended to receive higher ones.

In [None]:
# Works directly on aggregated_data (no copy); the float cast is written back to it.
data = aggregated_data
data['Score'] = data['Score'].astype(float)

# Score samples per image source.
scores_dalle = data.loc[data['Method'] == 'dalle', 'Score']
scores_google = data.loc[data['Method'] == 'google', 'Score']

# Overlaid density histograms (with KDE curves) of the two samples.
plt.figure(figsize=(14, 7))
for sample, colour, legend_label in (
    (scores_dalle, 'blue', 'dalle'),
    (scores_google, 'orange', 'google'),
):
    sns.histplot(sample, bins=20, kde=True, stat='density', color=colour, label=legend_label)
plt.xlabel('Score')
plt.ylabel('Density')
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(title="Method", title_fontsize=15, fontsize=15)
plt.grid(True)
plt.show()

## 6. Effective Strategy and Topic Selection Based on Values

In [None]:
df = all_data

# Only the score is averaged; every other non-key column keeps the first
# value found in its group.
average_columns = ['Score']

# --- One row per (Batch, NO, Strategy) ---
strategy_keys = ["Batch", "NO", "Strategy"]
agg_dict = dict.fromkeys(average_columns, 'mean')
agg_dict.update(dict.fromkeys(
    [name for name in all_data.columns if name not in average_columns + strategy_keys],
    'first',
))
aggregated_data_strategy = df.groupby(strategy_keys).agg(agg_dict).reset_index()

# --- One row per (Batch, NO, Topic) ---
topic_keys = ["Batch", "NO", "Topic"]
agg_dict = dict.fromkeys(average_columns, 'mean')
agg_dict.update(dict.fromkeys(
    [name for name in all_data.columns if name not in average_columns + topic_keys],
    'first',
))
aggregated_data_topic = df.groupby(topic_keys).agg(agg_dict).reset_index()

### Effective Strategies Based on Specific Values

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def calculate_and_plot_correlations_spearman(df: pd.DataFrame, strategies: list, trait_set_name: str, trait_set: list):
    """Compute, per strategy/framing and trait, a quartile-based Spearman correlation.

    Each trait is split into groups at its 25th/50th/75th percentiles. For
    every strategy in `strategies` and each framing ('pos'/'neg'; 'Bandwagon'
    is evaluated for 'pos' only), every row's score is paired with the mean
    score of its trait group, and the Spearman correlation between the two is
    recorded.

    Side effects on `df` (kept for backward compatibility with later cells):
    the 'Strategy' column is renamed in place through the mapping below, and
    one '<trait>_Quartile' column is added per trait.

    Args:
        df: Table with 'Strategy', 'Pos_Neg', 'Score' and the trait columns.
        strategies: Display names of the strategies (after renaming) to evaluate.
        trait_set_name: Name of the trait family; currently unused.
        trait_set: Trait column names to evaluate.

    Returns:
        A list of dicts with keys 'Strategy' ('<strategy>_<pos|neg>', or plain
        'Bandwagon'), 'Personality_Trait' and 'Correlation'. Despite the
        function name, nothing is plotted here.
    """
    # Rename strategy names
    strategy_mapping = {
        'xAttr': 'Perceived Persona',
        'xReact': 'Internal Emotion',
        'oReact': 'External Emotion',
        'Argument from Consequence': 'Consequence',
        'Argument from Popular Opinion': 'Bandwagon'
    }
    df['Strategy'] = df['Strategy'].replace(strategy_mapping)

    # Divide the data into quartile groups
    for trait in trait_set:
        quartiles = df[trait].quantile([0.25, 0.5, 0.75]).values
        # pd.cut rejects repeated bin edges, which happens whenever two
        # quartiles coincide (e.g. a trait with few distinct values). Collapse
        # duplicates and derive the labels from the edges that remain.
        unique_quartiles = np.unique(quartiles)
        labels = [f'Q{i+1}' for i in range(len(unique_quartiles) + 1)]
        df[f'{trait}_Quartile'] = pd.cut(
            df[trait],
            bins=[-np.inf] + list(unique_quartiles) + [np.inf],
            labels=labels,
            include_lowest=True,
        )

    correlation_results = []
    for strategy in strategies:
        for pos_neg in ['pos', 'neg']:
            # Use only 'pos' for the 'Bandwagon' strategy
            if strategy == 'Bandwagon' and pos_neg == 'neg':
                continue

            df_filtered = df[(df["Strategy"] == strategy) & (df['Pos_Neg'] == pos_neg)]
            strategy_label = strategy if strategy == "Bandwagon" else f'{strategy}_{pos_neg}'

            for trait in trait_set:
                # Mean score of each row's quartile group, aligned row by row.
                # Kept as a separate Series rather than written into the
                # filtered slice, which avoids chained-assignment pitfalls.
                group_mean = df_filtered.groupby(f'{trait}_Quartile')['Score'].transform('mean')
                correlation = group_mean.corr(df_filtered['Score'], method='spearman')
                correlation_results.append({
                    'Strategy': strategy_label,
                    'Personality_Trait': trait,
                    'Correlation': correlation,
                })

    return correlation_results

def plot_combined_correlations(df_correlations, trait_set_name):
    """Draw a grouped bar chart and a heatmap of trait/strategy correlations.

    Args:
        df_correlations: DataFrame with 'Strategy', 'Personality_Trait' and
            'Correlation' columns, one row per (strategy, trait) pair.
        trait_set_name: Name of the trait family; currently unused.

    Returns:
        None. Both figures are shown with plt.show().
    """
    strategy_levels = df_correlations['Strategy'].unique()
    colors = sns.color_palette('tab20', len(strategy_levels) * 2)

    # Correlation bar chart: one bar group per trait, one bar per strategy
    plt.figure(figsize=(20, 10))
    sns.barplot(x='Personality_Trait', y='Correlation', hue='Strategy', data=df_correlations, palette=colors, ci=None)

    plt.xlabel('Personality Trait', fontsize=25)
    plt.ylabel('Correlation', fontsize=25)
    plt.grid(True)
    plt.xticks(rotation=90, fontsize=20)
    plt.yticks(fontsize=20)
    plt.tight_layout()

    # Adjust legend to be in one row, centered at the top
    plt.legend(title='Strategy', fontsize=12, loc='upper center', bbox_to_anchor=(0.5, 1.2), ncol=len(strategy_levels))
    plt.show()

    # Preferred y-axis (trait) order for the heatmap
    big_5 = ["big5_Extraversion",
             "big5_Agreeableness",
             "big5_Conscientiousness",
             'big5_Neuroticism',
             'big5_Openness']

    pvq_21 = ['pvq21_Conformity',
              'pvq21_Tradition',
              'pvq21_Benevolence',
              'pvq21_Universalism',
              'pvq21_Self-Direction',
              'pvq21_Stimulation',
              'pvq21_Hedonism',
              'pvq21_Achievement',
              'pvq21_Power',
              'pvq21_Security']

    mfq_30 = ['mfq30_Harm/Care',
              'mfq30_Fairness/Reciprocity',
              'mfq30_In-group/Loyalty',
              'mfq30_Authority/Respect',
              'mfq30_Purity/Sanctity']

    desired_y_order = big_5 + pvq_21 + mfq_30

    # Preferred x-axis (strategy) order for the heatmap
    desired_x_order = ['Perceived Persona_pos', 'Perceived Persona_neg',
                       'Internal Emotion_pos', 'Internal Emotion_neg',
                       'External Emotion_pos', 'External Emotion_neg',
                       'Consequence_pos', 'Consequence_neg',
                       'Bandwagon']

    # Correlation heatmap: traits as rows, strategies as columns
    heatmap_data = df_correlations.pivot(index="Personality_Trait", columns="Strategy", values="Correlation")

    # Apply the preferred orders, restricted to what is actually present.
    # Reindexing to the full 20-trait list would pad a single-family heatmap
    # with empty rows, and selecting a strategy column that is absent would
    # raise a KeyError.
    present_traits = [trait for trait in desired_y_order if trait in heatmap_data.index]
    present_strategies = [name for name in desired_x_order if name in heatmap_data.columns]
    heatmap_data = heatmap_data.reindex(index=present_traits)[present_strategies]

    plt.figure(figsize=(20, 10))
    sns.heatmap(heatmap_data, annot=True, cmap='coolwarm', center=0, linewidths=.5)
    plt.xlabel('Strategy', fontsize=20)
    plt.ylabel('Personality Trait', fontsize=20)
    plt.yticks(fontsize=15)
    plt.xticks(rotation=90, ha='right', fontsize=18)
    plt.tight_layout()
    plt.show()

# Strategy display names, i.e. the names after the renaming that
# calculate_and_plot_correlations_spearman applies to the 'Strategy' column.
strategies = ['Perceived Persona', 'Internal Emotion', 'External Emotion', 'Consequence', 'Bandwagon']

# Trait columns of the three questionnaires.
big_5 = [
    "big5_Extraversion",
    "big5_Agreeableness",
    "big5_Conscientiousness",
    'big5_Neuroticism',
    'big5_Openness',
]

pvq_21 = [
    'pvq21_Conformity',
    'pvq21_Tradition',
    'pvq21_Benevolence',
    'pvq21_Universalism',
    'pvq21_Self-Direction',
    'pvq21_Stimulation',
    'pvq21_Hedonism',
    'pvq21_Achievement',
    'pvq21_Power',
    'pvq21_Security',
]

mfq_30 = [
    'mfq30_Harm/Care',
    'mfq30_Fairness/Reciprocity',
    'mfq30_In-group/Loyalty',
    'mfq30_Authority/Respect',
    'mfq30_Purity/Sanctity',
]

# Correlation rows per questionnaire, computed in the order Big 5, PVQ 21, MFQ 30.
big_5_correlations, pvq_21_correlations, mfq_30_correlations = [
    calculate_and_plot_correlations_spearman(aggregated_data_strategy, strategies, family_name, family_traits)
    for family_name, family_traits in (('Big 5', big_5), ('PVQ 21', pvq_21), ('MFQ 30', mfq_30))
]

# One pair of figures per questionnaire ...
for family_name, family_rows in (
    ('Big 5', big_5_correlations),
    ('PVQ 21', pvq_21_correlations),
    ('MFQ 30', mfq_30_correlations),
):
    plot_combined_correlations(pd.DataFrame(family_rows), family_name)

# ... and one for all questionnaires together.
combined_correlations = pd.DataFrame(big_5_correlations + pvq_21_correlations + mfq_30_correlations)
plot_combined_correlations(combined_correlations, 'All Personality Traits')

### Effective Strategies Based on Specific Topics

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def calculate_and_plot_correlations_spearman(df: pd.DataFrame, topics: list, trait_set_name: str, trait_set: list):
    """Compute and plot, per topic and trait, a quartile-based Spearman correlation.

    Each trait is split into groups at its 25th/50th/75th percentiles
    (coinciding quartiles are collapsed). Within every topic, each row's score
    is paired with the mean score of its trait group and the Spearman
    correlation between the two is computed. The results are drawn as a
    grouped bar chart and as a heatmap (traits as rows, topics as columns).

    Side effect on `df`: one '<trait>_Quartile' column is added per trait.

    Args:
        df: Table with 'Topic', 'Score' and the trait columns.
        topics: Topic names to evaluate.
        trait_set_name: Name of the trait family; currently unused.
        trait_set: Trait column names to evaluate; also the row order of the heatmap.

    Returns:
        None. Both figures are shown with plt.show().
    """
    colors = sns.color_palette('tab20', len(topics))

    # Divide the data into quartile groups
    for trait in trait_set:
        quartiles = df[trait].quantile([0.25, 0.5, 0.75]).values

        # Remove duplicate values, keeping only unique bin edges
        unique_quartiles = np.unique(quartiles)

        # Use the unique edges to divide the data
        labels = [f'Q{i+1}' for i in range(len(unique_quartiles) + 1)]
        df[f'{trait}_Quartile'] = pd.cut(df[trait], bins=[-np.inf] + list(unique_quartiles) + [np.inf], labels=labels, include_lowest=True)

    # Calculate the correlations
    correlation_results = []
    for topic in topics:
        df_filtered = df[df["Topic"] == topic]

        for trait in trait_set:
            # Mean score of each row's quartile group, aligned row by row.
            # Kept as a separate Series rather than written into the filtered
            # slice, which avoids chained-assignment pitfalls.
            group_mean = df_filtered.groupby(f'{trait}_Quartile')['Score'].transform('mean')
            correlation = group_mean.corr(df_filtered['Score'], method='spearman')
            correlation_results.append({'Topic': topic, 'Personality_Trait': trait, 'Correlation': correlation})

    df_correlations = pd.DataFrame(correlation_results)

    # Correlation bar chart
    plt.figure(figsize=(22, 14))
    sns.barplot(x='Personality_Trait', y='Correlation', hue='Topic', data=df_correlations, palette=colors, ci=None)

    plt.xlabel('Personality Trait', fontsize=20)
    plt.ylabel('Correlation', fontsize=20)
    plt.grid(True)
    plt.xticks(rotation=90, fontsize=20)
    plt.yticks(fontsize=20)
    plt.tight_layout()

    # Adjust legend position to the top
    plt.legend(title='Topic', fontsize=12, loc='upper center', bbox_to_anchor=(0.5, 1.15), ncol=5)
    plt.show()

    # Correlation heatmap
    heatmap_data = df_correlations.pivot(index="Personality_Trait", columns="Topic", values="Correlation")

    # Order the rows as given in `trait_set`. Reindexing to the full list of
    # all questionnaires (read from module-level globals) padded single-family
    # heatmaps with empty rows and failed if those globals were not defined.
    heatmap_data = heatmap_data.reindex(trait_set)

    plt.figure(figsize=(22, 12))
    sns.heatmap(heatmap_data, annot=True, cmap='RdBu_r', center=0, linewidths=.5)
    plt.xlabel('Topic', fontsize=20)
    plt.ylabel('Personality Trait', fontsize=20)
    plt.yticks(fontsize=17)
    plt.xticks(rotation=90, ha='right', fontsize=20)
    plt.tight_layout()
    plt.show()

def calculate_and_plot_all_correlations(df: pd.DataFrame, topics: list):
    """Run the topic correlation analysis for each trait family and for all combined.

    Calls calculate_and_plot_correlations_spearman four times: for Big 5,
    PVQ 21, MFQ 30, and finally for the concatenation of all three.

    Args:
        df: Table passed through unchanged to the per-family analysis.
        topics: Topic names passed through unchanged.

    Returns:
        None.
    """
    # The trait lists are published as module-level globals because the
    # per-family analysis reads them when ordering its heatmap rows.
    global big_5, pvq_21, mfq_30
    big_5 = [
        "big5_Extraversion",
        "big5_Agreeableness",
        "big5_Conscientiousness",
        'big5_Neuroticism',
        'big5_Openness',
    ]

    pvq_21 = [
        'pvq21_Conformity',
        'pvq21_Tradition',
        'pvq21_Benevolence',
        'pvq21_Universalism',
        'pvq21_Self-Direction',
        'pvq21_Stimulation',
        'pvq21_Hedonism',
        'pvq21_Achievement',
        'pvq21_Power',
        'pvq21_Security',
    ]

    mfq_30 = [
        'mfq30_Harm/Care',
        'mfq30_Fairness/Reciprocity',
        'mfq30_In-group/Loyalty',
        'mfq30_Authority/Respect',
        'mfq30_Purity/Sanctity',
    ]

    # Each family on its own first, then every trait together.
    trait_families = (
        ('Big 5', big_5),
        ('PVQ 21', pvq_21),
        ('MFQ 30', mfq_30),
        ('All Personality Traits', big_5 + pvq_21 + mfq_30),
    )
    for family_name, family_traits in trait_families:
        calculate_and_plot_correlations_spearman(df, topics, family_name, family_traits)

# Topics to analyse, one per line.
topics = [
    'education',
    'privacy',
    'cyber etiquette',
    'commerce',
    'agriculture',
    'homeland security',
    'justice',
    'defense',
    'veterans affairs',
    'housing and urban development',
    'careful',
    'state',
    'treasury',
    'labor',
    'transportation',
    'energy',
    'interior',
    'health and human services',
    'exercise',
    'advertising',
]

# Bar charts and heatmaps for every trait family, computed on the
# per-(Batch, NO, Topic) aggregated table.
calculate_and_plot_all_correlations(df=aggregated_data_topic, topics=topics)