# Calculate the Response Percentage (RP) as in Table 2

In [None]:
import pandas as pd

# Load the dataset; replace 'balanced_dataset.csv' with the file you are working with.
df = pd.read_csv('balanced_dataset.csv')

# Number of rows attributed to each response region.
unique_counts = df['response_region'].value_counts()

# Express every region's tally as a percentage of all rows in the file.
total_rows = len(df['response_region'])
fraction_of_rows = unique_counts / total_rows
percentages = fraction_of_rows * 100

result_df = pd.DataFrame({'Response Percentage (RP)': percentages})
print(result_df)

# Calculate the Positive Mention Rate (PMR) as in Table 2

In [None]:
import pandas as pd

# Load the dataset; replace 'balanced_dataset.csv' with the file you are working with.
df = pd.read_csv('balanced_dataset.csv')

# For every response region, compute which percentage of that region's rows
# carries each polarity label. The output column is called 'PMR', but one row
# is produced per (response_region, polarity) pair, not only for 'positive'.
region_frames = []
for region in df['response_region'].unique():
    df_region = df[df['response_region'] == region]
    polarity_counts = df_region['polarity'].value_counts()
    if polarity_counts.empty:
        # Nothing to report for this region (e.g. a missing region value
        # matches no rows); skipping keeps empty frames out of the concat.
        continue
    percentages = (polarity_counts / len(df_region['polarity'])) * 100
    region_frames.append(pd.DataFrame({
        'response_region': region,
        'polarity': polarity_counts.index,
        'PMR': percentages.values,
    }))

# Concatenate once at the end instead of growing a DataFrame inside the loop.
if region_frames:
    final_result_df = pd.concat(region_frames, ignore_index=True)
else:
    # Keep the expected columns even when the dataset yields no rows.
    final_result_df = pd.DataFrame(columns=['response_region', 'polarity', 'PMR'])

final_result_df = final_result_df[['response_region', 'polarity', 'PMR']]
print(final_result_df)

# Calculate the RP value as in Table 7 and Figure 3

In [None]:
import pandas as pd

def process_dataset(file_path):
    """Summarise one results CSV per persona region.

    Reads ``file_path`` with ``pd.read_csv`` and, for every distinct value of
    the ``persona_region`` column, tallies the ``response_region`` and
    ``polarity`` values of the matching rows.

    Args:
        file_path: Anything ``pd.read_csv`` accepts; the data must contain
            ``persona_region``, ``response_region`` and ``polarity`` columns.

    Returns:
        A dict keyed by persona region. Each value is a dict with
        ``'response_region_summary'`` (DataFrame indexed by response region
        with the columns ``'count'`` and ``'Response Percentage (RP)'``) and
        ``'polarity_counts'`` (Series of polarity value counts).
    """
    frame = pd.read_csv(file_path)
    summaries = {}

    for persona in frame['persona_region'].unique():
        persona_rows = frame.loc[frame['persona_region'] == persona]

        region_tally = persona_rows['response_region'].value_counts()
        # The denominator is the sum of the tallied response-region counts.
        region_share = (region_tally / region_tally.sum()) * 100

        summaries[persona] = {
            'response_region_summary': pd.DataFrame({
                'count': region_tally,
                'Response Percentage (RP)': region_share,
            }),
            'polarity_counts': persona_rows['polarity'].value_counts(),
        }

    return summaries

def merge_results(results_list):
    """Pool per-file persona-region summaries into one summary per region.

    For every persona region, the response-region counts from all entries of
    ``results_list`` are added together and the Response Percentage (RP) is
    recomputed from the pooled counts. Files with more rows therefore weigh
    more; this is not a mean of the per-file percentages. Polarity counts are
    summed across entries as well.

    Args:
        results_list: Iterable of dicts shaped like the return value of
            ``process_dataset``: ``{persona_region: {'response_region_summary':
            DataFrame with a 'count' column, 'polarity_counts': Series}}``.

    Returns:
        A dict keyed by persona region (in order of first appearance). Each
        value holds ``'response_region_summary'`` (DataFrame with the pooled
        ``'count'`` and the recomputed ``'Response Percentage (RP)'``, indexed
        by response region) and ``'polarity_counts'`` (summed Series).
    """
    count_parts = {}      # persona_region -> list of per-file count Series
    polarity_totals = {}  # persona_region -> running polarity sum

    for result in results_list:
        for persona_region, data in result.items():
            # Only the raw counts are pooled; per-file percentages cannot be
            # summed meaningfully and are recomputed from the totals below.
            count_parts.setdefault(persona_region, []).append(
                data['response_region_summary']['count']
            )

            running = polarity_totals.get(persona_region)
            if running is None:
                running = pd.Series(dtype='int')
            # fill_value=0 keeps labels that are missing from one side.
            polarity_totals[persona_region] = running.add(
                data['polarity_counts'], fill_value=0
            )

    merged = {}
    for persona_region, parts in count_parts.items():
        # One concat per persona region, then add up rows sharing a label.
        pooled_counts = pd.concat(parts).groupby(level=0).sum()
        total_count = pooled_counts.sum()
        response_region_summary = pd.DataFrame({
            'count': pooled_counts,
            'Response Percentage (RP)': (pooled_counts / total_count) * 100,
        })
        merged[persona_region] = {
            'response_region_summary': response_region_summary,
            'polarity_counts': polarity_totals[persona_region],
        }

    return merged

# One balanced dataset per model; results are pooled across all of them.
file_paths = [
    'balanced_dataset_llama.csv',
    'balanced_dataset_gpt4.csv',
    'balanced_dataset_gemma.csv',
    'balanced_dataset_mistral.csv'
]

# Summarise every file on its own, then merge the per-file summaries.
all_results = list(map(process_dataset, file_paths))
average_results = merge_results(all_results)

# Report the merged response-region table for each persona region.
for persona_region, merged_entry in average_results.items():
    print(f"Persona Region: {persona_region}")
    print("Response Region Counts and Percentages:")
    print(merged_entry['response_region_summary'])
    print()


# Calculate the PMR as in Table 7 and Figure 3

In [None]:
import pandas as pd

def process_dataset(file_path):
    """Compute polarity shares per (persona region, response region) pair.

    Reads ``file_path`` with ``pd.read_csv``. For every persona region and,
    within it, every response region, the ``polarity`` values of the matching
    rows are counted and the percentages of the labels ``'positive'`` and
    ``'negative'`` are derived from those counts.

    Args:
        file_path: Anything ``pd.read_csv`` accepts; the data must contain
            ``persona_region``, ``response_region`` and ``polarity`` columns.

    Returns:
        A nested dict ``{persona_region: {response_region: stats}}`` where
        ``stats`` has the keys ``'polarity_counts'`` (Series of value counts),
        ``'positive_percentage'``, ``'negative_percentage'`` and
        ``'net_positive'`` (positive minus negative percentage).
    """
    frame = pd.read_csv(file_path)
    per_persona = {}

    for persona in frame['persona_region'].unique():
        persona_rows = frame[frame['persona_region'] == persona]
        per_response = {}

        for response in persona_rows['response_region'].unique():
            is_response = persona_rows['response_region'] == response
            tally = persona_rows.loc[is_response, 'polarity'].value_counts()

            # Labels absent from the tally count as zero occurrences.
            n_labelled = tally.sum()
            pos_pct = (tally.get('positive', 0) / n_labelled) * 100
            neg_pct = (tally.get('negative', 0) / n_labelled) * 100

            per_response[response] = {
                'polarity_counts': tally,
                'positive_percentage': pos_pct,
                'negative_percentage': neg_pct,
                'net_positive': pos_pct - neg_pct,
            }

        per_persona[persona] = per_response

    return per_persona

def merge_results(results_list):
    """Combine per-file polarity statistics into one result per region pair.

    For every (persona region, response region) pair, the polarity counts of
    all entries in ``results_list`` are summed, while the positive, negative
    and net-positive percentages are averaged with equal weight per entry
    that contains the pair.

    Args:
        results_list: Iterable of nested dicts
            ``{persona_region: {response_region: stats}}`` where ``stats``
            has the keys ``'polarity_counts'``, ``'positive_percentage'``,
            ``'negative_percentage'`` and ``'net_positive'``.

    Returns:
        A nested dict of the same shape holding the summed polarity counts
        and the averaged percentages.
    """
    metric_keys = ('positive_percentage', 'negative_percentage', 'net_positive')

    # Pass 1: sum the counts and collect every per-file percentage.
    collected = {}
    for per_file in results_list:
        for persona, responses in per_file.items():
            persona_bucket = collected.setdefault(persona, {})
            for response, stats in responses.items():
                bucket = persona_bucket.get(response)
                if bucket is None:
                    bucket = {'polarity_counts': pd.Series(dtype='int')}
                    for key in metric_keys:
                        bucket[key] = []
                    persona_bucket[response] = bucket

                # fill_value=0 keeps labels that are missing from one side.
                bucket['polarity_counts'] = bucket['polarity_counts'].add(
                    stats['polarity_counts'], fill_value=0
                )
                for key in metric_keys:
                    bucket[key].append(stats[key])

    # Pass 2: reduce each list of percentages to its arithmetic mean.
    merged = {}
    for persona, responses in collected.items():
        merged[persona] = {}
        for response, bucket in responses.items():
            entry = {'polarity_counts': bucket['polarity_counts']}
            for key in metric_keys:
                values = bucket[key]
                entry[key] = sum(values) / len(values)
            merged[persona][response] = entry

    return merged

# One balanced dataset per model; percentages are averaged across all of them.
file_paths = [
    'balanced_dataset_llama.csv',
    'balanced_dataset_gpt4.csv',
    'balanced_dataset_gemma.csv',
    'balanced_dataset_mistral.csv'
]

# Summarise every file on its own, then merge the per-file statistics.
all_results = list(map(process_dataset, file_paths))
average_results = merge_results(all_results)

print("Average Results:")
for persona_region, response_counts in average_results.items():
    print(f"Persona Region: {persona_region}")
    for response_region, stats in response_counts.items():
        positive_percentage = stats['positive_percentage']

        print(f"  Response Region: {response_region}")
        # Summed polarity counts first, then the averaged positive share.
        print(stats['polarity_counts'])
        print(f"  Positive Mention Rate (PMR): {positive_percentage:.2f}%")
    print()


# Calculate the chi-squared test as in Table 5

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Load the dataset; replace 'balanced_dataset.csv' with the file you are working with.
df = pd.read_csv('balanced_dataset.csv')

def y_mapping(region):
    """Collapse a response-region label into one of three coarse groups.

    Args:
        region: Region label to classify.

    Returns:
        ``'Western European'`` for ``'Western European and other States'``,
        ``'Other States'`` for the four remaining listed regional groups, and
        ``'Unknown'`` for any other value.
    """
    other_state_groups = (
        'Eastern European States',
        'Asia-Pacific States',
        'African States',
        'Latin American and Caribbean States',
    )

    if region == 'Western European and other States':
        return 'Western European'
    return 'Other States' if region in other_state_groups else 'Unknown'

# Reduce every response region to its coarse group before cross-tabulating.
df['mapped_region'] = df['response_region'].apply(y_mapping)

# Lower-case and strip the polarity labels so spelling variants share a row.
normalized_polarity = df['polarity'].str.lower().str.strip()
contingency_table = pd.crosstab(normalized_polarity, df['mapped_region'])

# Chi-squared test of independence on the polarity x region-group table.
test_outcome = chi2_contingency(contingency_table)
chi2, p_value, dof, expected = test_outcome

print(f"Chi-Squared Statistic: {chi2}")
print(f"P-Value: {p_value}")
print(f"Degrees of Freedom: {dof}")