In [1]:
import json
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import scipy.stats as stats
import math
import pandas as pd
from scipy.stats import chisquare

In [2]:
base_synthetic_responses_path = 'C:\\Users\\haoch\\Documents\\COMP0190\\Data\\COMP0191-MSc-Project-Code\\Synthetic-Responses-JSON'
base_data_path = 'C:\\Users\\haoch\\Documents\\COMP0190\\Data\\COMP0191-MSc-Project-Code\\Environmental-Views-Variables'

In [3]:
question_1_json_file = "\\synthetic_responses_demo.json"
question_1_data_file = "\\scenv_crlf\\Environmental Friendly Behaviour Probability Distribution Wave 10.json"
question_1_response_options = ["don't do anything environmentally friendly", 'do one or two things environmentally friendly', 'do some things environmentally friendly', 'do many things environmentally friendly', 'do everything environmentally friendly']

In [4]:
question_2_json_file = "\\q2_synthetic_responses.json"
question_2_data_file = "\\scenv_bccc\\Scenv Bccc Probability Distribution Wave 10.json"

In [5]:
question_3_json_file = "\\q3_synthetic_responses.json"
question_3_data_file = "\\scenv_pmep\\Scenv Pmep Probability Distribution Wave 10.json"

In [6]:
question_4_json_file = "\\q4_synthetic_responses.json"
question_4_data_file = "\\OpenVB\\Open VB Probability Distribution Wave 18.json"

In [7]:
question_5_json_file = "\\q5_synthetic_responses.json"
question_5_data_file = "\\scenv_meds\\Scenv Meds Probability Distribution Wave 10.json"

In [8]:
question_6_json_file = "\\q6_synthetic_responses.json"
question_6_data_file = "\\etariff\\etariff Probability Distribution Wave 10.json"

In [9]:
question_7_json_file = "\\q7_synthetic_responses.json"
question_7_data_file = "\\grimyn\\grimyn Probability Distribution Wave 3.json"

In [10]:
question_8_json_file = "\\q8_synthetic_responses.json"
question_8_data_file = "\\orga3\\orga3 Probability Distribution Wave 3.json"

In [11]:
question_9_json_file = "\\q9_synthetic_responses.json"
question_9_data_file = "\\scenv_tlat\\Scenv Tlat Probability Distribution Wave 10.json"

In [12]:
questions_json_files = [question_1_json_file, question_2_json_file, question_3_json_file, question_4_json_file, question_5_json_file, question_6_json_file, question_7_json_file, question_8_json_file, question_9_json_file]
questions_data_files = [question_1_data_file, question_2_data_file, question_3_data_file, question_4_data_file, question_5_data_file, question_6_data_file, question_7_data_file, question_8_data_file, question_9_data_file]

## MAF1 and Absolute Difference Calculation

In [13]:
def custom_round(v, total_responses):
    """
    Rounds a value based on the total number of responses.
    Parameters:
    v (float): The value to be rounded.
    total_responses (int): The total number of responses.
    Returns:
    int: The rounded value.
    """
    product = v * total_responses
    integer_part = math.floor(product)
    decimal_part = product - integer_part
    
    if decimal_part > 0.5:
        return math.ceil(product)
    else:
        return integer_part

In [14]:
def calculate_maf1_and_absolute_difference(base_data_path, question_2_data_file, base_synthetic_responses_path, question_2_json_file, ordered_option):
    """
    Calculates the MAF1 score and absolute differences between synthetic and actual data.
    Parameters:
    - base_data_path (str): The base path of the data file.
    - question_2_data_file (str): The file name of the data file.
    - base_synthetic_responses_path (str): The base path of the synthetic responses file.
    - question_2_json_file (str): The file name of the synthetic responses file.
    - ordered_option (list): The ordered list of categories.
    Returns:
    - absolute_differences (dict): A dictionary containing the absolute differences between synthetic and actual data.
    - normalized_absolute_differences (dict): A dictionary containing the normalized absolute differences between synthetic and actual data.
    - cosine_similarity_score (float): The cosine similarity score between synthetic and actual data.
    """
    # Load data
    with open(base_data_path + question_2_data_file) as f:
        data = json.load(f)

    with open(base_synthetic_responses_path + question_2_json_file) as f:
        responses = json.load(f)

    # Aggregate synthetic responses
    synthetic_counts = Counter()
    for response in responses:
        synthetic_counts.update(response["Synthetic Responses"])

    # Convert counts to proportions
    total_responses = sum(synthetic_counts.values())
    synthetic_proportions = {k: v / total_responses for k, v in synthetic_counts.items()}

    # convert the keys of synthetic_proportions to lowercase
    synthetic_proportions = {k.lower(): v for k, v in synthetic_proportions.items()}
    data = {k.lower(): v for k, v in data.items()}

    # if the keys are "neither agree nor disagree", simplify the key to neither
    if "neither agree nor disagree" in synthetic_proportions:
        synthetic_proportions["neither"] = synthetic_proportions["neither agree nor disagree"]
        del synthetic_proportions["neither agree nor disagree"]

    if "neither agree nor disagree" in data:
        data["neither"] = data["neither agree nor disagree"]
        del data["neither agree nor disagree"]

    # Ensure all keys in UKHLS data are present in synthetic data
    for key in data.keys():
        if key not in synthetic_proportions:
            synthetic_proportions[key] = 0

    # Compute the absolute differences
    absolute_differences = {k: abs(synthetic_proportions[k] - data[k]) for k in data.keys()}

    normalized_absolute_differences = {k: v / data[k] for k, v in absolute_differences.items()}

    # multiply both synthetic_proportions and data by total_responses to get the counts in integer form
    synthetic_proportions = {k: custom_round(v, total_responses) for k, v in synthetic_proportions.items()}
    data = {k: custom_round(v, total_responses) for k, v in data.items()}

    cosine_similarity_score = cosine_similarity([list(synthetic_proportions.values())], [list(data.values())])[0][0]  

    print(data)

    # sort the keys of synthetic proportions in the order of ordered option
    synthetic_proportions = {k: synthetic_proportions[k] for k in ordered_option}
    print(synthetic_proportions)

    categories = ordered_option

    # initialise a contingency table using the categories, synthetic_proportions and data
    contingency_table = pd.DataFrame({
        'Category': categories,
        'Synthetic Proportions': [synthetic_proportions[category] for category in categories],
        'Expected Data': [data[category] for category in categories]
    })

    contingency_table.set_index('Category', inplace=True)

    # Perform the Chi-Squared test
    chi2_stat, p_value = chisquare(contingency_table['Synthetic Proportions'], contingency_table['Expected Data'])

    print(f"Chi-Squared Statistic: {chi2_stat}")
    print(f"P-Value: {p_value}")

    return absolute_differences, normalized_absolute_differences, cosine_similarity_score

In [15]:
ordered_options = [
    ["don't do anything environmentally friendly", 'do one or two things environmentally friendly', 'do some things environmentally friendly', 'do many things environmentally friendly', 'do everything environmentally friendly'],
    ['strongly agree', 'tend to agree', 'neither', 'tend to disagree', 'strongly disagree'],
    ['strongly agree', 'tend to agree', 'tend to disagree', 'strongly disagree', 'neither'],
    ['strongly agree', 'agree', 'disagree', 'strongly disagree', 'already changed'],
    ['strongly agree', 'tend to agree', 'neither', 'tend to disagree', 'strongly disagree'],
    ['yes - already buy', 'yes - seriously considering', 'no', 'considered and rejected'],
    ['yes', 'no'],
    ['mentioned', 'not mentioned'],
    ['strongly agree', 'tend to agree', 'neither', 'tend to disagree', 'strongly disagree']
]

In [16]:
question_index = 0

sum_maf1 = 0
sum_absolute_differences = Counter()

overall_normalized_absolute_differences = []

cosine_similarity_scores = []

for question_json_file, data_json_file, ordered_option in zip(questions_json_files, questions_data_files, ordered_options):
    absolute_differences, normalized_absolute_differences, cosine_similarity_score = calculate_maf1_and_absolute_difference(base_data_path, data_json_file, base_synthetic_responses_path, question_json_file, ordered_option)
    print(f"Question {question_index + 1} Absolute Differences: {absolute_differences}")
    # print(f"Question {question_index + 1} Normalized Absolute Differences: {normalized_absolute_differences}")
    print(f"Question {question_index + 1} Cosine Similarity Scores: {cosine_similarity_score}")

    # store the absolute differences in text files
    with open(f"question_{question_index + 1}_absolute_differences.txt", "w") as f:
        f.write(json.dumps(absolute_differences))

    overall_normalized_absolute_differences.append(normalized_absolute_differences)

    sum_absolute_differences.update(absolute_differences)

    cosine_similarity_scores.append(cosine_similarity_score)

    question_index += 1

# write cosine similarity score to a text file
with open("cosine_similarity_scores.text", "w") as f:
    f.write(json.dumps(cosine_similarity_scores))

{"don't do anything environmentally friendly": 6, 'do one or two things environmentally friendly': 36, 'do some things environmentally friendly': 40, 'do many things environmentally friendly': 16, 'do everything environmentally friendly': 2}
{"don't do anything environmentally friendly": 4, 'do one or two things environmentally friendly': 2, 'do some things environmentally friendly': 52, 'do many things environmentally friendly': 42, 'do everything environmentally friendly': 0}
Chi-Squared Statistic: 80.62777777777778
P-Value: 1.2823205101362282e-16
Question 1 Absolute Differences: {"don't do anything environmentally friendly": 0.01744607582255893, 'do one or two things environmentally friendly': 0.3365726450239489, 'do some things environmentally friendly': 0.11546692546097737, 'do many things environmentally friendly': 0.2576796168174561, 'do everything environmentally friendly': 0.019127821431925617}
Question 1 Cosine Similarity Scores: 0.5199567597069386
{'strongly agree': 9, 'tend

In [17]:
# Calculate the total sum of absolute differences and the number of differences
total_sum_absolute_differences = 0
total_number_of_differences = 0

for diff, count in sum_absolute_differences.items():
    # Ensure that the difference is a number
    try:
        total_sum_absolute_differences += count
        total_number_of_differences += 1
    except (ValueError, TypeError) as e:
        pass

# Calculate the average of absolute differences
if total_number_of_differences > 0:
    average_absolute_difference = total_sum_absolute_differences / total_number_of_differences
else:
    average_absolute_difference = 0

print(f"Average Absolute Difference: {average_absolute_difference}")

Average Absolute Difference: 0.3881148866061161


In [18]:
for normalized_difference in overall_normalized_absolute_differences:
    print(normalized_difference)

{"don't do anything environmentally friendly": 0.3036948228882833, 'do one or two things environmentally friendly': 0.9439104477611939, 'do some things environmentally friendly': 0.28543259557344064, 'do many things environmentally friendly': 1.5874754098360655, 'do everything environmentally friendly': 1.0}
{'strongly agree': 0.6714129244249727, 'tend to agree': 0.6462585034013605, 'neither': 0.23857269242111462, 'tend to disagree': 0.8979833926453143, 'strongly disagree': 2.9840637450199203}
{'strongly agree': 5.910748419350095, 'tend to agree': 0.16712214674266673, 'tend to disagree': 0.42344986634519627, 'strongly disagree': 0.76522506619594, 'neither': 0.9356105727439554}
{'strongly agree': 10.2, 'agree': 0.9555555555555555, 'disagree': 0.10714285714285703, 'strongly disagree': 3.4999999999999996, 'already changed': 0.9}
{'strongly agree': 0.7274424552429668, 'tend to agree': 0.679550501156515, 'neither': 0.0005528896307114018, 'tend to disagree': 3.357596843615495, 'strongly disa

In [19]:
def calculate_expected_mean(distribution, question_dictionary_mapping):
    """
    Calculate the expected mean for a given probability distribution.

    :param distribution: Dictionary where keys are response levels and values are their corresponding probabilities
    :param question_dictionary_mapping: Dictionary mapping response levels to their corresponding numerical values
    :return: Expected mean value of the distribution
    """
    # try to convert every key in the question_dictionary_mapping to lowercase
    question_dictionary_mapping = {v.lower(): k for k, v in question_dictionary_mapping.items()}

    # print(question_dictionary_mapping)

    expected_mean = sum(question_dictionary_mapping[response] * prob for response, prob in distribution.items())
    return expected_mean

In [20]:
def calculate_maf1_and_absolute_difference_h1(base_data_path, question_2_data_file, question_2_json_file, question_dictionary_mapping):
    # Load data
    with open(base_data_path + question_2_data_file) as f:
        data = json.load(f)

    with open(base_synthetic_responses_path + question_2_json_file) as f:
        responses = json.load(f)

    # convert the keys of synthetic_proportions to lowercase
    synthetic_proportions = {k.lower(): v for k, v in responses.items()}
    data = {k.lower(): v for k, v in data.items()}

    total_responses = sum(data.values())

    synthetic_proportions = {k: v / total_responses for k, v in synthetic_proportions.items()}
    data = {k: v / total_responses for k, v in data.items()}

    # if the keys are "neither agree nor disagree", simplify the key to neither
    if "neither agree nor disagree" in synthetic_proportions:
        synthetic_proportions["neither"] = synthetic_proportions["neither agree nor disagree"]
        del synthetic_proportions["neither agree nor disagree"]

    if "neither agree nor disagree" in data:
        data["neither"] = data["neither agree nor disagree"]
        del data["neither agree nor disagree"]

    # Ensure all keys in UKHLS data are present in synthetic data
    for key in data.keys():
        if key not in synthetic_proportions:
            synthetic_proportions[key] = 0

    print("Synthetic Distribution: ", synthetic_proportions)
    print("Data Distribution: ", data)

    expected_synthetic_distribution_value = calculate_expected_mean(synthetic_proportions, question_dictionary_mapping)
    expected_data_distribution_value = calculate_expected_mean(data, question_dictionary_mapping)

    print(f"Expected Synthetic Distribution Value: {expected_synthetic_distribution_value}")
    print(f"Expected Data Distribution Value: {expected_data_distribution_value}")

    # Compute the absolute differences
    absolute_differences = {k: abs(synthetic_proportions[k] - data[k]) for k in data.keys()}

    # normalized_absolute_differences = {k: v / data[k] for k, v in absolute_differences.items()}
    normalized_absolute_differences = 0

    # multiply both synthetic_proportions and data by total_responses to get the counts in integer form
    synthetic_proportions = {k: int(v * total_responses) for k, v in synthetic_proportions.items()}
    data = {k: int(v * total_responses) for k, v in data.items()}

    cosine_similarity_score = cosine_similarity([list(synthetic_proportions.values())], [list(data.values())])[0][0]  

    return absolute_differences, normalized_absolute_differences, cosine_similarity_score

In [21]:
scenv_crlf_dict = {
    1: "Don't do Anything Environmentally Friendly",
    2: "Do One or Two Things Environmentally Friendly",
    3: "Do Some Things Environmentally Friendly",
    4: "Do Many Things Environmentally Friendly",
    5: "Do Everything Environmentally Friendly"
} # dictionary to create mapping for question 1

q1_human_distribution = {
    6: "Entirely Positive",
    5: "More Positive than Negative",
    4: "Neither",
    3: "More Negative than Positive",
    2: "Entirely Negative",
    1: "Don't Know"
} # distribution for question 2

scenv_bccc_dict = {
    1: "Strongly Agree",
    2: "Tend to Agree",
    3: "Neither",
    4: "Tend to Disagree",
    5: "Strongly Disagree"
} # dictionary to create mapping for question 3

scenv_pmep_dict = {
    1: "Strongly Agree",
    2: "Tend to Agree",
    3: "Neither",
    4: "Tend to Disagree",
    5: "Strongly Disagree"
} # dictionary to create mapping for question 4

br_openvb_distribution = {
    5: "Strongly Agree",
    4: "Agree",
    3: "Disagree",
    2: "Strongly Disagree",
    1: "Already Changed" 
} # initialise a dictionary to store the attitudes about whether personal changes are needed to protect the environment for question 5

scenv_meds_dict = {
    1: "Strongly Agree",
    2: "Tend to Agree",
    3: "Neither",
    4: "Tend to Disagree",
    5: "Strongly Disagree"
} # dictionary to create mapping for question 6

etariff_dict = {
    1: "Yes - already buy",
    2: "Yes - seriously considering",
    3: "No",
    4: "Considered and rejected"
} # dictionary to create mapping for question 7


In [22]:
base_synthetic_responses_path = 'C:\\Users\\haoch\\Documents\\COMP0190\\Data\\COMP0191-MSc-Project-Code\\Synthetic-Responses-JSON'
q1_with_children_json_paths = ["\\Hypothesis-1\\synthetic_responses_question_7_wave_10_with_children.json"]
q1_without_children_json_paths = ["\\Hypothesis-1\\synthetic_responses_question_7_wave_10_without_children.json"]
question_dictionary_mappings = [etariff_dict]
wave_numbers = [10]

In [23]:
question_index = 0

sum_maf1 = 0
sum_absolute_differences = Counter()

overall_normalized_absolute_differences = []

cosine_similarity_scores = []

for question_json_file, data_json_file, question_dictionary_mapping in zip(q1_with_children_json_paths, q1_without_children_json_paths, question_dictionary_mappings):
    print("Data JSON File: ", data_json_file)
    print("Question JSON File: ", question_json_file)
    
    absolute_differences, normalized_absolute_differences, cosine_similarity_score = calculate_maf1_and_absolute_difference_h1(base_synthetic_responses_path, data_json_file, question_json_file, question_dictionary_mapping)
    print(f"Question {question_index + 1} Absolute Differences With and Without Children: {absolute_differences}")
    # print(f"Question {question_index + 1} Normalized Absolute Differences: {normalized_absolute_differences}")
    print(f"Question {question_index + 1} Cosine Similarity Scores With and Without Children: {cosine_similarity_score}")

    overall_normalized_absolute_differences.append(normalized_absolute_differences)

    sum_absolute_differences.update(absolute_differences)

    cosine_similarity_scores.append(cosine_similarity_score)

    question_index += 1

Data JSON File:  \Hypothesis-1\synthetic_responses_question_7_wave_10_without_children.json
Question JSON File:  \Hypothesis-1\synthetic_responses_question_7_wave_10_with_children.json
Synthetic Distribution:  {'yes - already buy': 0.31, 'yes - seriously considering': 0.45, 'no': 0.23, 'considered and rejected': 0.01}
Data Distribution:  {'yes - already buy': 0.24, 'yes - seriously considering': 0.5, 'no': 0.26, 'considered and rejected': 0.0}
Expected Synthetic Distribution Value: 1.94
Expected Data Distribution Value: 2.02
Question 1 Absolute Differences With and Without Children: {'yes - already buy': 0.07, 'yes - seriously considering': 0.04999999999999999, 'no': 0.03, 'considered and rejected': 0.01}
Question 1 Cosine Similarity Scores With and Without Children: 0.9889639934559099
