In [1]:
import json
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
base_synthetic_responses_path = 'C:\\Users\\haoch\\Documents\\COMP0190\\Data\\COMP0191-MSc-Project-Code\\Synthetic-Responses-JSON'
base_data_path = 'C:\\Users\\haoch\\Documents\\COMP0190\\Data\\COMP0191-MSc-Project-Code\\Environmental-Views-Variables'

In [3]:
question_1_json_file = "\\synthetic_responses_demo.json"
question_1_data_file = "\\scenv_crlf\\Environmental Friendly Behaviour Probability Distribution Wave 10.json"
question_1_response_options = ["don't do anything environmentally friendly", 'do one or two things environmentally friendly', 'do some things environmentally friendly', 'do many things environmentally friendly', 'do everything environmentally friendly']

In [4]:
question_2_json_file = "\\q2_synthetic_responses.json"
question_2_data_file = "\\scenv_bccc\\Scenv Bccc Probability Distribution Wave 10.json"

In [5]:
question_3_json_file = "\\q3_synthetic_responses.json"
question_3_data_file = "\\scenv_pmep\\Scenv Pmep Probability Distribution Wave 10.json"

In [6]:
question_4_json_file = "\\q4_synthetic_responses.json"
question_4_data_file = "\\OpenVB\\Open VB Probability Distribution Wave 18.json"

In [7]:
question_5_json_file = "\\q5_synthetic_responses.json"
question_5_data_file = "\\scenv_meds\\Scenv Meds Probability Distribution Wave 10.json"

In [8]:
question_6_json_file = "\\q6_synthetic_responses.json"
question_6_data_file = "\\etariff\\etariff Probability Distribution Wave 10.json"

In [9]:
question_7_json_file = "\\q7_synthetic_responses.json"
question_7_data_file = "\\grimyn\\grimyn Probability Distribution Wave 3.json"

In [10]:
question_8_json_file = "\\q8_synthetic_responses.json"
question_8_data_file = "\\orga3\\orga3 Probability Distribution Wave 3.json"

In [11]:
question_9_json_file = "\\q9_synthetic_responses.json"
question_9_data_file = "\\scenv_tlat\\Scenv Tlat Probability Distribution Wave 10.json"

In [12]:
questions_json_files = [question_1_json_file, question_2_json_file, question_3_json_file, question_4_json_file, question_5_json_file, question_6_json_file, question_7_json_file, question_8_json_file, question_9_json_file]
questions_data_files = [question_1_data_file, question_2_data_file, question_3_data_file, question_4_data_file, question_5_data_file, question_6_data_file, question_7_data_file, question_8_data_file, question_9_data_file]

## MAF1 and Absolute Difference Calculation

In [13]:
def calculate_maf1_and_absolute_difference(base_data_path, question_2_data_file, base_synthetic_responses_path, question_2_json_file):
    # Load data
    with open(base_data_path + question_2_data_file) as f:
        data = json.load(f)

    with open(base_synthetic_responses_path + question_2_json_file) as f:
        responses = json.load(f)

    # Aggregate synthetic responses
    synthetic_counts = Counter()
    for response in responses:
        synthetic_counts.update(response["Synthetic Responses"])

    # Convert counts to proportions
    total_responses = sum(synthetic_counts.values())
    synthetic_proportions = {k: v / total_responses for k, v in synthetic_counts.items()}

    # convert the keys of synthetic_proportions to lowercase
    synthetic_proportions = {k.lower(): v for k, v in synthetic_proportions.items()}
    data = {k.lower(): v for k, v in data.items()}

    # if the keys are "neither agree nor disagree", simplify the key to neither
    if "neither agree nor disagree" in synthetic_proportions:
        synthetic_proportions["neither"] = synthetic_proportions["neither agree nor disagree"]
        del synthetic_proportions["neither agree nor disagree"]

    if "neither agree nor disagree" in data:
        data["neither"] = data["neither agree nor disagree"]
        del data["neither agree nor disagree"]

    # Ensure all keys in UKHLS data are present in synthetic data
    for key in data.keys():
        if key not in synthetic_proportions:
            synthetic_proportions[key] = 0

    if "neither" in data:
        print(data)
    if "neither" in synthetic_proportions:
        print(synthetic_proportions)

    # Compute the absolute differences
    absolute_differences = {k: abs(synthetic_proportions[k] - data[k]) for k in data.keys()}

    normalized_absolute_differences = {k: v / data[k] for k, v in absolute_differences.items()}

    # multiply both synthetic_proportions and data by total_responses to get the counts in integer form
    synthetic_proportions = {k: int(v * total_responses) for k, v in synthetic_proportions.items()}
    data = {k: int(v * total_responses) for k, v in data.items()}

    cosine_similarity_score = cosine_similarity([list(synthetic_proportions.values())], [list(data.values())])[0][0]  

    return absolute_differences, normalized_absolute_differences, cosine_similarity_score

In [14]:
question_index = 0

sum_maf1 = 0
sum_absolute_differences = Counter()

overall_normalized_absolute_differences = []

cosine_similarity_scores = []

for question_json_file, data_json_file in zip(questions_json_files, questions_data_files):
    absolute_differences, normalized_absolute_differences, cosine_similarity_score = calculate_maf1_and_absolute_difference(base_data_path, data_json_file, base_synthetic_responses_path, question_json_file)
    print(f"Question {question_index + 1} Absolute Differences: {absolute_differences}")
    # print(f"Question {question_index + 1} Normalized Absolute Differences: {normalized_absolute_differences}")
    print(f"Question {question_index + 1} Cosine Similarity Scores: {cosine_similarity_score}")

    # store the absolute differences in text files
    with open(f"question_{question_index + 1}_absolute_differences.txt", "w") as f:
        f.write(json.dumps(absolute_differences))

    overall_normalized_absolute_differences.append(normalized_absolute_differences)

    sum_absolute_differences.update(absolute_differences)

    cosine_similarity_scores.append(cosine_similarity_score)

    question_index += 1

# write cosine similarity score to a text file
with open("cosine_similarity_scores.text", "w") as f:
    f.write(json.dumps(cosine_similarity_scores))

Question 1 Absolute Differences: {"don't do anything environmentally friendly": 0.01744607582255893, 'do one or two things environmentally friendly': 0.3365726450239489, 'do some things environmentally friendly': 0.11546692546097737, 'do many things environmentally friendly': 0.2576796168174561, 'do everything environmentally friendly': 0.019127821431925617}
Question 1 Cosine Similarity Scores: 0.5018483906814953
{'strongly agree': 0.0813, 'tend to agree': 0.3675, 'neither': 0.3391, 'tend to disagree': 0.1186, 'strongly disagree': 0.0251}
{'neither': 0.42, 'tend to agree': 0.13, 'tend to disagree': 0.32, 'strongly agree': 0.03, 'strongly disagree': 0.1}
Question 2 Absolute Differences: {'strongly agree': 0.0513, 'tend to agree': 0.2375, 'neither': 0.08089999999999997, 'tend to disagree': 0.20140000000000002, 'strongly disagree': 0.07490000000000001}
Question 2 Cosine Similarity Scores: 0.6809647376971487
{'strongly agree': 0.06801, 'tend to agree': 0.38521, 'tend to disagree': 0.19079,

In [15]:
# Calculate the total sum of absolute differences and the number of differences
total_sum_absolute_differences = 0
total_number_of_differences = 0

for diff, count in sum_absolute_differences.items():
    # Ensure that the difference is a number
    try:
        total_sum_absolute_differences += count
        total_number_of_differences += 1
    except (ValueError, TypeError) as e:
        pass

# Calculate the average of absolute differences
if total_number_of_differences > 0:
    average_absolute_difference = total_sum_absolute_differences / total_number_of_differences
else:
    average_absolute_difference = 0

print(f"Average Absolute Difference: {average_absolute_difference}")

Average Absolute Difference: 0.3621758866061162


In [16]:
for normalized_difference in overall_normalized_absolute_differences:
    print(normalized_difference)

{"don't do anything environmentally friendly": 0.3036948228882833, 'do one or two things environmentally friendly': 0.9439104477611939, 'do some things environmentally friendly': 0.28543259557344064, 'do many things environmentally friendly': 1.5874754098360655, 'do everything environmentally friendly': 1.0}
{'strongly agree': 0.6309963099630996, 'tend to agree': 0.6462585034013605, 'neither': 0.23857269242111462, 'tend to disagree': 1.6981450252951098, 'strongly disagree': 2.9840637450199203}
{'strongly agree': 1.4996324069989708, 'tend to agree': 0.14223410607201267, 'tend to disagree': 0.42344986634519627, 'strongly disagree': 0.33804060017652254, 'neither': 0.19513215929944303}
{'strongly agree': 10.2, 'agree': 0.9555555555555555, 'disagree': 0.10714285714285703, 'strongly disagree': 3.4999999999999996, 'already changed': 0.9}
{'strongly agree': 0.7274424552429668, 'tend to agree': 0.679550501156515, 'neither': 0.0005528896307114018, 'tend to disagree': 3.357596843615495, 'strongly

In [17]:
def calculate_maf1_and_absolute_difference_h1(base_data_path, question_2_data_file, question_2_json_file):
    # Load data
    with open(base_data_path + question_2_data_file) as f:
        data = json.load(f)

    with open(base_synthetic_responses_path + question_2_json_file) as f:
        responses = json.load(f)

    # convert the keys of synthetic_proportions to lowercase
    synthetic_proportions = {k.lower(): v for k, v in responses.items()}
    data = {k.lower(): v for k, v in data.items()}

    total_responses = sum(data.values())

    synthetic_proportions = {k: v / total_responses for k, v in synthetic_proportions.items()}
    data = {k: v / total_responses for k, v in data.items()}

    # if the keys are "neither agree nor disagree", simplify the key to neither
    if "neither agree nor disagree" in synthetic_proportions:
        synthetic_proportions["neither"] = synthetic_proportions["neither agree nor disagree"]
        del synthetic_proportions["neither agree nor disagree"]

    if "neither agree nor disagree" in data:
        data["neither"] = data["neither agree nor disagree"]
        del data["neither agree nor disagree"]

    # Ensure all keys in UKHLS data are present in synthetic data
    for key in data.keys():
        if key not in synthetic_proportions:
            synthetic_proportions[key] = 0

    # Compute the absolute differences
    absolute_differences = {k: abs(synthetic_proportions[k] - data[k]) for k in data.keys()}

    # normalized_absolute_differences = {k: v / data[k] for k, v in absolute_differences.items()}
    normalized_absolute_differences = 0

    # multiply both synthetic_proportions and data by total_responses to get the counts in integer form
    synthetic_proportions = {k: int(v * total_responses) for k, v in synthetic_proportions.items()}
    data = {k: int(v * total_responses) for k, v in data.items()}

    cosine_similarity_score = cosine_similarity([list(synthetic_proportions.values())], [list(data.values())])[0][0]  

    return absolute_differences, normalized_absolute_differences, cosine_similarity_score

In [28]:
base_synthetic_responses_path = 'C:\\Users\\haoch\\Documents\\COMP0190\\Data\\COMP0191-MSc-Project-Code\\Synthetic-Responses-JSON'
q1_with_children_json_paths = ["\\Hypothesis-1\\synthetic_responses_question_3_wave_1_with_children.json", "\\Hypothesis-1\\synthetic_responses_question_3_wave_5_with_children.json", "\\Hypothesis-1\\synthetic_responses_question_7_wave_10_with_children.json"]
q1_without_children_json_paths = ["\\Hypothesis-1\\synthetic_responses_question_3_wave_1_without_children.json", "\\Hypothesis-1\\synthetic_responses_question_3_wave_5_without_children.json", "\\Hypothesis-1\\synthetic_responses_question_7_wave_10_without_children.json"]
wave_numbers = [1, 5, 10]

In [29]:
question_index = 0

sum_maf1 = 0
sum_absolute_differences = Counter()

overall_normalized_absolute_differences = []

cosine_similarity_scores = []

for question_json_file, data_json_file in zip(q1_with_children_json_paths, q1_without_children_json_paths):
    absolute_differences, normalized_absolute_differences, cosine_similarity_score = calculate_maf1_and_absolute_difference_h1(base_synthetic_responses_path, data_json_file, question_json_file)
    print(f"Question {question_index + 1} Absolute Differences With and Without Children: {absolute_differences}")
    # print(f"Question {question_index + 1} Normalized Absolute Differences: {normalized_absolute_differences}")
    print(f"Question {question_index + 1} Cosine Similarity Scores With and Without Children: {cosine_similarity_score}")

    overall_normalized_absolute_differences.append(normalized_absolute_differences)

    sum_absolute_differences.update(absolute_differences)

    cosine_similarity_scores.append(cosine_similarity_score)

    question_index += 1

Question 1 Absolute Differences With and Without Children: {'strongly agree': 0.0, 'tend to agree': 0.0, 'neither': 0.0, 'tend to disagree': 0.0, 'strongly disagree': 0.0}
Question 1 Cosine Similarity Scores With and Without Children: 1.0
Question 2 Absolute Differences With and Without Children: {'strongly agree': 0.0, 'tend to agree': 0.0, 'neither': 0.0, 'tend to disagree': 0.0, 'strongly disagree': 0.0}
Question 2 Cosine Similarity Scores With and Without Children: 0.9999999999999999
Question 3 Absolute Differences With and Without Children: {'yes - already buy': 0.07, 'yes - seriously considering': 0.04999999999999999, 'no': 0.03, 'considered and rejected': 0.01}
Question 3 Cosine Similarity Scores With and Without Children: 0.9889639934559099
