In [1]:
import json
from collections import Counter
from sklearn.metrics import mutual_info_score
from sklearn.metrics.cluster import normalized_mutual_info_score
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.metrics import normalized_mutual_info_score
import numpy as np

# Define the two sample distributions
D1_responses = [
    {"do some things environmentally friendly": 5, "do many things environmentally friendly": 4, "do one or two things environmentally friendly": 1},
    {"do some things environmentally friendly": 5, "do many things environmentally friendly": 4, "don't do anything environmentally friendly": 1},
    {"do some things environmentally friendly": 6, "do many things environmentally friendly": 4},
    {"do some things environmentally friendly": 3, "do many things environmentally friendly": 5, "don't do anything environmentally friendly": 2},
    {"do many things environmentally friendly": 3, "do some things environmentally friendly": 7},
    {"do some things environmentally friendly": 5, "do one or two things environmentally friendly": 1, "do many things environmentally friendly": 4},
    {"do some things environmentally friendly": 6, "do many things environmentally friendly": 3, "don't do anything environmentally friendly": 1},
    {"do many things environmentally friendly": 6, "do some things environmentally friendly": 4},
    {"do many things environmentally friendly": 6, "do some things environmentally friendly": 4},
    {"do some things environmentally friendly": 7, "do many things environmentally friendly": 3},
]

# Aggregate D1 responses into a single distribution
D1_aggregated = {"Don't do Anything Environmentally Friendly": 0, "Do One or Two Things Environmentally Friendly": 0, "Do Some Things Environmentally Friendly": 0, "Do Many Things Environmentally Friendly": 0}

for response in D1_responses:
    for key in response:
        mapped_key = key.lower().replace(" ", "_")
        if "do_one_or_two_things" in mapped_key:
            D1_aggregated["Do One or Two Things Environmentally Friendly"] += response[key]
        elif "do_some_things" in mapped_key:
            D1_aggregated["Do Some Things Environmentally Friendly"] += response[key]
        elif "do_many_things" in mapped_key:
            D1_aggregated["Do Many Things Environmentally Friendly"] += response[key]
        elif "don't_do_anything" in mapped_key:
            D1_aggregated["Don't do Anything Environmentally Friendly"] += response[key]

# D2 distribution
D2 = {"Don't do Anything Environmentally Friendly": 6, "Do One or Two Things Environmentally Friendly": 36, "Do Some Things Environmentally Friendly": 40, "Do Many Things Environmentally Friendly": 16, "Do Everything Environmentally Friendly": 2}

# Convert both distributions to arrays in the same order
D1_array = np.array([D1_aggregated.get(key, 0) for key in D2])
D2_array = np.array([D2[key] for key in D2])

# Calculate normalized mutual information score
nmi_score = normalized_mutual_info_score(D1_array, D2_array)

nmi_score

1.0

In [3]:
base_synthetic_responses_path = 'C:\\Users\\haoch\\Documents\\COMP0190\\Data\\COMP0191-MSc-Project-Code\\Synthetic-Responses-JSON'
base_data_path = 'C:\\Users\\haoch\\Documents\\COMP0190\\Data\\COMP0191-MSc-Project-Code\\Environmental-Views-Variables'

In [4]:
question_1_json_file = "\\synthetic_responses_demo.json"
question_1_data_file = "\\scenv_crlf\\Environmental Friendly Behaviour Probability Distribution Wave 10.json"

In [5]:
question_2_json_file = "\\q2_synthetic_responses.json"
question_2_data_file = "\\scenv_bccc\\Scenv Bccc Probability Distribution Wave 10.json"

In [6]:
question_3_json_file = "\\q3_synthetic_responses.json"
question_3_data_file = "\\scenv_pmep\\Scenv Pmep Probability Distribution Wave 10.json"

In [7]:
question_4_json_file = "\\q4_synthetic_responses.json"
question_4_data_file = "\\OpenVB\\Open VB Probability Distribution Wave 18.json"

In [8]:
question_5_json_file = "\\q5_synthetic_responses.json"
question_5_data_file = "\\scenv_meds\\Scenv Meds Probability Distribution Wave 10.json"

In [9]:
question_6_json_file = "\\q6_synthetic_responses.json"
question_6_data_file = "\\etariff\\etariff Probability Distribution Wave 10.json"

In [10]:
question_7_json_file = "\\q7_synthetic_responses.json"
question_7_data_file = "\\grimyn\\grimyn Probability Distribution Wave 3.json"

In [11]:
question_8_json_file = "\\q8_synthetic_responses.json"
question_8_data_file = "\\orga3\\orga3 Probability Distribution Wave 3.json"

In [12]:
question_9_json_file = "\\q9_synthetic_responses.json"
question_9_data_file = "\\scenv_tlat\\Scenv Tlat Probability Distribution Wave 10.json"

In [13]:
questions_json_files = [question_1_json_file, question_2_json_file, question_3_json_file, question_4_json_file, question_5_json_file, question_6_json_file, question_7_json_file, question_8_json_file, question_9_json_file]
questions_data_files = [question_1_data_file, question_2_data_file, question_3_data_file, question_4_data_file, question_5_data_file, question_6_data_file, question_7_data_file, question_8_data_file, question_9_data_file]

## Mutual Information

In [14]:
def calculate_mutual_information_value(base_data_path, data_file, base_synthetic_responses_path, json_file):
    with open(base_data_path + data_file) as f:
        data = json.load(f)
    with open(base_synthetic_responses_path + json_file) as f:
        responses = json.load(f)

    # convert data keys to lower case
    data = {k.lower(): v for k, v in data.items()}

    # Aggregate synthetic responses
    synthetic_counts = Counter()
    for response in responses:
        synthetic_counts.update(responses[0]["Synthetic Responses"])

    # Convert counts to proportions
    total_responses = sum(synthetic_counts.values())

    data = {k.lower(): v * total_responses for k, v in data.items()}

    synthetic_proportions = {k.lower(): v for k, v in synthetic_counts.items()} 

    # if the keys are "neither agree nor disagree", simplify the key to neither
    if "neither agree nor disagree" in synthetic_proportions:
        synthetic_proportions["neither"] = synthetic_proportions["neither agree nor disagree"]
        del synthetic_proportions["neither agree nor disagree"]

    if "neither agree nor disagree" in data:
        data["neither"] = data["neither agree nor disagree"]
        del data["neither agree nor disagree"]

    # initialise the dictionary keys as the categories in the data
    categories = list(data.keys())

    # Ensure all keys in UKHLS data are present in synthetic data
    for key in data.keys():
        if key not in synthetic_proportions:
            synthetic_proportions[key] = 0.0

    data_list = [data[key] for key in categories]
    synthetic_proportions_list = [synthetic_proportions[key] for key in categories]

    print("Data List: ", data_list)
    print("Synthetic Proportions List: ", synthetic_proportions_list)

    mutual_info = normalized_mutual_info_score(data_list, synthetic_proportions_list)

    print("Normalized Mutual Information: ", mutual_info)

    return mutual_info

In [15]:
mutual_info_scores = []
for i in range(len(questions_json_files)):
    print("Question ", i+1)
    temp_score = calculate_mutual_information_value(base_data_path, questions_data_files[i], base_synthetic_responses_path, questions_json_files[i])
    print("\n")
    mutual_info_scores.append(temp_score)

Question  1
Data List:  [5.744607582255893, 35.657264502394895, 40.45330745390226, 16.23203831825439, 1.9127821431925616]
Synthetic Proportions List:  [0.0, 10, 50, 40, 0.0]
Normalized Mutual Information:  0.9057460992755193


Question  2
Data List:  [9.13, 36.75, 33.910000000000004, 16.86, 2.5100000000000002]
Synthetic Proportions List:  [3, 13, 42, 32, 10]
Normalized Mutual Information:  1.0


Question  3
Data List:  [6.801, 38.421, 19.078999999999997, 4.532, 31.061]
Synthetic Proportions List:  [47, 32, 11, 8, 2]
Normalized Mutual Information:  1.0


Question  4
Data List:  [5.0, 45.0, 28.000000000000004, 2.0, 20.0]
Synthetic Proportions List:  [56, 2, 31, 9, 2]
Normalized Mutual Information:  0.9057460992755193


Question  5
Data List:  [18.34474992962372, 40.56801476337931, 29.983422476619438, 8.720402865096494, 2.383409965281036]
Synthetic Proportions List:  [5, 13, 30, 38, 14]
Normalized Mutual Information:  1.0


Question  6
Data List:  [8.778179794043975, 7.854160868355136, 81

In [16]:
# save the temp scores and their question number in a text file in the same directory
with open("mutual_info_scores.txt", "w") as f:
    for i in range(len(mutual_info_scores)):
        # round each score to 4 decimal places
        f.write("Question " + str(i+1) + ": " + str(round(mutual_info_scores[i], 4)) + "\n")

## Study 2: Hypothesis 1 - Difference in Attitudes to Environmental Issues by whether an individual has children. 

In [17]:
q1_with_children_json_paths = ["\\Hypothesis-1\\synthetic_responses_question_1_wave_1_with_children.json", "\\Hypothesis-1\\synthetic_responses_question_1_wave_5_with_children.json", "\\Hypothesis-1\\synthetic_responses_question_1_wave_10_with_children.json"]
q1_without_children_json_paths = ["\\Hypothesis-1\\synthetic_responses_question_1_wave_1_without_children.json", "\\Hypothesis-1\\synthetic_responses_question_1_wave_5_without_children.json", "\\Hypothesis-1\\synthetic_responses_question_1_wave_10_without_children.json"]
wave_numbers = [1, 5, 10]

q1_with_children_json_filepath = "\\Hypothesis-1\\synthetic_responses_question_1_wave_10_with_children.json"
q1_without_children_json_filepath = "\\Hypothesis-1\\synthetic_responses_question_1_wave_10_without_children.json"
question_one_responses = ["Don't do Anything Environmentally Friendly", "Do One or Two Things Environmentally Friendly", "Do Some Things Environmentally Friendly", "Do Many Things Environmentally Friendly", "Do Everything Environmentally Friendly"]

In [18]:
def calculate_mutual_information_value_h1(base_data_path, d1_file, d2_file, pot_responses):
    with open(base_data_path + d1_file) as f:
        d1 = json.load(f)
    with open(base_synthetic_responses_path + d2_file) as f:
        d2 = json.load(f)

    # convert data keys to lower case
    d1 = {k.lower(): v for k, v in d1.items()}

    d2 = {k.lower(): v for k, v in d2.items()}

    pot_responses = [response.lower() for response in pot_responses]

    for response in pot_responses:
        if response not in d1.keys():
            d1[response] = 0

        if response not in d2.keys():
            d2[response] = 0

    d1_list = [d1[key] for key in pot_responses]
    d2_list = [d2[key] for key in pot_responses]

    mutual_info = normalized_mutual_info_score(d1_list, d2_list)

    return mutual_info

In [19]:
for i in range(len(q1_with_children_json_paths)):
    print("Question 1 Wave ", wave_numbers[i])
    temp_score = calculate_mutual_information_value_h1(base_data_path, question_1_data_file, q1_with_children_json_paths[i], question_one_responses)
    print("Mutual Information Score: ", temp_score)
    print("\n")

Question 1 Wave  1
Mutual Information Score:  0.9057460992755193


Question 1 Wave  5
Mutual Information Score:  0.9057460992755193


Question 1 Wave  10
Mutual Information Score:  1.0




In [20]:
q1_wave_10_mi = calculate_mutual_information_value_h1(base_synthetic_responses_path, q1_with_children_json_filepath, q1_without_children_json_filepath, question_one_responses)

In [21]:
print("Question 1 Wave 10 Mutual Information: ", q1_wave_10_mi)

Question 1 Wave 10 Mutual Information:  1.0
