In [1]:
import json
from collections import Counter
from sklearn.metrics import mutual_info_score
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
base_synthetic_responses_path = 'C:\\Users\\haoch\\Documents\\COMP0190\\Data\\COMP0191-MSc-Project-Code\\Synthetic-Responses-JSON'
base_data_path = 'C:\\Users\\haoch\\Documents\\COMP0190\\Data\\COMP0191-MSc-Project-Code\\Environmental-Views-Variables'

In [3]:
question_1_json_file = "\\synthetic_responses_demo.json"
question_1_data_file = "\\scenv_crlf\\Environmental Friendly Behaviour Probability Distribution Wave 10.json"

In [4]:
question_2_json_file = "\\q2_synthetic_responses.json"
question_2_data_file = "\\scenv_bccc\\Scenv Bccc Probability Distribution Wave 10.json"

In [5]:
question_3_json_file = "\\q3_synthetic_responses.json"
question_3_data_file = "\\scenv_pmep\\Scenv Pmep Probability Distribution Wave 10.json"

In [6]:
question_4_json_file = "\\q4_synthetic_responses.json"
question_4_data_file = "\\OpenVB\\Open VB Probability Distribution Wave 18.json"

In [7]:
question_5_json_file = "\\q5_synthetic_responses.json"
question_5_data_file = "\\scenv_meds\\Scenv Meds Probability Distribution Wave 10.json"

In [8]:
question_6_json_file = "\\q6_synthetic_responses.json"
question_6_data_file = "\\etariff\\etariff Probability Distribution Wave 10.json"

In [9]:
question_7_json_file = "\\q7_synthetic_responses.json"
question_7_data_file = "\\grimyn\\grimyn Probability Distribution Wave 3.json"

In [10]:
question_8_json_file = "\\q8_synthetic_responses.json"
question_8_data_file = "\\orga3\\orga3 Probability Distribution Wave 3.json"

In [11]:
question_9_json_file = "\\q9_synthetic_responses.json"
question_9_data_file = "\\scenv_tlat\\Scenv Tlat Probability Distribution Wave 10.json"

In [12]:
questions_json_files = [question_1_json_file, question_2_json_file, question_3_json_file, question_4_json_file, question_5_json_file, question_6_json_file, question_7_json_file, question_8_json_file, question_9_json_file]
questions_data_files = [question_1_data_file, question_2_data_file, question_3_data_file, question_4_data_file, question_5_data_file, question_6_data_file, question_7_data_file, question_8_data_file, question_9_data_file]

## Mutual Information

In [13]:
def calculate_mutual_information_value(base_data_path, data_file, base_synthetic_responses_path, json_file):
    with open(base_data_path + data_file) as f:
        data = json.load(f)
    with open(base_synthetic_responses_path + json_file) as f:
        responses = json.load(f)

    # initialise the dictionary keys as the categories in the data
    categories = list(data.keys())

    # convert data keys to lower case
    data = {k.lower(): v for k, v in data.items()}

    # convert every item in categories to lower case
    categories = [category.lower() for category in categories]

    # Aggregate synthetic responses
    synthetic_counts = Counter()
    for response in responses:
        synthetic_counts.update(responses[0]["Synthetic Responses"])

    # Convert counts to proportions
    total_responses = sum(synthetic_counts.values())
    synthetic_proportions = {k.lower(): v / total_responses for k, v in synthetic_counts.items()} 

    # Ensure all keys in UKHLS data are present in synthetic data
    for key in data.keys():
        if key not in synthetic_proportions:
            synthetic_proportions[key] = 0.0

    # print data and synthetic proportions
    print("Data: ", data)
    print("Synthetic Proportions: ", synthetic_proportions)

    data_list = [data[key] for key in categories]
    synthetic_proportions_list = [synthetic_proportions[key] for key in categories]

    print("Data List: ", data_list)
    print("Synthetic Proportions List: ", synthetic_proportions_list)

    mutual_info = mutual_info_score(data_list, synthetic_proportions_list)

    print("Mutual Information: ", mutual_info)

    return mutual_info

In [14]:
mutual_info_scores = []
for i in range(len(questions_json_files)):
    print("Question ", i+1)
    temp_score = calculate_mutual_information_value(base_data_path, questions_data_files[i], base_synthetic_responses_path, questions_json_files[i])
    print("\n")
    mutual_info_scores.append(temp_score)

Question  1
Data:  {"don't do anything environmentally friendly": 0.05744607582255893, 'do one or two things environmentally friendly': 0.3565726450239489, 'do some things environmentally friendly': 0.40453307453902265, 'do many things environmentally friendly': 0.1623203831825439, 'do everything environmentally friendly': 0.019127821431925617}
Synthetic Proportions:  {'do some things environmentally friendly': 0.5, 'do many things environmentally friendly': 0.4, 'do one or two things environmentally friendly': 0.1, "don't do anything environmentally friendly": 0.0, 'do everything environmentally friendly': 0.0}
Data List:  [0.05744607582255893, 0.3565726450239489, 0.40453307453902265, 0.1623203831825439, 0.019127821431925617]
Synthetic Proportions List:  [0.0, 0.1, 0.5, 0.4, 0.0]
Mutual Information:  1.3321790402101223


Question  2
Data:  {'strongly agree': 0.0813, 'tend to agree': 0.3675, 'neither': 0.3391, 'tend to disagree': 0.1186, 'strongly disagree': 0.0251}
Synthetic Proportio

In [15]:
# save the temp scores and their question number in a text file in the same directory
with open("mutual_info_scores.txt", "w") as f:
    for i in range(len(mutual_info_scores)):
        # round each score to 4 decimal places
        f.write("Question " + str(i+1) + ": " + str(round(mutual_info_scores[i], 4)) + "\n")

## Study 2: Hypothesis 1 - Difference in Attitudes to Environmental Issues by whether an individual has children. 

In [16]:
q1_with_children_json_paths = ["\\Hypothesis-1\\synthetic_responses_question_1_wave_1_with_children.json", "\\Hypothesis-1\\synthetic_responses_question_1_wave_5_with_children.json", "\\Hypothesis-1\\synthetic_responses_question_1_wave_10_with_children.json"]
q1_without_children_json_paths = ["\\Hypothesis-1\\synthetic_responses_question_1_wave_1_without_children.json", "\\Hypothesis-1\\synthetic_responses_question_1_wave_5_without_children.json", "\\Hypothesis-1\\synthetic_responses_question_1_wave_10_without_children.json"]
wave_numbers = [1, 5, 10]

q1_with_children_json_filepath = "\\Hypothesis-1\\synthetic_responses_question_1_wave_10_with_children.json"
q1_without_children_json_filepath = "\\Hypothesis-1\\synthetic_responses_question_1_wave_10_without_children.json"
question_one_responses = ["Don't do Anything Environmentally Friendly", "Do One or Two Things Environmentally Friendly", "Do Some Things Environmentally Friendly", "Do Many Things Environmentally Friendly", "Do Everything Environmentally Friendly"]

In [17]:
def calculate_mutual_information_value_h1(base_data_path, d1_file, d2_file, pot_responses):
    with open(base_data_path + d1_file) as f:
        d1 = json.load(f)
    with open(base_synthetic_responses_path + d2_file) as f:
        d2 = json.load(f)

    # convert data keys to lower case
    d1 = {k.lower(): v for k, v in d1.items()}

    d2 = {k.lower(): v for k, v in d2.items()}

    pot_responses = [response.lower() for response in pot_responses]

    for response in pot_responses:
        if response not in d1.keys():
            d1[response] = 0

        if response not in d2.keys():
            d2[response] = 0

    d1_list = [d1[key] for key in pot_responses]
    d2_list = [d2[key] for key in pot_responses]

    mutual_info = mutual_info_score(d1_list, d2_list)

    return mutual_info

In [18]:
for i in range(len(q1_with_children_json_paths)):
    print("Question 1 Wave ", wave_numbers[i])
    temp_score = calculate_mutual_information_value_h1(base_data_path, question_1_data_file, q1_with_children_json_paths[i], question_one_responses)
    print("Mutual Information Score: ", temp_score)
    print("\n")

Question 1 Wave  1
Mutual Information Score:  1.3321790402101223


Question 1 Wave  5
Mutual Information Score:  1.6094379124341005


Question 1 Wave  10
Mutual Information Score:  1.6094379124341005




In [19]:
q1_wave_10_mi = calculate_mutual_information_value_h1(base_synthetic_responses_path, q1_with_children_json_filepath, q1_without_children_json_filepath, question_one_responses)

In [20]:
print("Question 1 Wave 10 Mutual Information: ", q1_wave_10_mi)

Question 1 Wave 10 Mutual Information:  1.3321790402101223
