In [8]:
import json
from collections import Counter

In [9]:
# Folder prefixes for the two inputs of calculate_jaccard_index. The per-question
# file names defined in the following cells are appended to these with plain
# string concatenation (no os.path.join), so neither prefix ends with a separator.
# NOTE(review): these are absolute, machine-specific Windows paths; the notebook
# cannot run on another machine without editing them.
base_synthetic_responses_path = 'C:\\Users\\haoch\\Documents\\COMP0190\\Data\\COMP0191-MSc-Project-Code\\Synthetic-Responses-JSON'
base_data_path = 'C:\\Users\\haoch\\Documents\\COMP0190\\Data\\COMP0191-MSc-Project-Code\\Environmental-Views-Variables'

In [10]:
# Question 7 inputs: synthetic responses file and the "grimyn" wave 3 distribution.
# Each name starts with a backslash because calculate_jaccard_index joins it to
# its base folder with `+`.
question_7_json_file = "\\q7_synthetic_responses.json"
question_7_data_file = "\\grimyn\\grimyn Probability Distribution Wave 3.json"

In [11]:
# Question 8 inputs: synthetic responses file and the "orga3" wave 3 distribution.
# Each name starts with a backslash because calculate_jaccard_index joins it to
# its base folder with `+`.
question_8_json_file = "\\q8_synthetic_responses.json"
question_8_data_file = "\\orga3\\orga3 Probability Distribution Wave 3.json"

In [12]:
def _fold_answer_keys(weights):
    """Lower-case keys, shorten 'neither agree nor disagree' to 'neither', and sum values whose keys collide."""
    folded = {}
    for key, value in weights.items():
        label = key.lower()
        if label == "neither agree nor disagree":
            label = "neither"
        # Accumulate instead of overwriting so "Yes" and "yes" count together.
        folded[label] = folded.get(label, 0) + value
    return folded


def calculate_jaccard_index(base_data_path, data_file, base_synthetic_responses_path, json_file):
    """
    Calculates the weighted Jaccard Index between the base data and synthetic responses.

    The base probability distribution is scaled by the total number of synthetic
    responses so that both sides are expressed as counts. The index is then
    sum(min(expected, observed)) / (sum(expected) + sum(observed) - sum(min(expected, observed))).
    Answer labels are compared case-insensitively, and "neither agree nor disagree"
    is treated as "neither" on both sides.

    Parameters:
    - base_data_path (str): Folder prefix of the base data file.
    - data_file (str): Name of the base data file. It is appended to base_data_path
      by plain string concatenation, so it must carry its own leading separator.
      The file must hold a JSON object mapping answer label -> probability.
    - base_synthetic_responses_path (str): Folder prefix of the synthetic responses file.
    - json_file (str): Name of the synthetic responses file (same concatenation rule).
      The file must hold a JSON array of objects, each with a "Synthetic Responses"
      entry (presumably a list of answer labels; a label -> count object also works).
    Returns:
    - jaccard_index (float): The Jaccard Index between the base data and synthetic
      responses, or 0.0 when there are no synthetic responses to compare.
    """
    # JSON is UTF-8 by specification; do not rely on the platform default encoding.
    with open(base_data_path + data_file, encoding="utf-8") as f:
        data = json.load(f)
    with open(base_synthetic_responses_path + json_file, encoding="utf-8") as f:
        responses = json.load(f)

    # Aggregate synthetic responses across every record (not just the first one).
    synthetic_counts = Counter()
    for response in responses:
        synthetic_counts.update(response["Synthetic Responses"])

    synthetic_counts = _fold_answer_keys(synthetic_counts)
    total_responses = sum(synthetic_counts.values())

    # Scale the base proportions up to expected counts so both sides share a unit.
    expected_counts = {
        label: probability * total_responses
        for label, probability in _fold_answer_keys(data).items()
    }

    print(expected_counts)
    print(synthetic_counts)

    # Weighted Jaccard: overlap is the per-label minimum; labels present on
    # only one side contribute nothing to the intersection.
    intersection = sum(
        min(expected, synthetic_counts[label])
        for label, expected in expected_counts.items()
        if label in synthetic_counts
    )
    union = sum(expected_counts.values()) + total_responses - intersection

    # With no synthetic responses both sides are empty; avoid dividing by zero.
    if union == 0:
        return 0.0

    return intersection / union

In [13]:
# Jaccard index for question 7: "grimyn" wave 3 distribution vs. the q7 synthetic responses.
calculate_jaccard_index(
    base_data_path=base_data_path,
    data_file=question_7_data_file,
    base_synthetic_responses_path=base_synthetic_responses_path,
    json_file=question_7_json_file,
)

{'yes': 11.26165185345762, 'no': 88.73834814654238}
{'yes': 69, 'no': 31}


0.2679224953858121

In [14]:
# Jaccard index for question 8: "orga3" wave 3 distribution vs. the q8 synthetic responses.
calculate_jaccard_index(
    base_data_path=base_data_path,
    data_file=question_8_data_file,
    base_synthetic_responses_path=base_synthetic_responses_path,
    json_file=question_8_json_file,
)

{'mentioned': 1.747693399574166, 'not mentioned': 98.25230660042583}
{'not mentioned': 48, 'mentioned': 52}


0.3310943740242932