# Read in data

In [3]:
import json

def read_json(filename, num_objects=3):
    """
    Print the first few entries of a JSON file, one entry per line.

    Args:
        filename: Path of the JSON file to preview. Its top-level value is
            expected to be a list of objects.
        num_objects: Maximum number of leading entries to print. Fewer are
            printed when the file holds fewer entries.
    """
    # Load everything first so the file handle is released before printing.
    with open(filename, 'r') as handle:
        records = json.load(handle)

    # Never index past the end of the list.
    shown = min(num_objects, len(records))
    for position in range(shown):
        print(records[position])

# Preview the first five entries of 'Activities.json' (overriding the default of three)
read_json('Activities.json', 5)


{'input': 'Can you recommend some activities to do in Paris?', 'output': {'action': ['activities'], 'destination': 'PAR'}}
{'input': 'What are the best activities to try in Paris?', 'output': {'action': ['activities'], 'destination': 'PAR'}}
{'input': 'What are some fun activities to do in Paris?', 'output': {'action': ['activities'], 'destination': 'PAR'}}
{'input': 'What are the best activities to do in Tokyo?', 'output': {'action': ['activities'], 'destination': 'TYO'}}
{'input': 'What activities can I do in Paris?', 'output': {'action': ['activities'], 'destination': 'PAR'}}


# Adding diversity

The two cells below were run iteratively, substituting a different city each time, to ensure the integrity of our data set and to spread the examples sparsely across many locations.

In [13]:
import json
from collections import Counter

def count_destinations(filename):
    """
    Count how often each destination code appears in a JSON data file.

    The file is expected to hold a list of dictionaries shaped like
    ``{'input': ..., 'output': {'destination': ...}}``. Entries that are not
    dictionaries, that have no ``output`` dictionary, or whose ``output`` has
    no ``destination`` key are ignored instead of raising.

    Args:
        filename: Path of the JSON file to inspect.

    Returns:
        collections.Counter mapping each destination value to the number of
        entries that carry it.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(filename, 'r', encoding='utf-8') as file:
        data = json.load(file)

    destination_counts = Counter()
    for item in data:
        # A bare string would pass an `'output' in item` substring test and
        # then fail when indexed, so require real dictionaries at both levels.
        if not isinstance(item, dict):
            continue
        output = item.get('output')
        if isinstance(output, dict) and 'destination' in output:
            destination_counts[output['destination']] += 1
    return destination_counts

# Tally the destination codes stored in 'Modified_Modified_Activities.json'
# and print the resulting Counter.
destination_counts = count_destinations('Modified_Modified_Activities.json')
print(destination_counts)


Counter({'BCN': 9, 'SEL': 8, 'NYC': 8, 'CAI': 7, 'SHA': 6, 'LON': 6, 'SYD': 6, 'MEX': 5, 'BER': 5, 'BKK': 5, 'LAX': 4, 'SIN': 4, 'LIS': 3, 'PRG': 3, 'AMS': 3, 'RIO': 2, 'GRU': 2, 'HKG': 2, 'CPT': 2, 'BUD': 2, 'IST': 2, 'DXB': 1, 'CDG': 1, 'FLR': 1, 'BUE': 1, 'ROM': 1, 'MAD': 1})


In [12]:
import json
import random

def replace_destination(filename, source_code='PAR', seed=None):
    """
    Replace every entry for one destination with a randomly chosen other city.

    For each item whose ``output['destination']`` equals ``source_code`` a new
    city code is drawn at random (independently per item), stored as the new
    destination, and every mention of the source city in the ``input`` text
    (its name, its lower-cased name, or its code) is replaced by the new
    city's name. The result is written next to the input file as
    ``Modified_<basename>``; the input file itself is not changed.

    Args:
        filename: Path of a JSON file holding a list of
            ``{'input': str, 'output': {'destination': str, ...}}`` items.
        source_code: City code to replace. Must be a key of the built-in
            code-to-name table. Defaults to ``'PAR'`` (Paris).
        seed: Optional seed for reproducible substitutions. When ``None`` the
            module-level ``random`` generator is used.

    Raises:
        ValueError: If ``source_code`` is not in the code-to-name table.
    """
    # Imported locally because the surrounding cell only imports json/random.
    import os
    import re

    # Mapping of city codes to city names
    city_codes_to_names = {
        'NYC': 'New York',
        'PAR': 'Paris',
        'BCN': 'Barcelona',
        'SYD': 'Sydney',
        'LON': 'London',
        'LAX': 'Los Angeles',
        'FLR': 'Florence',
        'BER': 'Berlin',
        'MOW': 'Moscow',
        'BKK': 'Bangkok',
        'IST': 'Istanbul',
        # The data set labels Tokyo as 'TYO'; using the same key keeps the
        # labels consistent and lets Tokyo be excluded when it is the source.
        'TYO': 'Tokyo',
        'SIN': 'Singapore',
        'DXB': 'Dubai',
        'HKG': 'Hong Kong',
        'MAD': 'Madrid',
        'ROM': 'Rome',
        'CPT': 'Cape Town',
        'GRU': 'São Paulo',
        'BOM': 'Mumbai',
        'YTO': 'Toronto',
        'SEL': 'Seoul',
        'SHA': 'Shanghai',
        'MEX': 'Mexico City',
        'AMS': 'Amsterdam',
        'CAI': 'Cairo',
        'RIO': 'Rio de Janeiro',
        'LIS': 'Lisbon',
        'PRG': 'Prague',
        'BUD': 'Budapest',
        'BUE': 'Buenos Aires'
    }
    if source_code not in city_codes_to_names:
        raise ValueError(f"Unknown source city code: {source_code!r}")

    # Never pick the source city itself as the replacement.
    city_codes = [code for code in city_codes_to_names if code != source_code]

    # Match the city name, its lower-cased form, or the raw code in a single
    # pass so that text inserted by one replacement is never rewritten again.
    source_name = city_codes_to_names[source_code]
    variants = sorted(
        [source_name, source_name.lower(), source_code], key=len, reverse=True
    )
    source_pattern = re.compile('|'.join(re.escape(text) for text in variants))

    # A private generator gives reproducible output without touching the
    # global random state; without a seed the global generator is used.
    rng = random.Random(seed) if seed is not None else random

    # Read the JSON file
    with open(filename, 'r', encoding='utf-8') as file:
        data = json.load(file)

    for item in data:
        # Leave malformed entries untouched instead of raising KeyError.
        output = item.get('output') if isinstance(item, dict) else None
        if not isinstance(output, dict) or output.get('destination') != source_code:
            continue

        new_city_code = rng.choice(city_codes)  # independent draw per item
        new_city_name = city_codes_to_names[new_city_code]
        output['destination'] = new_city_code
        if isinstance(item.get('input'), str):
            # A callable replacement inserts the name literally (no escapes).
            item['input'] = source_pattern.sub(lambda _match: new_city_name, item['input'])

    # Prefix only the file name so paths that include a directory stay valid.
    output_path = os.path.join(
        os.path.dirname(filename), 'Modified_' + os.path.basename(filename)
    )
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

# Run the substitution on 'Modified_Activities.json'; the function prefixes the
# input name with 'Modified_', so the result lands in 'Modified_Modified_Activities.json'.
# NOTE(review): this cell shows In [12] but sits below the In [13] counting cell that
# reads that output file — it presumably has to run first on a fresh kernel; confirm the order.
replace_destination('Modified_Activities.json')

# Adding all fields required to the output part of the data.

In [1]:
import json
def read_json(filename, num_objects=3):
    """
    Show the leading entries of a JSON file by printing each one on its own line.

    Args:
        filename: Path of the JSON file to read. The top-level JSON value is
            expected to be a list of objects.
        num_objects: Upper bound on how many entries are printed; the whole
            list is printed when it is shorter than this.
    """
    # NOTE(review): same definition as the read_json in the first cell of this
    # notebook; it is repeated here so this section runs on its own.
    with open(filename, 'r') as source:
        entries = json.load(source)

    # Cap the count at the list length so indexing stays in range.
    limit = min(num_objects, len(entries))
    for index in range(limit):
        print(entries[index])

# Preview the first five entries of 'Activities.json'
read_json('Activities.json', 5)

{'input': 'Can you recommend some activities to do in Dubai?', 'output': {'action': ['activities'], 'destination': 'DXB'}}
{'input': 'What are the best activities to try in Barcelona?', 'output': {'action': ['activities'], 'destination': 'BCN'}}
{'input': 'What are some fun activities to do in Shanghai?', 'output': {'action': ['activities'], 'destination': 'SHA'}}
{'input': 'What are the best activities to do in Rio de Janeiro?', 'output': {'action': ['activities'], 'destination': 'RIO'}}
{'input': 'What activities can I do in Lisbon?', 'output': {'action': ['activities'], 'destination': 'LIS'}}


In [3]:
import json
from datetime import datetime

def is_valid_date(date_str):
    """
    Tell whether ``date_str`` is a string that parses with the '%Y-%m-%d' format.

    Args:
        date_str: Value to check; anything that is not a ``str`` is rejected.

    Returns:
        True when ``datetime.strptime`` accepts the string, otherwise False.
    """
    if isinstance(date_str, str):
        try:
            datetime.strptime(date_str, '%Y-%m-%d')
        except ValueError:
            # Wrong layout or an impossible calendar date.
            return False
        return True
    return False

def ensure_correct_type(value, expected_data_type):
    """
    Coerce ``value`` to ``expected_data_type`` where a sensible conversion exists.

    Conversion rules per expected type:
        int:  ints pass through; finite floats are truncated; a one-element
              list is unwrapped; strings holding an optionally signed run of
              digits are parsed. Anything else gives ``None``.
        str:  strings pass through; ints/floats are stringified; a one-element
              list is unwrapped. Anything else gives ``''``.
        bool: bools pass through; the string 'true' (any case) gives True.
              Anything else gives False.
        list: lists pass through. Anything else gives ``[]``.
        other types: ``value`` is returned unchanged.

    Args:
        value: The value to check or convert.
        expected_data_type: The target type object (``int``, ``str``,
            ``bool``, ``list`` or any other type).

    Returns:
        The converted value, or the per-type fallback listed above when the
        value cannot be converted. This function does not raise for
        unconvertible input.
    """
    if expected_data_type == int:
        if isinstance(value, int):
            return value
        if isinstance(value, float):
            try:
                return int(value)
            except (ValueError, OverflowError):
                # NaN and +/-infinity have no integer representation.
                return None
        if isinstance(value, list) and len(value) == 1:
            return ensure_correct_type(value[0], int)
        if isinstance(value, str):
            text = value.strip()
            # Allow one leading sign; str.isdigit() alone rejects '-5'.
            unsigned = text[1:] if text[:1] in ('+', '-') else text
            if unsigned.isdigit():
                try:
                    return int(text)
                except ValueError:
                    # isdigit() accepts characters such as '²' that int() rejects.
                    return None
        return None  # Return None for invalid int

    if expected_data_type == str:
        if isinstance(value, str):
            return value
        if isinstance(value, (int, float)):
            return str(value)
        if isinstance(value, list) and len(value) == 1:
            return ensure_correct_type(value[0], str)
        return ''

    if expected_data_type == bool:
        if isinstance(value, bool):
            return value
        # Only the literal word 'true' (case-insensitive) counts as True.
        return isinstance(value, str) and value.lower() == 'true'

    if expected_data_type == list:
        return value if isinstance(value, list) else []

    return value

def process_data(data):
    """
    Validate and normalise each entry of the data list.

    An entry is kept only when it is a dictionary whose ``output`` is a
    dictionary containing:
        - ``action``: exactly ``["activities"]`` (a bare string is first
          wrapped into a one-element list), and
        - ``destination``: a value that ``ensure_correct_type`` can turn into
          a non-empty string.
    A message describing each rejected entry is printed.

    The caller's objects are never modified: kept entries are returned as
    shallow copies carrying a fresh ``output`` dictionary with the fixed values.

    Args:
        data: Iterable of entries as loaded from the JSON file.

    Returns:
        List of the validated (and, where necessary, fixed) entries in their
        original order.
    """
    required_fields = {
        'action': list,         # Under 'output'
        'destination': str,     # Under 'output'
    }

    output_data = []
    for obj in data:
        # Non-dict entries (None, numbers, strings, ...) cannot hold an
        # 'output' mapping; a membership test on them may even raise.
        if not isinstance(obj, dict):
            print('Entry is not a dictionary:', json.dumps(obj, indent=4))
            continue

        # Check if 'output' is present
        if 'output' not in obj:
            print('Missing output:', json.dumps(obj, indent=4))
            continue

        output = obj['output']

        # Ensure 'output' is a dictionary
        if not isinstance(output, dict):
            print('Output is not a dictionary:', json.dumps(obj, indent=4))
            continue

        # Apply fixes to copies so the input (including entries that end up
        # rejected) is left exactly as the caller passed it in.
        cleaned_output = dict(output)
        cleaned = dict(obj)
        cleaned['output'] = cleaned_output

        # Check required fields under 'output'
        for field, expected_data_type in required_fields.items():
            if field not in cleaned_output:
                print(f"Missing required field '{field}':", json.dumps(obj, indent=4))
                break

            value = cleaned_output[field]
            if field == 'action':
                # A bare string action is accepted and wrapped into a list.
                if isinstance(value, str):
                    value = [value]
                # Ensure 'action' is exactly ["activities"]
                if value != ["activities"]:
                    print("Invalid 'action' field:", json.dumps(obj, indent=4))
                    break

            # Ensure correct data type; None / '' signal an unusable value.
            fixed_value = ensure_correct_type(value, expected_data_type)
            if fixed_value in [None, '']:
                print(f"Invalid '{field}' value:", json.dumps(obj, indent=4))
                break
            cleaned_output[field] = fixed_value
        else:
            # Reached only when no required field failed validation.
            output_data.append(cleaned)

    return output_data

def main(input_path='Activities.json', output_path='new_6.json'):
    """
    Clean a JSON data file and write the validated entries to a new file.

    Args:
        input_path: JSON file to read. Defaults to 'Activities.json'.
        output_path: JSON file to write the processed entries to.
            Defaults to 'new_6.json'.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Drop or fix entries that do not meet the required output schema.
    processed_data = process_data(data)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, indent=4)

    # Report how many entries survived validation.
    print(f"Number of entries in {output_path}: {len(processed_data)}")

# Run the cleaning step when this code is executed as the main program
# (the guard is skipped when the code is imported as a module).
if __name__ == '__main__':
    main()


Number of entries in new_6.json: 100
