In [None]:
## Code to read Paid Parking Occupancy Data and extract unique SourceElementKey and Location

import pandas as pd
import concurrent.futures
from multiprocessing import cpu_count

# Settings shared by the chunked extraction code in this cell
CHUNK_SIZE = 5_000_000  # number of CSV rows read per chunk (5 million)
INPUT_FILE = 'PaidParkingOccupancyData_2015.csv'  # CSV that is read chunk by chunk
OUTPUT_FILE_TEMPLATE = 'unique_SourceElementKey_Location_chunk_{}.csv'  # {} is filled with the chunk id

def process_chunk(chunk_id, chunk):
    """Write the unique SourceElementKey/Location pairs of one chunk to CSV.

    Args:
        chunk_id: Identifier substituted into OUTPUT_FILE_TEMPLATE to build
            the output file name; also echoed in the progress message.
        chunk: DataFrame holding at least the 'SourceElementKey' and
            'Location' columns.

    Side effects:
        Writes one CSV file (without the index) and prints a progress line.
    """
    # Keep the first row seen for every SourceElementKey.
    deduplicated = chunk.drop_duplicates(subset=['SourceElementKey'])

    # Only the key and its location are carried forward.
    key_and_location = deduplicated[['SourceElementKey', 'Location']]

    destination = OUTPUT_FILE_TEMPLATE.format(chunk_id)
    key_and_location.to_csv(destination, index=False)
    print(f"Processed chunk {chunk_id}")

def divide_and_process_file(input_file):
    """Read a CSV in CHUNK_SIZE-row pieces and process each piece in turn.

    Chunks are numbered starting at 1 and handed, one after another, to
    process_chunk together with their number.

    Args:
        input_file: Path of the CSV file to read.
    """
    chunk_reader = pd.read_csv(input_file, chunksize=CHUNK_SIZE)
    for chunk_number, frame in enumerate(chunk_reader, start=1):
        process_chunk(chunk_number, frame)

if __name__ == "__main__":
    # The parent process streams chunks from disk while a single process pool
    # de-duplicates them and writes the per-chunk CSV files. No thread pool is
    # needed: nothing was ever submitted to one.

    # Cap the number of chunks that are queued or running at once. Every
    # submitted chunk stays alive in the parent until a worker has picked it
    # up, so an unbounded queue could hold most of the input file in memory.
    max_in_flight = cpu_count()

    with concurrent.futures.ProcessPoolExecutor() as process_executor:
        futures = set()
        chunk_reader = pd.read_csv(INPUT_FILE, chunksize=CHUNK_SIZE)

        for chunk_id, chunk in enumerate(chunk_reader, start=1):
            if len(futures) >= max_in_flight:
                # Block until at least one chunk is finished before reading
                # (and queueing) the next one.
                done, futures = concurrent.futures.wait(
                    futures, return_when=concurrent.futures.FIRST_COMPLETED
                )
                for finished in done:
                    # result() re-raises any exception raised in the worker,
                    # so a failed chunk is not silently skipped.
                    finished.result()

            futures.add(process_executor.submit(process_chunk, chunk_id, chunk))

        # Wait for all remaining futures to complete processing and surface
        # any worker errors.
        for finished in concurrent.futures.as_completed(futures):
            finished.result()


In [None]:
import pandas as pd
import os
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count

def read_and_unique(file_path):
    """Load one chunk CSV and keep the first row of every SourceElementKey.

    Args:
        file_path: Path of the CSV file to load.

    Returns:
        DataFrame with all columns of the file, restricted to the first
        occurrence of each 'SourceElementKey' value (original row labels kept).
    """
    frame = pd.read_csv(file_path)

    # The chunk files are expected to contain only the 'SourceElementKey' and
    # 'Location' columns; whatever columns are present are returned unchanged.
    is_repeat = frame.duplicated(subset=['SourceElementKey'])
    return frame[~is_repeat]

def combine_unique_files(file_paths, output_file):
    """Merge per-chunk CSV files into one CSV with unique SourceElementKey rows.

    Each file is loaded and de-duplicated in a worker process via
    read_and_unique; the results are concatenated in the order of
    ``file_paths`` and de-duplicated once more, so for every key the row from
    the earliest file that contains it is kept.

    Args:
        file_paths: Iterable of chunk CSV paths to combine.
        output_file: Path of the combined CSV to write (without the index).

    Raises:
        ValueError: If ``file_paths`` is empty.
    """
    file_paths = list(file_paths)
    if not file_paths:
        # Fail before starting a pool, with a message that names the real
        # problem instead of pandas' "No objects to concatenate".
        raise ValueError("file_paths is empty: there are no chunk files to combine")

    # Never start more workers than there are files. 61 is the documented
    # upper bound for ProcessPoolExecutor workers on Windows.
    worker_count = min(cpu_count(), len(file_paths), 61)

    # Process the files in parallel; executor.map yields results in input order.
    with ProcessPoolExecutor(max_workers=worker_count) as executor:
        dataframes = list(executor.map(read_and_unique, file_paths))

    combined_df = pd.concat(dataframes, ignore_index=True)

    # A key can appear in several chunk files, so drop duplicates again after
    # concatenation to ensure uniqueness across files.
    final_unique_df = combined_df.drop_duplicates(subset=['SourceElementKey'])
    final_unique_df.to_csv(output_file, index=False)

if __name__ == "__main__":
    # Name of the combined, de-duplicated output file
    output_file = 'UniqueSourceElementKey_Lat_Long_Dask.csv'

    # Chunk files are numbered 1 through 55
    chunk_numbers = range(1, 56)
    chunk_files = [f'unique_SourceElementKey_Location_chunk_{i}.csv' for i in chunk_numbers]

    # Merge every chunk file into the single output file
    combine_unique_files(file_paths=chunk_files, output_file=output_file)



In [None]:
import pandas as pd
import requests

def get_osm_place_class_and_type(lon, lat, timeout=10,
                                 user_agent='parking-occupancy-notebook/1.0'):
    """Reverse-geocode a coordinate with Nominatim and return its class/type.

    The lookup is best-effort: any failure (network error, timeout, non-200
    status, body that is not a JSON object) yields a pair of empty strings
    instead of raising, so one bad request does not abort a long batch run.

    Args:
        lon: Longitude sent as the 'lon' query parameter.
        lat: Latitude sent as the 'lat' query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block forever.
        user_agent: Value of the User-Agent header. The public Nominatim
            service asks clients to identify their application; replace the
            default with something that identifies your own project.

    Returns:
        Tuple ``(class, type)`` taken from the JSON response; a field that is
        missing from the response is returned as ''.
    """
    url = 'https://nominatim.openstreetmap.org/reverse'
    params = {'lat': lat, 'lon': lon, 'format': 'json'}
    headers = {'User-Agent': user_agent}

    try:
        response = requests.get(url, params=params, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Covers connection errors and timeouts.
        print(f"Reverse geocoding request failed for ({lon}, {lat}): {exc}")
        return '', ''

    if response.status_code != 200:
        return '', ''

    try:
        data = response.json()
    except ValueError:
        # The body was not valid JSON (for example an HTML error page).
        return '', ''

    if not isinstance(data, dict):
        return '', ''
    return data.get('class', ''), data.get('type', '')

def add_class_type_to_csv(input_filename, output_filename, min_request_interval=1.0):
    """Add 'lon', 'lat', 'class' and 'type' columns to a CSV of locations.

    The 'Location' column is parsed as ``POINT (<lon> <lat>)``. Each row with
    parseable coordinates is looked up through get_osm_place_class_and_type;
    rows whose Location does not match get '' for both 'class' and 'type'
    without any request being made.

    Args:
        input_filename: CSV to read; must contain a 'Location' column.
        output_filename: CSV to write (without the index).
        min_request_interval: Minimum number of seconds between the start of
            two consecutive lookups. The default of one second keeps the run
            within the request rate the public Nominatim service allows.
            Pass 0 or None to disable throttling.

    Side effects:
        Writes ``output_filename`` and prints one progress line per row plus a
        short summary.
    """
    import time  # local import so the block does not rely on the cell's import list

    # Read the input file
    data = pd.read_csv(input_filename)

    # Extract lon and lat from the 'Location' column (NaN where it does not match)
    data[['lon', 'lat']] = data['Location'].str.extract(r'POINT \(([^ ]+) ([^ ]+)\)')

    total_rows = len(data)
    place_classes = []
    place_types = []
    last_request_started = None

    for counter, (lon, lat) in enumerate(zip(data['lon'], data['lat']), start=1):
        print(f"Processing row {counter}/{total_rows}")

        if pd.isna(lon) or pd.isna(lat):
            # No usable coordinates: skip the request.
            place_class, place_type = '', ''
        else:
            if min_request_interval and last_request_started is not None:
                # Sleep only for whatever part of the interval has not
                # already elapsed during the previous request.
                remaining = min_request_interval - (time.monotonic() - last_request_started)
                if remaining > 0:
                    time.sleep(remaining)
            last_request_started = time.monotonic()
            place_class, place_type = get_osm_place_class_and_type(lon, lat)

        place_classes.append(place_class)
        place_types.append(place_type)

    # Building the columns from plain lists also works for an empty input.
    data['class'] = place_classes
    data['type'] = place_types

    # Write to the output file
    data.to_csv(output_filename, index=False)

    print("Finished processing rows.")
    print(f"Total rows modified: {total_rows}")


# Source: combined unique-key file; destination: same data enriched with OSM class/type
output_csv_file = 'api_UniqueSourceElementKey_Lat_Long_Dask.csv'
input_csv_file = "UniqueSourceElementKey_Lat_Long_Dask.csv"

# Run the enrichment
add_class_type_to_csv(input_filename=input_csv_file, output_filename=output_csv_file)