In [None]:
#!pip install polars

### Process the CSV in chunks to avoid the memory error

In [1]:
import pandas as pd
import ast

In [2]:
# Load the CSV file in chunks to handle large data efficiently.
# Only the three columns the processing step reads are parsed; any other
# columns present in the file are skipped, which keeps each chunk smaller.
chunksize = 2500000
chunks = pd.read_csv(
    './csv/nrw_hai_extracted.csv',
    usecols=['AGG_DAY_PERIOD', 'BOUNDS', 'ACTIVITY_INDEX_TOTAL'],
    chunksize=chunksize,
)

# Initialize an empty DataFrame to hold the processed data
result_df = pd.DataFrame()

In [3]:
# Centroid (midpoint) of a bounding box
def calculate_centroid(bounds):
    """Return the midpoint of a bounding box as ``(lat, lon)``.

    Parameters
    ----------
    bounds : iterable of four numbers
        Unpacked in the order ``(lower_left_lon, lower_left_lat,
        upper_right_lon, upper_right_lat)``.

    Returns
    -------
    tuple
        ``(centroid_lat, centroid_lon)`` -- latitude first, i.e. the reverse
        of the lon/lat order used in ``bounds``.

    Raises
    ------
    ValueError
        If ``bounds`` does not unpack into exactly four values.
    """
    west, south, east, north = bounds
    return (south + north) / 2, (west + east) / 2

In [5]:
# Process each chunk.
#
# A (AGG_DAY_PERIOD, BOUNDS) group can have rows in more than one chunk, so
# taking the mean inside each chunk and stacking the results would leave
# several partial means per group. Instead, each chunk contributes a sum and a
# non-null count per group; these are added up after the last chunk and divided
# to obtain the mean over the whole file (one row per group).
#
# NOTE: `chunks` is a one-shot reader. Re-running this cell without re-creating
# it processes nothing and leaves `result_df` as it was.
group_keys = ['AGG_DAY_PERIOD', 'BOUNDS']
partial_totals = []  # per-chunk aggregates; concatenated once after the loop

for chunk in chunks:
    # Extract the required fields
    extracted_chunk = chunk[['AGG_DAY_PERIOD', 'BOUNDS', 'ACTIVITY_INDEX_TOTAL']].copy()

    # Safely parse each distinct BOUNDS string once and reuse the resulting
    # tuple for every row that carries the same string.
    parsed_bounds = {
        raw: tuple(ast.literal_eval(raw))
        for raw in extracted_chunk['BOUNDS'].unique()
    }
    extracted_chunk['BOUNDS'] = extracted_chunk['BOUNDS'].map(parsed_bounds.__getitem__)

    # Per-group sum and non-null count for this chunk. `sum` skips NaN and
    # `count` ignores NaN, so sum / count matches what `mean` would return.
    partial_totals.append(
        extracted_chunk
        .groupby(group_keys, as_index=False)
        .agg(
            ACTIVITY_SUM=('ACTIVITY_INDEX_TOTAL', 'sum'),
            ACTIVITY_COUNT=('ACTIVITY_INDEX_TOTAL', 'count'),
        )
    )

if partial_totals:
    # Merge the per-chunk aggregates: groups split across chunks collapse
    # into a single row here.
    totals = (
        pd.concat(partial_totals, ignore_index=True)
        .groupby(group_keys, as_index=False)[['ACTIVITY_SUM', 'ACTIVITY_COUNT']]
        .sum()
    )

    # The centroid depends only on BOUNDS, so it is computed once per final
    # group rather than once per input row.
    centroids = pd.DataFrame(
        [calculate_centroid(bounds) for bounds in totals['BOUNDS']],
        columns=['CENTROID_LAT', 'CENTROID_LON'],
        index=totals.index,
    )

    # A group whose values were all NaN has count 0 and yields NaN (0 / 0).
    result_df = pd.DataFrame({
        'AGG_DAY_PERIOD': totals['AGG_DAY_PERIOD'],
        'BOUNDS': totals['BOUNDS'],
        'CENTROID_LAT': centroids['CENTROID_LAT'],
        'CENTROID_LON': centroids['CENTROID_LON'],
        'MEAN_ACTIVITY_INDEX_TOTAL': totals['ACTIVITY_SUM'] / totals['ACTIVITY_COUNT'],
    })

In [6]:
# Write the aggregated frame to a new CSV file, omitting the index column
result_df.to_csv(path_or_buf='mean_activity_index_per_day_bounds.csv', index=False)

print("The CSV file has been created successfully.")

The CSV file has been created successfully.


In [10]:
# Number of distinct values in each column of result_df
result_df.nunique()

AGG_DAY_PERIOD                   347
BOUNDS                       3005863
CENTROID_LAT                    3072
CENTROID_LON                    2896
MEAN_ACTIVITY_INDEX_TOTAL     493526
dtype: int64

In [11]:
# Number of rows in result_df
len(result_df)

49657947