# Coffee Shop Visit Analysis

This notebook identifies which coffee shops listed in `turkiye_geneli_kahve_zinciri_subeleri.csv` were visited, using device mobility pings from `MobilityDataMay2024.parquet`. A ping is treated as a visit when it falls within 50 meters of a coffee shop.

**Steps:**

1. Load mobility data (first 5 million rows) and coffee shop venue data.
2. Convert both to GeoDataFrames.
3. Create a 50-meter buffer around each coffee shop.
4. Perform a spatial join to find mobility pings falling within these coffee shop buffers.
5. Process visit data, adding time-based features (date, hour, day of week).
6. Generate a summary table of visit frequencies (ping counts per device, coffee shop, date, and hour).
7. Save the detailed visit pings (`mobil_coffee.csv`) and the frequency summary (`mobil_coffee_visit_summary.csv`).

## 1. Setup and Library Imports

In [None]:
import pandas as pd
import geopandas
from shapely.geometry import Point
import pyarrow # Required for parquet
import os

print("Libraries imported.")

In [None]:
### Mount Google Drive (if running in Colab)
# Start from the local fallback (current directory); only a successful Colab
# mount switches the base path over to Google Drive.
google_drive_base_path = './'
try:
    from google.colab import drive
    drive.mount('/content/drive')
    print("Google Drive mounted.")
    # IMPORTANT: Adjust this path if your files are in a subfolder of MyDrive
    google_drive_base_path = '/content/drive/MyDrive/'
except ModuleNotFoundError:
    # google.colab is not importable, so the files are expected to be local.
    print("Not running in Colab, or google.colab module not found. Assuming files are local.")
except Exception as mount_error:
    # Any other failure (e.g. the mount itself) also falls back to local files.
    print(f"Error mounting Google Drive: {mount_error}")

## 2. Helper Function for Coordinate Cleaning

In [None]:
def clean_coordinates(coord_series):
    """Convert a Series of raw coordinate values to floats.

    Each value is converted to text, surrounding whitespace and quote
    characters are stripped, a decimal comma is replaced by a decimal
    point, and the result is parsed as a number.

    Args:
        coord_series: pandas Series of coordinate values (strings such as
            "'41,0082'" or already-numeric values).

    Returns:
        A numeric pandas Series with the same index. Values that cannot be
        parsed become NaN.
    """
    # Strip whitespace and quotes in a single pass: stripping only "'" leaves
    # the quotes in place when the value is padded with spaces (" '41,5' "),
    # which would then be coerced to NaN and silently dropped later.
    as_text = coord_series.astype(str).str.strip(" \t\r\n'\"")
    normalized = as_text.str.replace(',', '.', regex=False)
    return pd.to_numeric(normalized, errors='coerce')

print("Helper function defined.")

## 3. Load Mobility Data

In [None]:
print("Loading mobility data (MobilityDataMay2024.parquet)...")
mobility_data_filename = 'MobilityDataMay2024.parquet'
# A local run (base path './') looks for the data one directory above the
# working directory; otherwise the file sits directly under the base path.
if google_drive_base_path == './':
    mobility_data_path = os.path.join(google_drive_base_path, '..', mobility_data_filename)
else:
    mobility_data_path = os.path.join(google_drive_base_path, mobility_data_filename)

# Stays None unless the file loads AND contains every required column.
df_mobility = None
try:
    # The whole parquet file is read first; it is trimmed afterwards.
    df_mobility_full = pd.read_parquet(mobility_data_path)
    print(f"Full mobility data loaded. Shape: {df_mobility_full.shape}")

    # Keep at most the first 5 million rows.
    num_rows_to_sample = 5_000_000
    if len(df_mobility_full) <= num_rows_to_sample:
        df_mobility = df_mobility_full
        print(f"Full mobility data has {len(df_mobility_full)} rows (less than or equal to {num_rows_to_sample}), using all of it. Shape: {df_mobility.shape}")
    else:
        df_mobility = df_mobility_full.head(num_rows_to_sample)
        print(f"Using the first {num_rows_to_sample} rows of mobility data. New shape: {df_mobility.shape}")
    del df_mobility_full  # Drop the extra reference to the full frame

    # Basic validation: every column used by the later cells must be present.
    if any(required not in df_mobility.columns
           for required in ('latitude', 'longitude', 'device_aid', 'timestamp')):
        print("Error: Mobility data is missing one or more required columns: 'latitude', 'longitude', 'device_aid', 'timestamp'.")
        df_mobility = None  # Invalidate so downstream cells skip their work
    else:
        print("Required columns found in mobility data.")
        print(df_mobility.head())
except FileNotFoundError:
    print(f"Error: Mobility data file not found at {mobility_data_path}")
except Exception as load_error:
    print(f"Error loading mobility data: {load_error}")

## 4. Create Mobility GeoDataFrame

In [None]:
# Stays None when the mobility data is unavailable or the conversion fails.
gdf_mobility = None
if df_mobility is None:
    print("Skipping mobility GeoDataFrame creation as df_mobility was not loaded.")
else:
    print("Creating mobility GeoDataFrame...")
    try:
        # One point per ping, built as (x=longitude, y=latitude) in WGS84.
        gdf_mobility = geopandas.GeoDataFrame(
            df_mobility,
            geometry=geopandas.points_from_xy(
                df_mobility['longitude'],
                df_mobility['latitude'],
            ),
            crs="EPSG:4326",
        )
        print(f"Mobility GeoDataFrame created. Shape: {gdf_mobility.shape}, CRS: {gdf_mobility.crs}")
        print(gdf_mobility.head())
    except Exception as build_error:
        print(f"Error creating mobility GeoDataFrame: {build_error}")

## 5. Load Coffee Shop Venue Data

In [None]:
print("Loading coffee shop venue data (turkiye_geneli_kahve_zinciri_subeleri.csv)...")
venue_data_filename = 'turkiye_geneli_kahve_zinciri_subeleri.csv'
# A local run (base path './') looks for the data one directory above the
# working directory; otherwise the file sits directly under the base path.
if google_drive_base_path == './':
    venue_data_path = os.path.join(google_drive_base_path, '..', venue_data_filename)
else:
    venue_data_path = os.path.join(google_drive_base_path, venue_data_filename)

# Stays None unless the file loads AND contains every required column.
df_venues = None
try:
    # The venue file is semicolon-delimited.
    df_venues = pd.read_csv(venue_data_path, sep=';')
    print(f"Coffee shop venue data loaded. Shape: {df_venues.shape}")
    # Basic validation: coordinates and the venue name ('isim') are mandatory.
    if any(required not in df_venues.columns
           for required in ('latitude', 'longitude', 'isim')):
        print("Error: Coffee shop venue data is missing one or more required columns: 'latitude', 'longitude', 'isim'.")
        df_venues = None  # Invalidate so downstream cells skip their work
    else:
        print("Required columns found in coffee shop venue data.")
        print(df_venues.head())
except FileNotFoundError:
    print(f"Error: Coffee shop venue data file not found at {venue_data_path}")
except Exception as load_error:
    print(f"Error loading coffee shop venue data: {load_error}")

## 6. Clean Venue Coordinates and Create Venue GeoDataFrame

In [None]:
# Stays None when venues are unavailable, all coordinates are invalid, or
# the conversion fails.
gdf_venues = None
if df_venues is None:
    print("Skipping coffee shop venue GeoDataFrame creation as df_venues was not loaded.")
else:
    print("Cleaning coffee shop venue coordinates and creating venue GeoDataFrame...")
    try:
        # Parse the raw coordinate columns into numeric helper columns.
        df_venues['lat_cleaned'] = clean_coordinates(df_venues['latitude'])
        df_venues['lng_cleaned'] = clean_coordinates(df_venues['longitude'])

        # Discard venues whose latitude or longitude could not be parsed.
        original_venue_count = len(df_venues)
        df_venues.dropna(subset=['lat_cleaned', 'lng_cleaned'], inplace=True)
        print(f"Dropped {original_venue_count - len(df_venues)} coffee shops due to invalid coordinates.")

        if not df_venues.empty:
            # One point per venue, built as (x=longitude, y=latitude) in WGS84.
            gdf_venues = geopandas.GeoDataFrame(
                df_venues,
                geometry=geopandas.points_from_xy(
                    df_venues['lng_cleaned'],
                    df_venues['lat_cleaned'],
                ),
                crs="EPSG:4326",
            )
            print(f"Coffee shop venue GeoDataFrame created. Shape: {gdf_venues.shape}, CRS: {gdf_venues.crs}")
            print(gdf_venues.head())
        else:
            print("Error: No valid coffee shop venue coordinates after cleaning.")
    except Exception as build_error:
        print(f"Error creating coffee shop venue GeoDataFrame: {build_error}")

## 7. Spatial Analysis: Buffering and Joining

In [None]:
# Stays None unless every step below (project, buffer, project, join) succeeds.
gdf_visits = None
if gdf_venues is None or gdf_mobility is None:
    print("Skipping spatial analysis as one or both GeoDataFrames (mobility, venues) are missing.")
else:
    # Buffering needs a metric CRS; EPSG:32636 is UTM Zone 36N.
    # NOTE(review): a single UTM zone is used for all venues — confirm the
    # distortion is acceptable for locations far from this zone.
    projected_crs = "EPSG:32636"
    buffer_radius_meters = 50

    # Step 1: project the venue points into the metric CRS.
    print(f"Projecting coffee shop venue data to {projected_crs} for buffering...")
    try:
        gdf_venues_projected = gdf_venues.to_crs(projected_crs)
        print(f"Coffee shop venue data projected. CRS: {gdf_venues_projected.crs}")
    except Exception as projection_error:
        print(f"Error projecting coffee shop venue data: {projection_error}")
        gdf_venues_projected = None

    # Step 2: replace each venue point with a circular buffer polygon.
    gdf_venue_buffers = None
    if gdf_venues_projected is not None:
        print(f"Creating {buffer_radius_meters}m buffers around coffee shops...")
        try:
            # Try to repair invalid geometries with a zero-width buffer before
            # applying the real buffer; this may not fix everything.
            if (~gdf_venues_projected.geometry.is_valid).any():
                print("Warning: Some coffee shop venue geometries are invalid. Attempting to fix...")
                gdf_venues_projected.geometry = gdf_venues_projected.geometry.buffer(0)
                if (~gdf_venues_projected.geometry.is_valid).any():
                     print("Error: Could not fix all invalid coffee shop venue geometries. Proceeding with potentially problematic data.")

            # Work on a copy so the projected point layer keeps its points.
            gdf_venue_buffers = gdf_venues_projected.copy()
            gdf_venue_buffers['geometry'] = gdf_venues_projected.geometry.buffer(buffer_radius_meters)
            print(f"Coffee shop venue buffers created. Shape: {gdf_venue_buffers.shape}")
        except Exception as buffer_error:
            print(f"Error creating coffee shop venue buffers: {buffer_error}")
            gdf_venue_buffers = None

    # Step 3: project the pings into the same CRS (only if buffers exist).
    gdf_mobility_projected = None
    if gdf_venue_buffers is not None:
        print(f"Projecting mobility data to {projected_crs} for spatial join...")
        try:
            gdf_mobility_projected = gdf_mobility.to_crs(projected_crs)
            print(f"Mobility data projected. CRS: {gdf_mobility_projected.crs}")
        except Exception as projection_error:
            print(f"Error projecting mobility data: {projection_error}")
            gdf_mobility_projected = None

    # Step 4: keep only the pings that fall inside a venue buffer.
    if gdf_venue_buffers is not None and gdf_mobility_projected is not None:
        print("Performing spatial join (mobile pings within coffee shop venue buffers)...")
        try:
            # All venue columns take part in the join so venue attributes
            # travel with every matched ping.
            gdf_venue_buffers_for_join = gdf_venue_buffers.copy()
            print(f"Columns in gdf_venue_buffers_for_join before sjoin: {gdf_venue_buffers_for_join.columns.tolist()}")

            # Inner join: a ping is kept once per buffer it lies within.
            gdf_visits = geopandas.sjoin(
                gdf_mobility_projected,
                gdf_venue_buffers_for_join,
                how='inner',
                predicate='within',
            )
            print(f"Spatial join completed. Number of potential visit pings: {len(gdf_visits)}")
            if not gdf_visits.empty:
                print("Sample of joined visit data (first 5 rows):")
                print(gdf_visits.head())
            else:
                print("No visits found after spatial join.")
        except Exception as join_error:
            print(f"Error during spatial join: {join_error}")
            gdf_visits = None

## 8. Process Results, Add Time Features, and Save Output

In [None]:
if gdf_visits is None or gdf_visits.empty:
    print("No visit data to save (either gdf_visits is None or empty).")
else:
    print("Extracting relevant columns and adding time features for the final output...")

    df_visits_output = gdf_visits.copy()

    # Recover each ping's original coordinates from df_mobility by looking up
    # the joined rows' index labels.
    try:
        df_visits_output['original_latitude'] = df_mobility.loc[df_visits_output.index, 'latitude'].values
        df_visits_output['original_longitude'] = df_mobility.loc[df_visits_output.index, 'longitude'].values
    except Exception as lookup_error:
        if isinstance(lookup_error, KeyError):
            print(f"KeyError while trying to map original lat/lon: {lookup_error}. Original coordinates might be missing.")
        else:
            print(f"An unexpected error occurred while mapping original lat/lon: {lookup_error}")
        # Either way the output keeps the columns, filled with missing values.
        df_visits_output['original_latitude'] = pd.NA
        df_visits_output['original_longitude'] = pd.NA

    # Parse the timestamp and derive date / hour / weekday features from it.
    # NOTE(review): assumes 'timestamp' parses correctly with pd.to_datetime
    # defaults — confirm its unit and timezone against the source data.
    df_visits_output['timestamp'] = pd.to_datetime(df_visits_output['timestamp'])
    visit_times = df_visits_output['timestamp'].dt
    df_visits_output['date'] = visit_times.date
    df_visits_output['hour_of_day'] = visit_times.hour
    df_visits_output['day_of_week'] = visit_times.day_name()

    # Ping-side columns for the detailed log.
    mobility_cols_to_keep = ['device_aid', 'timestamp', 'original_latitude', 'original_longitude', 'date', 'hour_of_day', 'day_of_week']

    # Venue-side columns wanted from the coffee shop file.
    original_venue_column_names = ['isim', 'adres', 'google_maps_link']

    # A venue column may appear under its own name or with a "_right" suffix;
    # normalise to the plain name and record the ones that exist.
    processed_venue_columns_for_output = []
    for venue_col_original_name in original_venue_column_names:
        suffixed_name = f"{venue_col_original_name}_right"
        if (venue_col_original_name not in df_visits_output.columns
                and suffixed_name in df_visits_output.columns):
            df_visits_output.rename(columns={suffixed_name: venue_col_original_name}, inplace=True)
        if venue_col_original_name in df_visits_output.columns:
            processed_venue_columns_for_output.append(venue_col_original_name)

    final_output_columns_list = mobility_cols_to_keep + processed_venue_columns_for_output

    # Join artefacts and raw/cleaned venue coordinates never go to the output.
    columns_to_exclude_finally = ['geometry_right', 'index_right', 'lat_cleaned', 'lng_cleaned', 'latitude', 'longitude']

    # De-duplicate while preserving order, then keep only the columns that
    # exist and are not excluded.
    final_selected_columns = [
        candidate
        for candidate in dict.fromkeys(final_output_columns_list)
        if candidate in df_visits_output.columns
        and candidate not in columns_to_exclude_finally
    ]

    print(f"Final columns selected for detailed output: {final_selected_columns}")

    df_final_visits_log = df_visits_output[final_selected_columns]

    output_log_filename = 'mobil_coffee.csv'
    # A local run writes one directory above the working directory; otherwise
    # the file goes directly under the base path.
    if google_drive_base_path == './':
        output_log_path = os.path.join(google_drive_base_path, '..', output_log_filename)
    else:
        output_log_path = os.path.join(google_drive_base_path, output_log_filename)
    print(f"Saving detailed visit log to {output_log_path}...")
    try:
        df_final_visits_log.to_csv(output_log_path, index=False, sep=';')
        print(f"Successfully saved detailed visit log to {output_log_path}. Shape: {df_final_visits_log.shape}")
        print(df_final_visits_log.head())
    except Exception as save_error:
        print(f"Error saving detailed visit log: {save_error}")

    # Build and save the frequency summary: ping counts per device, venue
    # name, date and hour.
    print("Generating visit frequency summary...")
    try:
        if 'isim' not in df_visits_output.columns:
            print("Error: 'isim' column not found in df_visits_output. Cannot generate frequency summary.")
        else:
            ping_counts = df_visits_output.groupby(['device_aid', 'isim', 'date', 'hour_of_day']).size()
            df_visit_summary = ping_counts.reset_index(name='visit_ping_count')
            output_summary_filename = 'mobil_coffee_visit_summary.csv'
            # Same location rule as the detailed log.
            if google_drive_base_path == './':
                output_summary_path = os.path.join(google_drive_base_path, '..', output_summary_filename)
            else:
                output_summary_path = os.path.join(google_drive_base_path, output_summary_filename)
            print(f"Saving visit frequency summary to {output_summary_path}...")
            df_visit_summary.to_csv(output_summary_path, index=False, sep=';')
            print(f"Successfully saved visit frequency summary to {output_summary_path}. Shape: {df_visit_summary.shape}")
            print(df_visit_summary.head())
    except Exception as summary_error:
        print(f"Error generating or saving visit frequency summary: {summary_error}")

print("Coffee shop visit analysis script finished.")