In [1]:
!pip install folium==0.18.0



In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from pyspark.sql.functions import concat, to_timestamp, col, lit

In [3]:
# # Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import os

file_path = "/content/drive/My Drive/KSRTCJune2024.csv"
if os.path.exists(file_path):
    print("File exists.")
else:
    print("File does not exist.")


File does not exist.


In [5]:
# Import necessary libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum as spark_sum, col, hour, concat_ws, to_date, date_format



# Step 1: Initialize a Spark session
spark = SparkSession.builder \
    .appName("BigDataProcessing") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

# Step 2: Load your CSV file from Google Drive into a Spark DataFrame
file_path = "/content/drive/My Drive/June2024.csv"
data = spark.read.csv(file_path, header=True, inferSchema=True)


# Step 3: Perform operations on the DataFrame
data.show()  # Display the first few rows


+--------------------+-----------------+-------------------+--------------+---------------+-------------------+----------+--------+-----------+------------+---------+--------------------+--------------------+-----------+-----------+----------+------+---------------+---------------+-------+------------+
|          DEPOT_NAME|TICKET_ISSUE_DATE|  TICKET_ISSUE_TIME|WAYBILL_NUMBER|SCHEDULE_NUMBER|       SERVICE_TYPE|ROUTE_NAME|ROUTE_ID|TRIP_NUMBER|          ID|TICKET_NO|      FROM_STOP_NAME|        TO_STOP_NAME|NO_OF_ADULT|NO_OF_CHILD|NO_OF_LUGG|OTHERS|TOTAL_PASSENGER|DISTANCE_TRAVEL|TRIP_KM|PAYMENT_TYPE|
+--------------------+-----------------+-------------------+--------------+---------------+-------------------+----------+--------+-----------+------------+---------+--------------------+--------------------+-----------+-----------+----------+------+---------------+---------------+-------+------------+
|TRIVANDRUM CITY D...|       31/05/2024|2024-12-31 09:03:06|      14146419|  1C 3 Duty 6

In [6]:
import time
# Define the specific time range
start_time = "12:45:00"
end_time = "14:45:00"


In [7]:
from pyspark.sql.functions import to_timestamp, concat, col, lit, date_format, expr
# Filter rows where ROUTE_ID is 'acwXkRFM'
# data = data.filter(col("ROUTE_ID") == 'acwXkRFM')
# Step 2: Format TICKET_ISSUE_TIME as a string in "HH:mm:ss" format (if not already) and combine date and time

data = data.withColumn("TICKET_ISSUE_TIME_STR", date_format(col("TICKET_ISSUE_TIME"), "HH:mm:ss"))
# Filter rows within the specific time range
# Show data which has time other than start_time and end_time

data = data.filter((col("TICKET_ISSUE_TIME_STR") >= start_time) & (col("TICKET_ISSUE_TIME_STR") <= end_time))
# Calculate the total number of days in the dataset
total_days = data.select(to_date(col("TICKET_ISSUE_DATE")).alias("date")).distinct().count()



In [8]:
from pyspark.sql import functions as F
import folium
import pandas as pd
from geopy.geocoders import Nominatim
from folium.plugins import MarkerCluster

LIMIT_OF_TOP_BUS_STOPS = 600
MIN_AVG_THRESHOLD = 5
# Aggregate data to get total passenger count per bus stop within the time range
# Sort by total passengers in descending order and select the top 20
top_bus_stops = (
    data.groupBy("FROM_STOP_NAME")
    .agg(F.sum("TOTAL_PASSENGER").alias("TOTAL_PASSENGER"))
    .withColumn("AVERAGE_PASSENGER", F.col("TOTAL_PASSENGER") / total_days)
    .filter(F.col("AVERAGE_PASSENGER") >= MIN_AVG_THRESHOLD)
    .orderBy("TOTAL_PASSENGER", ascending=False)
    .limit(LIMIT_OF_TOP_BUS_STOPS)
    .collect()
)


In [9]:
# Function to save geocoded data to a JSON file
def save_geocoded_data(data):
    with open(GEO_CACHE_FILE, 'w') as f:
        json.dump(data, f, indent=4)

# Function to check if a location is in South India
def is_in_south_india(latitude, longitude):
    return (SOUTH_INDIA_LAT_MIN <= latitude <= SOUTH_INDIA_LAT_MAX) and (SOUTH_INDIA_LON_MIN <= longitude <= SOUTH_INDIA_LON_MAX)

def load_geocoded_failures():
    try:
        with open(FAILURE_CACHE_FILE, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return {}

In [10]:
import json
import time
from geopy.geocoders import Nominatim
import random
import sys

# Latitude and Longitude bounds for South India
SOUTH_INDIA_LAT_MIN = 8.0
SOUTH_INDIA_LAT_MAX = 14.5
SOUTH_INDIA_LON_MIN = 76.0
SOUTH_INDIA_LON_MAX = 85.0

# File to store previously geocoded bus stops
GEO_CACHE_FILE = '/content/drive/My Drive/geocoded_stops.json'
FAILURE_CACHE_FILE = '/content/drive/My Drive/geocoding_failures.json'

# Function to load previously cached geocoded data from a JSON file
def load_geocoded_data():
    try:
        with open(GEO_CACHE_FILE, 'r') as f:
            return json.load(f)
    except FileNotFoundError:
        return {}



# Prepare data for geocoding (replace 'top_bus_stops' with your actual data)
bus_stops_data = [{"stop_name": row["FROM_STOP_NAME"], "passenger_count": row["AVERAGE_PASSENGER"]} for row in top_bus_stops]

# Initialize the geocoder
geolocator = Nominatim(user_agent="bus_stop_locator")

# Load previously geocoded data from the cache
cached_data = load_geocoded_data()
cached_failure_data = load_geocoded_failures()

# Initialize counters for success and failure
success_count = 0
failure_count = 0
failures = []

# Function to print the progress bar
def print_progress_bar(iteration, total, bar_length=40):
    progress = iteration / total
    arrow = '=' * int(round(progress * bar_length) - 1)
    spaces = ' ' * (bar_length - len(arrow))
    percent = round(progress * 100, 1)
    sys.stdout.write(f'\r[{arrow}{spaces}] {percent}%')
    sys.stdout.flush()

# Geocode each bus stop with exponential backoff
for i, stop in enumerate(bus_stops_data):
    stop_name = stop["stop_name"]

    # Skip if the stop is in the failures list
    if stop_name in cached_data:
        # Use cached data
        stop["latitude"] = cached_data[stop_name]["latitude"]
        stop["longitude"] = cached_data[stop_name]["longitude"]
        success_count += 1  # Increment success count for cached data
        print_progress_bar(i + 1, len(bus_stops_data))  # Update progress bar for this stop
        continue  # Skip geocoding since it's already cached
    elif stop_name in cached_failure_data:
        # Skip geocoding if it previously failed
        failure_count += 1
        print_progress_bar(i + 1, len(bus_stops_data))  # Update progress bar for this stop
        continue

    # If not cached, geocode this stop
    retries = 0  # Counter for retry attempts
    while retries < 2:
        try:
            # Geocode the stop with a timeout
            location = geolocator.geocode(stop_name, timeout=20)

            if location:
                latitude = location.latitude
                longitude = location.longitude

                # Check if the coordinates are within South India's bounds
                if is_in_south_india(latitude, longitude):
                    stop["latitude"] = latitude
                    stop["longitude"] = longitude
                    # Save the geocoded result in the cache
                    cached_data[stop_name] = {"latitude": stop["latitude"], "longitude": stop["longitude"]}
                    success_count += 1  # Increment success count
                else:
                    # If outside South India, mark as None
                    stop["latitude"] = None
                    stop["longitude"] = None
                    failure_count += 1  # Increment failure count
                    failures.append(f"{stop_name} (outside South India)")
            else:
                stop["latitude"] = None
                stop["longitude"] = None
                failure_count += 1  # Increment failure count
                failures.append(stop_name)

            break  # Exit the retry loop on success
        except Exception as e:
            retries += 1
            print(f"Error geocoding {stop_name}: {e}")
            stop["latitude"] = None
            stop["longitude"] = None
            failures.append(stop_name)
            failure_count += 1  # Increment failure count

            # Exponential backoff
            backoff_time = min(2 ** retries + random.uniform(0, 1), 30)  # max backoff of 30 seconds
            print(f"Retrying {stop_name} in {backoff_time:.2f} seconds...")
            time.sleep(backoff_time)  # Sleep exponentially between retries

    # If retries are exhausted, skip to the next stop
    if retries == 2:
        print(f"Failed to geocode {stop_name} after {retries} retries.")

    # Update progress bar
    print_progress_bar(i + 1, len(bus_stops_data))

# Save the updated geocoded data to the cache file
save_geocoded_data(cached_data)

# Output the number of successes and failures
print(f"\nGeocoding Successes: {success_count}")
print(f"Geocoding Failures: {failure_count}")
if failures:
    print("Failed to geocode the following bus stops:")
    print(failures)

    # Function to save geocoding failures to a JSON file
    def save_geocoding_failures(failures):
        with open(FAILURE_CACHE_FILE, 'w') as f:
            json.dump(failures, f, indent=4)

    # Save the geocoding failures to the cache file
    save_geocoding_failures(failures)


Geocoding Successes: 361
Geocoding Failures: 212
Failed to geocode the following bus stops:
['East Fort North Bus Stand', 'Thampanoor Main Bus Stand', 'East South Fort Bus Stand', 'Pattom Sut Office', 'East Fort South Bus Stand', 'Statue Sbi Or Secretariat', 'Statue Sbi (outside South India)', 'World Market (outside South India)', 'Venjarammoodu Depot', 'Karamana Junction', 'Chakai (outside South India)', 'Vellayambalam Elankim Devi', 'Shangumukham Beach', 'Sree Karyam', 'Rotary (outside South India)', 'Kseb Chakkai', 'Shangumukham', 'Njadoorkonam', 'Radio Station Monvila', 'Valiyathura Shangumugam', '16th Mile (outside South India)', 'Chellamagalam Panjayath', 'Vizhinjam Bus Stand', 'Beema Pally', 'Crpf Gate', 'Mamam (outside South India)', 'Thrikkannapuram (outside South India)', 'Korani Junction', 'Technopark Front Gate', 'Venjarammoodu', 'Agricultural College Poonkulam', 'Mananthala (outside South India)', 'Sainik School (outside South India)', 'Veli Church (outside South India)', 

In [11]:
from folium.plugins import HeatMap
import folium
import pandas as pd
from folium import Icon
from folium.plugins import MarkerCluster
import numpy as np

# Filter out stops without coordinates
stops_with_coords = [stop for stop in bus_stops_data if 'latitude' in stop and 'longitude' in stop and stop["latitude"] is not None and stop["longitude"] is not None]

# Convert to Pandas DataFrame for easier handling with Folium
stops_df = pd.DataFrame(stops_with_coords)

# Initialize a Folium map centered around an average location
map_center = [8.4869, 76.9529]
m = folium.Map(location=map_center, tiles="CartoDB positron", zoom_start=13, min_zoom=8, max_zoom=18)

# Logarithmic transformation of passenger counts for better contrast in markers
stops_df['log_passenger_count'] = np.log1p(stops_df['passenger_count'])

# Prepare data for HeatMap using actual passenger counts for intensity
heat_data = []
for _, row in stops_df.iterrows():
    heat_data.append([row["latitude"], row["longitude"], row["passenger_count"]])  # Using actual count for heatmap

# Create the HeatMap layer with adjusted visual settings
HeatMap(
    heat_data,
    min_opacity=0.3,  # Set minimum opacity for better visibility (not too faint)
    max_opacity=0.7,  # Set maximum opacity for a more subtle heatmap
    radius=25,        # Adjust radius size to balance between clarity and overlap
    blur=18,          # Moderate blur to avoid excessive smoothing
    gradient={        # Reduced 5-color gradient scale for better distinction
        0.2: 'blue',   # Low density -> blue
        0.4: 'green',  # Medium-low density -> green
        0.6: 'yellow', # Medium-high density -> yellow
        0.8: 'orange', # High density -> orange
        1.0: 'red',    # Very high density -> red
    }
).add_to(m)

# Create a MarkerCluster for the stops (useful for closely spaced stops)
marker_cluster = MarkerCluster().add_to(m)

# Define color mapping for passenger counts (using log-transformed values for marker colors)
def get_marker_color(log_count):
    if log_count < 3.1:   # 0 to 20 passengers
        return 'blue'      # Low density -> blue
    elif log_count < 5.8:  # 21 to 500 passengers
        return 'green'     # Medium density -> green
    elif log_count < 6.9:  # 501 to 1000 passengers
        return 'orange'    # High density -> orange
    else:                  # 1000+ passengers
        return 'red'       # Very high density -> red


# Add popups and clustered markers for bus stops with their name and transformed passenger count
for _, row in stops_df.iterrows():
    # Get the color based on the transformed passenger count
    color = get_marker_color(row['log_passenger_count'])

    marker = folium.Marker(
        location=[row["latitude"], row["longitude"]],
        popup=f"<b>{row['stop_name']}</b><br>Passenger count: {row['passenger_count']}<br>Log Transformed: {row['log_passenger_count']:.2f}",
        tooltip=row["stop_name"],
        icon=Icon(color=color, icon="fa-users", prefix="fa"),  # Apply color dynamically
    )
    marker.add_to(marker_cluster)  # Add to MarkerCluster for better organization

# Create a legend HTML for color decoding (simplified and smaller)
legend_html = '''
    <div style="position: fixed;
                bottom: 50px; left: 50px; width: 240px; height: 160px;
                background-color: white; border: 2px solid grey; padding: 20px;
                z-index: 9999; font-size: 10px; border-radius: 8px;">
        <b>Passenger Density Legend</b><br>
        <i style="background: blue; width: 20px; height: 20px; display: inline-block;"></i> Low Density (0 - 20 passengers)<br>
        <i style="background: green; width: 20px; height: 20px; display: inline-block;"></i> Medium Density (21 - 500 passengers)<br>
        <i style="background: orange; width: 20px; height: 20px; display: inline-block;"></i> High Density (501 - 1000 passengers)<br>
        <i style="background: red; width: 20px; height: 20px; display: inline-block;"></i> Very High Density (1000+ passengers)
    </div>
'''

# Add the legend to the map
m.get_root().html.add_child(folium.Element(legend_html))




<branca.element.Element at 0x7a3be516a860>

In [12]:

# Save the map to Google Drive
save_path = '/content/drive/My Drive/passenger_boarding_density_map.html'
m.save(save_path)

print(f"Map saved successfully to {save_path}")

Map saved successfully to /content/drive/My Drive/passenger_boarding_density_map.html
