## One Year of Ride Request Data at OLA Bikes

Time Window: 2020-03-26 to 2021-03-26

## Number of Good Ride Requests: 3708329

In [2]:
# Install (or upgrade to) the latest ydata-profiling via a shell escape.
# NOTE(review): a later import of ydata_profiling in this notebook still raises
# ModuleNotFoundError, so this install presumably does not reach the Glue session's
# Python environment - confirm, and consider declaring the package as a session module.
!pip install --upgrade ydata-profiling

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.9 


In [4]:
# First, cleanly stop your current session
# (presumably so that the session configuration below applies to a fresh session - confirm).
%stop_session


There is no current session.


In [1]:
%start_session \
    --glue_version 3.0 \
    --idle_timeout 2880 \
    --worker_type G.1X \
    --number_of_workers 5 \
    --additional_python_modules "geopy==2.4.1,gpxpy==1.5.0" \
    --conf "{\"spark.pyspark.virtualenv.enabled\":\"true\"}"

UsageError: Line magic function `%start_session` not found.


Trying to create a Glue session for the kernel.
Session Type: etl
Session ID: 0484e41f-9732-4565-8d1e-941de006bac6
Applying the following default arguments:
--glue_kernel_version 1.0.9
--enable-glue-datacatalog true
Waiting for session 0484e41f-9732-4565-8d1e-941de006bac6 to get into ready status...
Session 0484e41f-9732-4565-8d1e-941de006bac6 has been created.
IndentationError: unexpected indent (<stdin>, line 1)


In [7]:
# Install gpxpy (used for haversine distances between cluster centres) via a shell escape.
# NOTE(review): `import gpxpy` still fails with ModuleNotFoundError in this notebook, so
# this install presumably does not reach the Glue session's Python environment - confirm.
!pip install gpxpy



In [2]:
import gpxpy
import gpxpy.geo
import geopy

ModuleNotFoundError: No module named 'gpxpy'


In [1]:

import numpy as np
from copy import deepcopy
from matplotlib import pyplot as plt
from sklearn.cluster import MiniBatchKMeans, KMeans
from datetime import datetime, timedelta
from joblib import dump, load
from ydata_profiling import ProfileReport  # Correct import for ydata-profiling

Welcome to the Glue Interactive Sessions Kernel
For more information on available magic commands, please type %help in any new cell.

Please view our Getting Started page to access the most up-to-date information on the Interactive Sessions kernel: https://docs.aws.amazon.com/glue/latest/dg/interactive-sessions.html
Installed kernel version: 1.0.9 
Trying to create a Glue session for the kernel.
Session Type: etl
Session ID: 4790117e-98d6-472f-a02f-a2f3df4517b1
Applying the following default arguments:
--glue_kernel_version 1.0.9
--enable-glue-datacatalog true
Waiting for session 4790117e-98d6-472f-a02f-a2f3df4517b1 to get into ready status...
Session 4790117e-98d6-472f-a02f-a2f3df4517b1 has been created.
ModuleNotFoundError: No module named 'ydata_profiling'


In [2]:
import boto3
import pandas as pd
import io

# Location of the cleaned, gzip-compressed ride-request CSV.
bucket_name = 'demand-prediction-ola-rides'        # Replace with your bucket
file_key = 'processed/clean_data.csv.gz'  # Your gzip file key

# S3 client; `s3` and `bucket_name` are used again by the upload cell later in the notebook.
s3 = boto3.client('s3')

# Fetch the object, pull its full body into an in-memory buffer,
# and let pandas decompress and parse it.
response = s3.get_object(Bucket=bucket_name, Key=file_key)
buffer = io.BytesIO(response['Body'].read())
df = pd.read_csv(buffer, compression='gzip')

# Preview
print(df.head())


     index                   ts  ...  booking_time_diff_min  geodesic_distance
0  2374378  2020-10-10 07:34:16  ...               26705254              17.40
1  2405894  2020-10-11 08:23:42  ...                   1489               4.08
2  2406076  2020-10-11 11:57:17  ...                    213               4.05
3  2500477  2020-10-16 17:51:07  ...                   7553               3.62
4  2694503  2020-10-30 09:00:44  ...                  19629               3.10

[5 rows x 18 columns]


In [None]:
# Number of rows in the loaded ride-request frame.
len(df)

# Geospatial Feature Engineering - Clustering/Segmentation
## Here, we divide the whole of India into regions using “K-Means Clustering”.

In [None]:
# Pickup coordinates as a two-column array (latitude, longitude); this is the
# clustering input read by makingRegions().
coord = df[["pick_lat", "pick_lng"]].to_numpy()
neighbors = list()


In [None]:
def min_distance(regionCenters, totalClusters, vicinity_miles=2):
    """Report how closely packed a set of cluster centres is.

    For each of the first ``totalClusters`` centres, the haversine distance to every
    other centre is computed (metres converted to miles) and classified as inside or
    outside the vicinity threshold. The averages of those per-centre counts and the
    smallest pairwise distance are printed.

    Parameters
    ----------
    regionCenters : sequence
        Cluster centres, indexed as ``regionCenters[i][0]`` (latitude) and
        ``regionCenters[i][1]`` (longitude).
    totalClusters : int
        Number of centres to compare.
    vicinity_miles : float, optional
        Distance threshold in miles; pairs strictly closer than this count as
        "within vicinity". Defaults to 2, the value previously hard-coded.

    Returns
    -------
    float
        The smallest pairwise distance in miles, or ``inf`` when there are fewer
        than two centres. (Earlier versions returned ``None``; callers that ignore
        the result are unaffected.)
    """
    meters_per_mile = 1.60934 * 1000
    near_counts = []
    far_counts = []
    # Separate name for the running minimum so the function's own name is not shadowed.
    closest = np.inf  # any big number can be given here
    for i in range(totalClusters):
        near = 0
        far = 0
        for j in range(totalClusters):
            if j == i:
                continue
            meters = gpxpy.geo.haversine_distance(
                latitude_1=regionCenters[i][0],
                longitude_1=regionCenters[i][1],
                latitude_2=regionCenters[j][0],
                longitude_2=regionCenters[j][1],
            )
            miles = meters / meters_per_mile  # distance from meters to miles
            closest = min(closest, miles)
            if miles < vicinity_miles:
                near += 1
            else:
                far += 1
        near_counts.append(near)
        far_counts.append(far)

    # With zero clusters there is nothing to average; avoid dividing by zero.
    avg_near = np.ceil(sum(near_counts) / len(near_counts)) if near_counts else 0.0
    avg_far = np.ceil(sum(far_counts) / len(far_counts)) if far_counts else 0.0

    print("On choosing a cluster size of {}".format(totalClusters))
    print("Avg. Number clusters within vicinity where inter cluster distance < {} miles is {}".format(vicinity_miles, avg_near))
    print("Avg. Number clusters outside of vicinity where inter cluster distance > {} miles is {}".format(vicinity_miles, avg_far))
    print("Minimum distance between any two clusters = {}".format(closest))
    print("-"*10)
    return closest
            
def makingRegions(noOfRegions):
    """Cluster the pickup coordinates into ``noOfRegions`` regions.

    Fits a MiniBatchKMeans model (batch size 10000, random_state 5) on the
    module-level ``coord`` array.

    Returns
    -------
    tuple
        ``(cluster_centers, number_of_centers)`` of the fitted model.
    """
    kmeans = MiniBatchKMeans(n_clusters=noOfRegions, batch_size=10000, random_state=5)
    kmeans.fit(coord)
    centers = kmeans.cluster_centers_
    return centers, len(centers)

In [None]:
# Try cluster counts 10, 20, ..., 90 and print the spacing statistics for each,
# timing the whole sweep.
startTime = datetime.now()
for region_count in range(10, 100, 10):
    regionCenters, totalClusters = makingRegions(region_count)
    min_distance(regionCenters, totalClusters)
elapsed = datetime.now() - startTime
print("Time taken = {}".format(elapsed))

### I want the minimum inter-cluster distance between any two clusters to be less than 0.5 miles. This condition is met when the number of clusters is 50, so we use 50 clusters.

The MiniBatchKMeans is a variant of the KMeans algorithm which uses mini-batches to reduce the computation time, while still attempting to optimise the same objective function. Mini-batches are subsets of the input data, randomly sampled in each training iteration. These mini-batches drastically reduce the amount of computation required to converge to a local solution. In contrast to other algorithms that reduce the convergence time of k-means, mini-batch k-means produces results that are generally only slightly worse than the standard algorithm.

In [None]:
# Fit the final 50-region model on the pickup coordinates and label every ride request.
# NOTE(review): the cluster-count search in makingRegions() uses random_state = 5 while this
# fit uses random_state = 0, so the centres evaluated there are not the ones fitted here - confirm
# which seed is intended.
coord = df[["pick_lat", "pick_lng"]].values
regions = MiniBatchKMeans(n_clusters = 50, batch_size = 10000, random_state = 0).fit(coord)
# Predict on the same plain array the model was fitted on; passing the DataFrame instead
# makes scikit-learn warn about feature names the model was never fitted with.
df["pickup_cluster"] = regions.predict(coord)

In [None]:
# Display df, which now carries the pickup_cluster column.
df

In [None]:
### Persist the fitted model that assigns a pickup cluster to a given latitude and longitude
# NOTE(review): the target is a relative path ('../data/'); confirm that directory exists
# wherever this kernel executes.
dump(regions, '../data/pickup_cluster_model.joblib', compress = 3)

In [None]:
### The pickup clusters show which areas most ride requests come from.
### Plot the regions inside Bangalore's bounding box (where most of our ride requests are).
#### Bangalore:'boundingbox': ['12.8340125', '13.1436649', '77.4601025', '77.7840515']
bangalore_latitude_range = (12.8340125, 13.1436649)
bangalore_longitude_range = (77.4601025, 77.7840515)

# Only the first 100000 rows are drawn.
plot_rows = df.iloc[:100000]

fig = plt.figure()
ax = fig.add_axes([0, 0, 1.5, 1.5])
ax.scatter(
    x=plot_rows["pick_lng"].values,
    y=plot_rows["pick_lat"].values,
    c=plot_rows["pickup_cluster"].values,
    cmap="Paired",
    s=5,
)
# Clip the view to the bounding box: longitude on the horizontal axis,
# latitude on the vertical axis.
ax.set_xlim(*bangalore_longitude_range)
ax.set_ylim(*bangalore_latitude_range)
ax.set(title="Regions in Bangalore", xlabel='Longitude', ylabel='Latitude')
plt.show()

In [None]:
# Preview the first rows of df before aggregating.
df.head()

## Aggregating Ride Request Counts into 30-Minute Intervals, Grouped by Pickup Cluster

In [None]:
def round_timestamp_30interval(x):
    """Floor a timestamp to the start of its 30-minute interval.

    Parameters
    ----------
    x : str or datetime
        A datetime, or a string in ``'%Y-%m-%d %H:%M:%S'`` format. Any ``str``
        subclass (for example ``numpy.str_``) is parsed as a string too.

    Returns
    -------
    datetime
        ``x`` with minutes rounded down to 0 or 30 and seconds/microseconds cleared.

    Raises
    ------
    ValueError
        If a string argument does not match the expected format.
    """
    # isinstance (rather than an exact type comparison) so str subclasses are parsed too.
    if isinstance(x, str):
        x = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    overshoot = timedelta(minutes=x.minute % 30, seconds=x.second, microseconds=x.microsecond)
    return x - overshoot

# Floor every timestamp to its 30-minute interval in one vectorised pass,
# avoiding a Python-level function call per row.
df['ts'] = pd.to_datetime(df['ts']).dt.floor('30min')


In [None]:

# Work on an independent deep copy so later reshaping does not touch df,
# and make sure the timestamp column has a datetime dtype.
dataset = df.copy(deep=True)
dataset["ts"] = pd.to_datetime(dataset["ts"])
dataset

In [None]:
# Keep only the interval timestamp, the column that will be counted, and the cluster id.
dataset = dataset.loc[:, ['ts', 'number', 'pickup_cluster']]

In [None]:
# One row per (30-minute interval, pickup cluster) holding the count of non-null
# `number` values in that group.
dataset = (
    dataset
    .groupby(['ts', 'pickup_cluster'])['number']
    .count()
    .rename('request_count')
    .reset_index()
)

In [None]:
# Display the aggregated frame (columns: ts, pickup_cluster, request_count).
dataset

### There should be: 366 days [2020-03-26 to 2021-03-26] * 48 [30-minute intervals per day (24*2)] * 50 regions = 878400 data rows

In [None]:
# ## Adding Dummy pickup cluster -1

# ## Change this Data based on your data
# l = [datetime(2020,3,26,00,00,00) + timedelta(minutes = 30*i) for i in range(0,48*365)]
# lt = []
# for x in l:
#     lt.append([x, -1, 0])
# temp = pd.DataFrame(lt, columns = ['ts','pickup_cluster','request_count'])
# dataset = dataset.append(temp,ignore_index=True)

In [None]:
## Adding Dummy pickup cluster -1
# The dummy cluster contributes one zero-count row for every 30-minute slot in the time
# window, so every slot exists in the index when the frame is reshaped.

## Change this Data based on your data
# 366 days (2020-03-26 through 2021-03-26 inclusive) * 48 slots per day. The earlier
# 48*365 stopped one day short of the window that the row-count check below expects.
slot_starts = pd.date_range(start=datetime(2020, 3, 26, 0, 0, 0), periods=48 * 366, freq='30min')
temp = pd.DataFrame({'ts': slot_starts, 'pickup_cluster': -1, 'request_count': 0})
dataset = pd.concat([dataset, temp], ignore_index=True)

In [None]:
# Reshape to a complete (interval x cluster) grid:
#   1. pivot clusters into columns, so a missing (ts, cluster) pair becomes NaN -> 0;
#   2. asfreq inserts any 30-minute slot absent from the index; fill_value=0 keeps those
#      rows as zero counts (left as NaN they would be dropped again by stack());
#   3. stack back to long form, ordered by cluster and then time.
data = (
    dataset
    .set_index(['ts', 'pickup_cluster'])
    .unstack()
    .fillna(value=0)
    .asfreq(freq='30Min', fill_value=0)
    .stack()
    .sort_index(level=1)
    .reset_index()
)

In [None]:
# Inspect the last five rows of the reshaped frame.
data.tail(5)

In [None]:
# Drop the rows of the dummy cluster (id -1); real cluster ids are non-negative.
is_real_cluster = data["pickup_cluster"] >= 0
data = data.loc[is_real_cluster]

In [None]:

# 366 days * 48 half-hour slots per day * 50 clusters = 878400 rows.
assert data.shape[0] == 366 * 48 * 50


## Adding Time Features
### hour, mins, day, dayofweek, quarter & month (to capture seasonality: winter, summer and rainy-season ride counts)
### During rainy weather, extreme winter or extreme summer months, ride requests vary with the weather

In [None]:
# Derive calendar features from the interval timestamp.
# assign() builds a new frame instead of writing columns into `data`, which at this point
# is a boolean-filtered slice (in-place column writes on it can raise SettingWithCopyWarning).
# Column order matches the keyword order below.
data = data.assign(
    mins=data['ts'].dt.minute,
    hour=data['ts'].dt.hour,
    day=data['ts'].dt.day,
    month=data['ts'].dt.month,
    dayofweek=data['ts'].dt.dayofweek,
    quarter=data['ts'].dt.quarter,
)

In [None]:
# Display the final frame including the time-feature columns.
data

In [None]:
import io
import boto3

# Relies on `df`, the `s3` client and `bucket_name` defined by the load cell earlier.
# NOTE(review): this uploads `df` (the per-request frame), not the aggregated `data`
# frame built above - confirm that is the intended output.

# Serialise df as gzip-compressed CSV entirely in memory.
buffer = io.BytesIO()
df.to_csv(buffer, index=False, compression='gzip')
buffer.seek(0)  # Reset pointer to start
gzip_payload = buffer.getvalue()

# Upload to S3
output_key = 'processed/preprocessed_11.csv.gz'
s3.put_object(Bucket=bucket_name, Key=output_key, Body=gzip_payload)

print("In-memory gzip-compressed CSV uploaded successfully!")


In [None]:
# Build the profiling report for the final frame with the ProfileReport class imported
# at the top of the notebook, then write it out as a standalone HTML file.
profile = ProfileReport(data, title='Ride Request DataSet Analysis')
profile.to_file("data_analysis_ride_request.html")

In [None]:
# profile