In [None]:
# Azure Blob Storage account and container that hold the input data.
storage_account_name = "xxxxxxxxxxxxx"
container_name = "cabsdatatransformed"

# Fetch the account key from a Databricks secret scope rather than embedding it in the
# notebook source, where it would be saved with the file and leak through version control.
# NOTE(review): the scope and key names are placeholders — point them at the real secret.
storage_account_key = dbutils.secrets.get(scope="cabs-analysis", key="storage-account-key")

# Make the key available to the wasbs:// filesystem driver for this Spark session.
spark.conf.set(
    f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net",
    storage_account_key,
)
input_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/"

# Read the Parquet data stored at the container root into a Spark DataFrame.
df = spark.read.format("parquet").load(input_path)

In [None]:
# Print the column names, data types and nullability of the loaded DataFrame.
df.printSchema()

root
 |-- distance: double (nullable = true)
 |-- cab_type: string (nullable = true)
 |-- time_stamp: timestamp (nullable = true)
 |-- destination: string (nullable = true)
 |-- source: string (nullable = true)
 |-- price: double (nullable = true)
 |-- surge_multiplier: double (nullable = true)
 |-- id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- date: string (nullable = true)
 |-- time: integer (nullable = true)
 |-- weekday: integer (nullable = true)
 |-- dollars_per_mile: double (nullable = true)
 |-- revenue: double (nullable = true)
 |-- merge_date: string (nullable = true)
 |-- temp: double (nullable = true)
 |-- location: string (nullable = true)
 |-- clouds: double (nullable = true)
 |-- pressure: double (nullable = true)
 |-- rain: double (nullable = true)
 |-- time_stamp_w: timestamp (nullable = true)
 |-- humidity: double (nullable = true)
 |-- wind: double (nullable = true)
 |-- date_w: string (nullable = true)

In [None]:
# Register the DataFrame as the temp view `cabs_table` so the %sql cells can query it by name.
df.createOrReplaceTempView("cabs_table")

# Total Revenue of Uber

In [None]:
%sql
-- Total revenue over all rows whose cab_type is Uber.
-- The literal uses single quotes: double quotes denote identifiers in standard SQL and
-- are parsed as such when double-quoted identifiers are enabled.
SELECT
  SUM(revenue)
FROM
  cabs_table
WHERE
  cab_type = 'Uber'

sum(revenue)
5176316.5


Databricks visualization. Run in Databricks to view.

# Total Revenue of Lyft

In [None]:
%sql
-- Total revenue over all rows whose cab_type is Lyft.
-- The literal uses single quotes: double quotes denote identifiers in standard SQL and
-- are parsed as such when double-quoted identifiers are enabled.
SELECT
  SUM(revenue)
FROM
  cabs_table
WHERE
  cab_type = 'Lyft'

sum(revenue)
5577636.23


Databricks visualization. Run in Databricks to view.

# Distribution of Rides of each cabs in the data

In [None]:
%sql
-- Row count (rides) for each cab company.
SELECT
  cab_type,
  COUNT(*)
FROM
  cabs_table
GROUP BY
  cab_type

cab_type,count(1)
Lyft,304637
Uber,327766


Databricks visualization. Run in Databricks to view.

# Cab prices as per the distance

In [None]:
%sql
-- Mean price for every (cab_type, distance) combination.
SELECT cab_type,
       distance,
       AVG(price) AS avg_price
FROM cabs_table
GROUP BY cab_type, distance

cab_type,distance,avg_price
Lyft,2.73,18.635757575757577
Lyft,2.13,17.078579117330463
Lyft,4.7,26.022727272727277
Lyft,5.22,24.91111111111111
Uber,3.21,19.07567567567568
Lyft,1.47,14.794388931591085
Lyft,2.64,19.26295585412668
Lyft,2.97,19.95057833859096
Uber,3.06,18.84298780487805
Uber,1.79,15.259218289085544


Databricks visualization. Run in Databricks to view.

#### Insights
- Lyft is cheaper than Uber for shorter distances but more expensive for longer distances

# At which time of day are cab services used the most

In [None]:
%sql
-- Ride count per cab company for each value of `time`
-- (`time` appears to be the hour of day, 0-23 — confirm against the upstream transform).
SELECT cab_type,
       time,
       count(*)
FROM cabs_table
GROUP BY cab_type, time

cab_type,time,count(1)
Lyft,4,9263
Lyft,17,13077
Uber,19,12218
Lyft,19,11115
Lyft,20,13280
Uber,11,12073
Lyft,7,6921
Uber,10,13171
Lyft,13,15200
Lyft,21,9365


Databricks visualization. Run in Databricks to view.

Count of rides made between 2018-11 and 2018-12, aggregated by hour of the day

In [None]:
%sql
-- Ride count per cab company and part of day.
-- `time` is first bucketed into four 6-hour bands, and the aggregation is then done on
-- the band itself, so every (cab_type, time_category) pair produces exactly one row.
-- Grouping on the raw `time` value instead emits one row per hour, each repeating the
-- band label, which is why a band showed up many times in the result.
WITH bucketed AS (
  SELECT
    cab_type,
    time,
    CASE
      WHEN time >= 0 AND time < 6 THEN 'Night'
      WHEN time >= 6 AND time < 12 THEN 'Morning'
      WHEN time >= 12 AND time < 18 THEN 'Afternoon'
      WHEN time >= 18 AND time < 24 THEN 'Evening'
      ELSE 'Invalid Time'
    END AS time_category
  FROM
    cabs_table
)
SELECT
  cab_type,
  time_category,
  count(*)
FROM
  bucketed
GROUP BY
  cab_type, time_category
-- Keep the bands in chronological order (earliest hour of each band first).
ORDER BY
  MIN(time) ASC, cab_type ASC

cab_type,time_category,count(1)
Lyft,Night,16967
Uber,Night,18324
Lyft,Night,9300
Uber,Night,9954
Uber,Night,17284
Lyft,Night,16140
Lyft,Night,11410
Uber,Night,12694
Lyft,Night,9263
Uber,Night,9828


Databricks visualization. Run in Databricks to view.

At 12 AM, public transport may not be available
- Most of the peak times are at 12 am, 2 am, 5 am, 1 pm, 3 pm and 6 pm

# Price for each cab based on the time of the day

In [None]:
%sql
-- Mean price per cab company for each value of `time`.
SELECT cab_type,
       time,
       AVG(price)
FROM cabs_table
GROUP BY cab_type, time

cab_type,time,avg(price)
Lyft,4,17.314406779661017
Lyft,17,17.382235986847135
Uber,19,15.839703715829105
Lyft,19,17.322575798470535
Lyft,20,17.376193524096387
Uber,11,15.661683094508406
Lyft,7,17.18393295766508
Uber,10,15.810834408928708
Lyft,13,17.299661184210525
Lyft,21,17.34383342231714


Databricks visualization. Run in Databricks to view.

# Price by service type offered by each cab company

In [None]:
%sql
-- Mean price for each (name, cab_type) pair, most expensive first.
SELECT name,
       AVG(price) AS price,
       cab_type
FROM cabs_table
GROUP BY name, cab_type
ORDER BY price DESC

name,price,cab_type
Lux Black XL,32.32087681644548,Lyft
Black SUV,30.28588302080281,Uber
Lux Black,23.05760580577821,Lyft
Black,20.520334950123548,Uber
Lux,17.766729648874193,Lyft
UberXL,15.675990846681922,Uber
Lyft XL,15.308588993320328,Lyft
UberX,9.764840517083533,Uber
WAV,9.764037335285504,Uber
Lyft,9.609680331099725,Lyft


Databricks visualization. Run in Databricks to view.

# Uber/Lyft who earned the maximum revenue in year 2018

In [None]:
%sql
-- Total revenue per cab company and calendar year, largest total first.
SELECT cab_type,
       YEAR(date) AS Year,
       SUM(revenue) AS Total_Revenue
FROM cabs_table
GROUP BY cab_type, YEAR(date)
ORDER BY Total_Revenue DESC

cab_type,Year,Total_Revenue
Lyft,2018,5577636.23
Uber,2018,5176316.5


Databricks visualization. Run in Databricks to view.

# Revenue based on month

In [None]:
%sql
-- Total revenue per cab company and calendar month.
-- The sum is computed in SQL so the result is one row per (cab_type, month) instead of
-- every raw row of the table, which a chart would otherwise have to aggregate itself.
-- The output column names (cab_type, month(DATE), revenue) are unchanged.
SELECT
  cab_type,
  MONTH(DATE),
  SUM(revenue) AS revenue
FROM
  cabs_table
GROUP BY
  cab_type, MONTH(DATE)
ORDER BY
  MONTH(DATE), cab_type

cab_type,month(DATE),revenue
Lyft,12,5.0
Lyft,11,11.0
Lyft,11,26.0
Lyft,11,9.0
Lyft,12,16.5
Lyft,11,10.5
Lyft,12,16.5
Lyft,12,3.0
Lyft,11,27.5
Lyft,11,13.5


Databricks visualization. Run in Databricks to view.

# Demand for cabs

In [None]:
%sql
SELECT 
  name,COUNT(*) as count, cab_type
FROM
  cabs_table
GROUP BY name,cab_type 
ORDER BY count DESC

name,count,cab_type
UberPool,54644,Uber
WAV,54640,Uber
Black,54635,Uber
UberXL,54625,Uber
UberX,54614,Uber
Black SUV,54608,Uber
Lux,50808,Lyft
Lux Black XL,50786,Lyft
Lux Black,50777,Lyft
Shared,50775,Lyft


# Cabs total ride count and revenue earned 

In [None]:
%sql
-- Ride count and total revenue for Uber.
-- Single-quoted literal: double quotes denote identifiers in standard SQL and are parsed
-- as such when double-quoted identifiers are enabled.
SELECT
  COUNT(*),
  SUM(revenue)
FROM
  cabs_table
WHERE
  cab_type = 'Uber'

count(1),sum(revenue)
327766,5176316.5


Databricks visualization. Run in Databricks to view.

In [None]:
%sql
-- Ride count and total revenue for Lyft.
-- Single-quoted literal: double quotes denote identifiers in standard SQL and are parsed
-- as such when double-quoted identifiers are enabled.
SELECT
  COUNT(*),
  SUM(revenue)
FROM
  cabs_table
WHERE
  cab_type = 'Lyft'

count(1),sum(revenue)
304637,5577636.23


Databricks visualization. Run in Databricks to view.

# Number of rides based on temperature for each cab

In [None]:
%sql
-- Ride count per cab company for each distinct `temp` reading.
SELECT cab_type,
       temp,
       COUNT(*)
FROM cabs_table
GROUP BY cab_type, temp

cab_type,temp,count(1)
Lyft,43.33,265
Lyft,32.67,167
Uber,39.28,416
Uber,45.86,389
Uber,46.02,373
Lyft,34.9,198
Uber,46.75,226
Uber,41.01,185
Lyft,41.33,195
Uber,28.28,211


Databricks visualization. Run in Databricks to view.

#### Insights
- More cab rides are requested by customers at temperatures between 35 and 45 degrees Fahrenheit

# Total avg price of Rides by temperature

In [None]:
%sql
-- Mean price per cab company for each distinct `temp` reading.
SELECT
  cab_type,
  temp,
  avg(price) AS avg_price
FROM
  cabs_table
GROUP BY
  cab_type, temp

cab_type,temp,avg_price
Lyft,43.33,20.50566037735849
Lyft,32.67,15.62874251497006
Uber,39.28,16.66466346153846
Uber,45.86,14.694087403598973
Uber,46.02,16.404825737265416
Lyft,34.9,15.181818181818182
Uber,46.75,14.736725663716816
Uber,41.01,18.548648648648648
Lyft,41.33,19.47435897435897
Uber,28.28,15.035545023696685


Databricks visualization. Run in Databricks to view.

In [None]:
%sql
-- Mean surge multiplier for each (humidity, cab_type) pair.
SELECT
  humidity,
  cab_type,
  AVG(surge_multiplier)
FROM
  cabs_table
GROUP BY
  humidity, cab_type

humidity,cab_type,avg(surge_multiplier)
0.84,Uber,1.0
0.73,Uber,1.0
0.55,Uber,1.0
0.75,Lyft,1.026722734671731
0.52,Lyft,1.0457559681697612
0.66,Uber,1.0
0.59,Lyft,1.035863219349458
0.74,Lyft,1.0360164773992429
0.88,Lyft,1.0275443510737627
0.85,Lyft,1.0286323681489142


Databricks visualization. Run in Databricks to view.

# Which day of the week has higher cab requirement

In [None]:
%sql
-- Ride count per day of week and cab company.
-- `weekday` is translated with 0 = Sunday ... 6 = Saturday; any other value yields NULL.
-- The sort is applied on the outer query because an ORDER BY inside a derived table is
-- not guaranteed to carry through to the final result; cab_type is added as a
-- tie-breaker so the two companies always appear in the same order within a day.
SELECT
  CASE weekday
    WHEN 0 THEN 'Sunday'
    WHEN 1 THEN 'Monday'
    WHEN 2 THEN 'Tuesday'
    WHEN 3 THEN 'Wednesday'
    WHEN 4 THEN 'Thursday'
    WHEN 5 THEN 'Friday'
    WHEN 6 THEN 'Saturday'
  END AS DAY,
  cab_type,
  count
FROM (
  SELECT weekday, cab_type, count(*) AS count
  FROM cabs_table
  GROUP BY weekday, cab_type
) AS _
ORDER BY
  weekday, cab_type

DAY,cab_type,count
Sunday,Uber,44596
Sunday,Lyft,41504
Monday,Lyft,56590
Monday,Uber,61114
Tuesday,Uber,57783
Tuesday,Lyft,53419
Wednesday,Lyft,27999
Wednesday,Uber,29906
Thursday,Uber,50151
Thursday,Lyft,46166


Databricks visualization. Run in Databricks to view.

#### Insight
- Mondays and Tuesdays have the highest cab demand

# Which Day of week the price is higher

In [None]:
%sql
-- Mean price per day of week and cab company.
-- `weekday` is translated with 0 = Sunday ... 6 = Saturday; any other value yields NULL.
-- The sort is applied on the outer query because an ORDER BY inside a derived table is
-- not guaranteed to carry through to the final result; cab_type is added as a
-- tie-breaker so the two companies always appear in the same order within a day.
SELECT
  CASE weekday
    WHEN 0 THEN 'Sunday'
    WHEN 1 THEN 'Monday'
    WHEN 2 THEN 'Tuesday'
    WHEN 3 THEN 'Wednesday'
    WHEN 4 THEN 'Thursday'
    WHEN 5 THEN 'Friday'
    WHEN 6 THEN 'Saturday'
  END AS DAY,
  cab_type,
  price
FROM (
  SELECT weekday, cab_type, AVG(price) AS price
  FROM cabs_table
  GROUP BY weekday, cab_type
) AS _
ORDER BY
  weekday, cab_type

DAY,cab_type,price
Sunday,Uber,15.811922593954614
Sunday,Lyft,17.36639841942945
Monday,Lyft,17.28782205336632
Monday,Uber,15.770699021500802
Tuesday,Uber,15.87377775470294
Tuesday,Lyft,17.33465246447893
Wednesday,Lyft,17.33953712632594
Wednesday,Uber,15.772436969170066
Thursday,Uber,15.797461665769376
Thursday,Lyft,17.398874279772993


Databricks visualization. Run in Databricks to view.

# Correlation between humidity and surge charge

In [None]:
%sql
-- Mean surge multiplier for each distinct humidity reading (both companies combined).
SELECT
  humidity,
  AVG(surge_multiplier) AS avg_surge_multiplier
FROM
  cabs_table
GROUP BY
  humidity


humidity,avg_surge_multiplier
0.66,1.0131813621669823
0.84,1.0122240121292525
0.87,1.0125380196012166
0.93,1.0149733857903265
0.89,1.0150460949730389
0.79,1.013494032452729
0.72,1.0152274035693725
0.7,1.015500298957548
0.54,1.0106543750944537
0.45,1.0118710134656272


Databricks visualization. Run in Databricks to view.

In [None]:
%sql
-- Mean surge multiplier for each distinct pressure reading (both companies combined).
SELECT
  pressure,
  AVG(surge_multiplier) AS avg_surge_multiplier
FROM
  cabs_table
GROUP BY
  pressure

pressure,avg_surge_multiplier
1002.86,1.0055668016194332
1030.21,1.0068376068376068
1034.02,1.011414503133393
1021.56,1.023117569352708
1014.17,1.0182038834951457
1001.79,1.0
1015.52,1.0187209620114783
1027.94,1.025
1022.9,1.0023056653491436
1022.18,1.008309591642925


Databricks visualization. Run in Databricks to view.

In [None]:
%sql
-- Mean surge multiplier for each (clouds, cab_type) pair.
SELECT
  clouds,
  cab_type,
  AVG(surge_multiplier) AS avg_surge_multiplier
FROM
  cabs_table
GROUP BY
  clouds, cab_type


clouds,cab_type,avg_surge_multiplier
0.27,Lyft,1.0334239130434784
0.43,Uber,1.0
0.33,Lyft,1.0332512315270936
0.84,Uber,1.0
1.0,Lyft,1.0306448540771624
0.19,Uber,1.0
0.02,Lyft,1.0490837696335078
0.55,Uber,1.0
0.73,Uber,1.0
0.25,Uber,1.0


Databricks visualization. Run in Databricks to view.

# Correlation between time and surge

In [None]:
%sql
-- Mean surge multiplier for each value of `time`.
SELECT
  time,
  AVG(surge_multiplier)
FROM
  cabs_table
GROUP BY
  time

time,avg(surge_multiplier)
12,1.0162609146070742
22,1.0138093733959803
1,1.0156071465669472
13,1.0158625405508557
6,1.0142909828508206
16,1.0145530696638745
3,1.014551526717557
20,1.0167598278968852
5,1.0150667714061272
19,1.0145180645437792


Databricks visualization. Run in Databricks to view.

# Which locations have the most customers booking cabs

In [None]:
%sql
-- Ride count per pickup location (`source`).
SELECT
  source,
  count(*)
FROM
  cabs_table
GROUP BY
  source;

source,count(1)
Financial District,53737
Northeastern University,52618
North End,52709
Boston University,52737
North Station,52136
Back Bay,52786
Theatre District,52637
South Station,52618
Fenway,52740
Haymarket Square,52719


Databricks visualization. Run in Databricks to view.

Financial District has the highest number of bookings

# Which destinations are cabs booked to the most

In [None]:
%sql
-- Ride count per drop-off location (`destination`).
SELECT
  destination,
  count(*)
FROM
  cabs_table
GROUP BY
  destination

destination,count(1)
Financial District,53766
Northeastern University,52680
North End,52733
Boston University,52744
North Station,52122
Back Bay,52705
Theatre District,52566
South Station,52672
Fenway,52682
Haymarket Square,52751


Databricks visualization. Run in Databricks to view.

# Which route is most often chosen by customers for rides

In [None]:
%sql
-- Ride count per (source, destination) route, busiest route first.
SELECT source,
       destination,
       COUNT(*) AS rides
FROM cabs_table
GROUP BY source, destination
ORDER BY rides DESC

source,destination,rides
South Station,Financial District,9463
Financial District,South Station,9432
Back Bay,North End,9325
North End,Back Bay,9309
Fenway,West End,9296
West End,Fenway,9278
Haymarket Square,Financial District,9269
Financial District,Haymarket Square,9268
North End,Beacon Hill,9201
Beacon Hill,North End,9182


Databricks visualization. Run in Databricks to view.

In [None]:
%sql
-- Ride count per (source, destination) route, in no particular order.
SELECT source,
       destination,
       COUNT(*) AS rides
FROM cabs_table
GROUP BY source, destination


source,destination,rides
Fenway,Financial District,8864
Northeastern University,Beacon Hill,8825
Theatre District,Haymarket Square,8815
Back Bay,North End,9325
Northeastern University,Financial District,8776
North Station,North End,8599
Haymarket Square,West End,8393
Haymarket Square,North Station,8810
Back Bay,South Station,8639
Fenway,Theatre District,8418


Databricks visualization. Run in Databricks to view.

In [None]:
%sql
-- Mean price per (source, destination) route, most expensive route first.
SELECT source,
       destination,
       AVG(price) AS price
FROM cabs_table
GROUP BY source, destination
ORDER BY price DESC

source,destination,price
Financial District,Boston University,25.505817894261185
Boston University,Financial District,24.127369133574007
Fenway,Financial District,23.42509025270758
Financial District,Fenway,23.408495994584225
Northeastern University,Financial District,22.58859389243391
Financial District,Northeastern University,21.916296802094003
Theatre District,Boston University,20.36365488382337
Boston University,North Station,20.1751356969627
Northeastern University,North Station,19.911237230419975
Fenway,North Station,19.6998987284798


Databricks visualization. Run in Databricks to view.

In [None]:
%sql
-- Mean price per (source, destination) route, highest first.
-- NOTE(review): the result column is aliased `rides` although it holds AVG(price), not a
-- ride count. The alias is left untouched so anything keyed on that column name keeps
-- working — consider renaming it.
SELECT source,
       destination,
       AVG(price) AS rides
FROM cabs_table
GROUP BY source, destination
ORDER BY rides DESC

source,destination,rides
Financial District,Boston University,25.505817894261185
Boston University,Financial District,24.127369133574007
Fenway,Financial District,23.42509025270758
Financial District,Fenway,23.408495994584225
Northeastern University,Financial District,22.58859389243391
Financial District,Northeastern University,21.916296802094003
Theatre District,Boston University,20.36365488382337
Boston University,North Station,20.1751356969627
Northeastern University,North Station,19.911237230419975
Fenway,North Station,19.6998987284798


Databricks visualization. Run in Databricks to view.

# Relation between distance and surge_multiplier

In [None]:
%sql
-- Raw per-row (cab_type, distance, surge_multiplier) values; no aggregation is applied.
SELECT cab_type,
       distance,
       surge_multiplier
FROM cabs_table

cab_type,distance,surge_multiplier
Lyft,0.44,1.0
Lyft,0.44,1.0
Lyft,0.44,1.0
Lyft,0.44,1.0
Lyft,0.44,1.0
Lyft,1.08,1.0
Lyft,1.08,1.0
Lyft,1.08,1.0
Lyft,1.08,1.0
Lyft,1.08,1.0


Databricks visualization. Run in Databricks to view.

# Trend based on the date

In [None]:
%sql
-- Ride count per date and cab company, in date order.
SELECT
  cab_type,
  date,
  COUNT(*)
FROM
  cabs_table
GROUP BY
  date, cab_type
ORDER BY
  date

cab_type,date,count(1)
Uber,2018-11-26,14248
Lyft,2018-11-26,13264
Uber,2018-11-27,37906
Lyft,2018-11-27,35120
Uber,2018-11-28,29906
Lyft,2018-11-28,27999
Uber,2018-11-29,31796
Lyft,2018-11-29,29314
Lyft,2018-11-30,20804
Uber,2018-11-30,22471


Databricks visualization. Run in Databricks to view.

# Revenue Earned in these days

In [None]:
%sql
-- Mean revenue per row for each date and cab company, in date order.
SELECT
  cab_type,
  date,
  AVG(revenue)
FROM
  cabs_table
GROUP BY
  date, cab_type
ORDER BY
  date

cab_type,date,avg(revenue)
Uber,2018-11-26,15.806464065131948
Lyft,2018-11-26,18.176144074185768
Uber,2018-11-27,15.891507940695403
Lyft,2018-11-27,18.317398917995444
Uber,2018-11-28,15.772436969170066
Lyft,2018-11-28,18.335063752276863
Uber,2018-11-29,15.817508491634168
Lyft,2018-11-29,18.30595875690796
Lyft,2018-11-30,18.190166554508743
Uber,2018-11-30,15.683814694495126


Databricks visualization. Run in Databricks to view.

# Which cab service is top of the line

In [None]:
%sql
-- Totals per cab company plus a rank.
-- The rank is derived from the mean revenue per row (AVG), while the rows themselves are
-- sorted by the rounded total revenue.
SELECT
  cab_type,
  ROUND(SUM(Revenue), 3) AS total_revenue,
  COUNT(*) AS total_count,
  DENSE_RANK() OVER (ORDER BY AVG(Revenue) DESC) AS rank
FROM
  cabs_table
GROUP BY
  cab_type
ORDER BY
  total_revenue DESC;


cab_type,total_revenue,total_count,rank
Lyft,5577636.23,304637,1
Uber,5176316.5,327766,2


Databricks visualization. Run in Databricks to view.

Location-based ranking: which cab is popular in which location

Add date and quarter

# Which cab is popular in each location

In [None]:
%sql
-- Ride count per (location, cab_type), ranked within each location (1 = most rides).
-- The window function is evaluated over the grouped counts, so the aggregation and the
-- ranking are expressed in a single query.
SELECT
  location,
  cab_type,
  COUNT(*) AS count,
  RANK() OVER (PARTITION BY location ORDER BY COUNT(*) DESC) AS rank
FROM
  cabs_table
GROUP BY
  location, cab_type
ORDER BY
  location, rank;


location,cab_type,count,rank
Back Bay,Uber,27348,1
Back Bay,Lyft,25438,2
Beacon Hill,Uber,27139,1
Beacon Hill,Lyft,25228,2
Boston University,Uber,27340,1
Boston University,Lyft,25397,2
Fenway,Uber,27319,1
Fenway,Lyft,25421,2
Financial District,Uber,27732,1
Financial District,Lyft,26005,2


Databricks visualization. Run in Databricks to view.

#### Insight
- Uber is the more popular cab in every location

# Which cab is popular in each route

In [None]:
%sql
-- For every (source, destination) route, the cab company with the most rides.
-- Companies are ranked within each route by ride count; only rank 1 is kept, so a tie
-- returns both companies for that route.
SELECT
  source,
  destination,
  rides,
  cab_type
FROM (
  SELECT
    source,
    destination,
    COUNT(*) AS rides,
    DENSE_RANK() OVER (PARTITION BY source, destination ORDER BY COUNT(*) DESC) AS rank,
    cab_type
  FROM
    cabs_table
  GROUP BY
    cab_type, source, destination
) AS RoutePopularity
WHERE
  rank = 1
ORDER BY
  source, destination;


source,destination,rides,cab_type
Back Bay,Boston University,4497,Uber
Back Bay,Fenway,4483,Uber
Back Bay,Haymarket Square,4563,Uber
Back Bay,North End,4832,Uber
Back Bay,Northeastern University,4533,Uber
Back Bay,South Station,4440,Uber
Beacon Hill,Boston University,4373,Uber
Beacon Hill,Fenway,4432,Uber
Beacon Hill,Haymarket Square,4544,Uber
Beacon Hill,North End,4776,Uber


# Growth Rate of Each Cab

In [None]:
%sql
-- Month-over-month change, per cab company, of the mean revenue per row.
--   monthly_counts     : AVG(revenue) per (year, month, cab_type)
--   growth_calculation : adds the previous month's value for the same company via LAG
-- For a company's first month there is no previous value; both previous_revenue and
-- growth_percentage are reported as 0 in that case.
WITH monthly_counts AS (
  SELECT YEAR(date) AS year,
         MONTH(date) AS month,
         cab_type,
         AVG(revenue) AS revenue
  FROM cabs_table
  GROUP BY year, month, cab_type
),
growth_calculation AS (
  SELECT year,
         month,
         cab_type,
         revenue,
         LAG(revenue) OVER (PARTITION BY cab_type ORDER BY year, month) AS previous_revenue
  FROM monthly_counts
)
SELECT year,
       month,
       cab_type,
       revenue,
       COALESCE(previous_revenue, 0) AS previous_revenue,
       CASE
         WHEN previous_revenue IS NOT NULL
           THEN (revenue - previous_revenue) * 100.0 / previous_revenue
         ELSE 0
       END AS growth_percentage
FROM growth_calculation
ORDER BY year, month, cab_type;


year,month,cab_type,revenue,previous_revenue,growth_percentage
2018,11,Lyft,18.28292246701607,0.0,0.0
2018,11,Uber,15.805005611507625,0.0,0.0
2018,12,Lyft,18.327728561323934,18.28292246701607,0.2450707450556456
2018,12,Uber,15.783970350868945,15.805005611507625,-0.1330923958885792


Databricks visualization. Run in Databricks to view.

Databricks visualization. Run in Databricks to view.

# Which service type of each company has more rides

In [None]:
%sql
-- Ride count for each (cab_type, name) pair, fewest rides first.
SELECT cab_type,
       name,
       count(*) AS rides
FROM cabs_table
GROUP BY name, cab_type
ORDER BY rides


cab_type,name,rides
Lyft,Lyft,50740
Lyft,Lyft XL,50751
Lyft,Shared,50775
Lyft,Lux Black,50777
Lyft,Lux Black XL,50786
Lyft,Lux,50808
Uber,Black SUV,54608
Uber,UberX,54614
Uber,UberXL,54625
Uber,Black,54635


Databricks visualization. Run in Databricks to view.

# surge correlation with different weather conditions

In [None]:
%sql
-- Correlation (CORR) between the surge multiplier and each weather measure,
-- computed over Lyft rows only.
-- The cab_type literal uses single quotes: double quotes denote identifiers in standard
-- SQL and are parsed as such when double-quoted identifiers are enabled.
SELECT
  CORR(rain, surge_multiplier) AS rain_surge,
  CORR(pressure, surge_multiplier) AS pressure_surge,
  CORR(clouds, surge_multiplier) AS clouds_surge,
  CORR(humidity, surge_multiplier) AS humidity_surge
FROM
  cabs_table
WHERE
  cab_type = 'Lyft';

rain_surge,pressure_surge,clouds_surge,humidity_surge
-0.0006253499599457358,-0.0039943639259205,-0.0021747444254609,-0.0011781680548037


# Rain and surge correlation

In [None]:
%sql
SELECT 
  rain, 
  ROUND(AVG(surge_multiplier),3) as avg_surge_multiplier 
FROM 
  cabs_table 
WHERE 
  cab_type = "Lyft" 
GROUP BY rain

rain,avg_surge_multiplier
0.0124,1.003
0.0287,1.006
0.0273,1.019
0.032,1.015
0.0331,1.107
0.0,1.031
0.0322,1.018
0.0052,1.033
0.0093,1.055
0.241,1.043


Databricks visualization. Run in Databricks to view.

# Humidity and surge correlation

In [None]:
%sql
SELECT 
  humidity, 
  ROUND(AVG(surge_multiplier),3) as avg_surge_multiplier
FROM 
  cabs_table 
WHERE 
  cab_type = "Lyft" 
GROUP BY humidity

humidity,avg_surge_multiplier
0.66,1.027
0.84,1.025
0.87,1.026
0.93,1.031
0.89,1.031
0.79,1.028
0.72,1.032
0.7,1.032
0.54,1.022
0.45,1.024


Databricks visualization. Run in Databricks to view.

# Pressure and surge correlation

In [None]:
%sql
-- Mean Lyft surge multiplier (rounded to 3 decimals) for each distinct pressure reading.
-- The cab_type literal uses single quotes: double quotes denote identifiers in standard
-- SQL and are parsed as such when double-quoted identifiers are enabled.
SELECT
  pressure,
  ROUND(AVG(surge_multiplier), 3) AS avg_surge_multiplier
FROM
  cabs_table
WHERE
  cab_type = 'Lyft'
GROUP BY
  pressure

pressure,avg_surge_multiplier
1002.86,1.011
1021.56,1.045
1034.02,1.023
1014.17,1.037
1030.21,1.014
1001.79,1.0
1015.52,1.039
1027.94,1.061
1022.9,1.005
1022.18,1.017


Databricks visualization. Run in Databricks to view.

# Clouds and surge correlation

In [None]:
%sql
-- Mean Lyft surge multiplier (rounded to 3 decimals) for each distinct clouds reading.
-- The cab_type literal uses single quotes: double quotes denote identifiers in standard
-- SQL and are parsed as such when double-quoted identifiers are enabled.
SELECT
  clouds,
  ROUND(AVG(surge_multiplier), 3) AS avg_surge_multiplier
FROM
  cabs_table
WHERE
  cab_type = 'Lyft'
GROUP BY
  clouds

clouds,avg_surge_multiplier
0.66,1.051
0.84,1.028
0.07,1.022
0.87,1.041
0.0,1.024
0.93,1.035
0.89,1.029
0.18,1.037
0.2,1.029
0.05,1.026


Databricks visualization. Run in Databricks to view.

# Map Location and count

In [None]:
%sql
-- Ride count per pickup coordinate pair, restricted to rows with a positive distance.
-- NOTE(review): source_latitude / source_longitude are not among the columns listed by
-- the printSchema() output earlier in this notebook — confirm the loaded data still
-- carries them, otherwise this cell fails on a fresh run.
SELECT source_latitude,
       source_longitude,
       count(*) AS No_of_rides
FROM cabs_table
WHERE distance > 0
GROUP BY source_latitude, source_longitude


source_latitude,source_longitude,No_of_rides
42.3629502,-71.0578447,52719
42.3504215,-71.10322831831216,52737
42.35684365,-71.06862657516852,52367
42.3451868,-71.1045987,52740
39.7446498,-104.996402,52637
42.3525085,-71.0549447,52618
40.7076681,-74.009271,53737
42.33895455,-71.08805803336392,52618
42.35054885,-71.08031131584724,52786
42.3650974,-71.0544954,52709


Databricks visualization. Run in Databricks to view.

# Distribution of price for both cabs

In [None]:
%sql
-- Raw per-row (cab_type, price) values; no aggregation is applied.
SELECT
  cab_type,
  price
FROM
  cabs_table

cab_type,price
Lyft,5.0
Lyft,11.0
Lyft,26.0
Lyft,9.0
Lyft,16.5
Lyft,10.5
Lyft,16.5
Lyft,3.0
Lyft,27.5
Lyft,13.5


Databricks visualization. Run in Databricks to view.

# Dollar per mile

# Top 10 Most Frequent Routes

In [None]:
%sql
-- The ten most frequent (source, destination) routes.
-- Routes are ranked by ride count with DENSE_RANK, so routes tied on count share a rank
-- and the result can contain more than ten rows.
-- The final ORDER BY sits on the outer query: a sort placed inside the CTE is not
-- guaranteed to be preserved once the CTE is filtered and selected from.
WITH CTE AS (
  SELECT
    source,
    destination,
    count(*) AS count,
    DENSE_RANK() OVER (ORDER BY count(*) DESC) AS rank
  FROM
    cabs_table
  GROUP BY
    source, destination
)
SELECT *
FROM CTE
WHERE rank <= 10
ORDER BY rank, source, destination

source,destination,count,rank
South Station,Financial District,9463,1
Financial District,South Station,9432,2
Back Bay,North End,9325,3
North End,Back Bay,9309,4
Fenway,West End,9296,5
West End,Fenway,9278,6
Haymarket Square,Financial District,9269,7
Financial District,Haymarket Square,9268,8
North End,Beacon Hill,9201,9
Beacon Hill,North End,9182,10


# Routes from higher price to lower

In [None]:
%sql
-- Routes listed from the highest to the lowest mean price.
-- `price` is AVG(dollars_per_mile * distance) per route and `distance` is the route's
-- mean distance. The sort is on `price` so the listing is actually price-ordered;
-- sorting on distance leaves the price column out of order.
-- GROUP BY already returns one row per (source, destination), so DISTINCT is not needed.
SELECT
  source,
  destination,
  AVG(distance) AS distance,
  AVG(dollars_per_mile * distance) AS price
FROM
  cabs_table
GROUP BY
  source, destination
ORDER BY
  price DESC

source,destination,distance,price
Financial District,Boston University,5.172579078174429,25.505817894261185
Boston University,Financial District,4.544917644404321,24.127369133574007
Financial District,Fenway,4.477117228929223,23.408495994584225
Fenway,Financial District,4.290446750902522,23.42509025270758
Northeastern University,Financial District,4.154206927985436,22.58859389243391
Financial District,Northeastern University,4.0214578354387305,21.916296802094003
Boston University,North Station,3.4312368633791235,20.1751356969627
North Station,Northeastern University,3.302347066167308,19.540687776642837
Theatre District,Boston University,3.254709833718718,20.36365488382337
Northeastern University,North Station,3.222558456299632,19.911237230419975


In [None]:
%sql
-- Ride count and summed price for each (cab_type, name) pair, largest total first.
-- NOTE(review): `avg` is sum(price) * 100 / count(*), i.e. 100 x the mean price per row,
-- not the mean itself — confirm the x100 scaling is intended.
SELECT cab_type,
       name AS category,
       COUNT(*) AS number_of_rides,
       SUM(price) AS total_revenue,
       sum(price) * 100 / count(*) AS avg
FROM cabs_table
GROUP BY name, cab_type
ORDER BY total_revenue DESC;


cab_type,category,number_of_rides,total_revenue,avg
Uber,Black SUV,54608,1653851.5,3028.588302080281
Lyft,Lux Black XL,50786,1641448.05,3232.0876816445475
Lyft,Lux Black,50777,1170796.05,2305.7605805778207
Uber,Black,54635,1121128.5,2052.0334950123547
Lyft,Lux,50808,902692.0,1776.6729648874193
Uber,UberXL,54625,856301.0,1567.5990846681923
Lyft,Lyft XL,50751,776926.2,1530.858899332033
Uber,WAV,54640,533507.0,976.4037335285506
Uber,UberX,54614,533297.0,976.4840517083531
Lyft,Lyft,50740,487595.18,960.9680331099725


Databricks visualization. Run in Databricks to view.

Lyft has earned more revenue from fewer rides

# Most profitable location

In [None]:
%sql
-- Mean dollars_per_mile for each pickup location (`source`), highest first.
SELECT source,
       AVG(dollars_per_mile) AS mean_profitability
FROM cabs_table
GROUP BY source
ORDER BY mean_profitability DESC;


source,mean_profitability
Haymarket Square,14.812249136488472
Financial District,11.631785875374629
South Station,10.911680823361936
Theatre District,10.605359416142289
North End,10.270954384789778
Back Bay,9.757145117606784
North Station,9.608297353431125
West End,9.478153542932466
Beacon Hill,8.01726023027299
Boston University,7.236261855166663


Databricks visualization. Run in Databricks to view.

# Most profitable Routes

In [None]:
%sql
-- Mean dollars_per_mile for each (source, destination) route, highest first.
SELECT source,
       destination,
       AVG(dollars_per_mile) AS mean_profitability
FROM cabs_table
GROUP BY source, destination
ORDER BY mean_profitability DESC;

source,destination,mean_profitability
Financial District,South Station,29.594486316795283
Haymarket Square,North Station,24.18363041257747
Theatre District,South Station,23.881690395953303
Haymarket Square,West End,20.970310727433763
South Station,Financial District,20.82289764384637
North Station,Haymarket Square,19.576114628544342
West End,Haymarket Square,18.315544888051747
Back Bay,Boston University,14.094126524015016
South Station,Theatre District,13.37881377815825
Boston University,Back Bay,13.241823720863708


In [None]:
%sql
-- Raw per-row (distance, dollars_per_mile) values; no aggregation is applied.
SELECT distance,
       dollars_per_mile
FROM cabs_table



distance,dollars_per_mile
0.44,11.363636363636363
0.44,25.0
0.44,59.09090909090909
0.44,20.454545454545453
0.44,37.5
1.08,9.72222222222222
1.08,15.277777777777777
1.08,2.7777777777777777
1.08,25.46296296296296
1.08,12.5


Databricks visualization. Run in Databricks to view.