# Data analysis on upfront pricing precision

In [None]:
# One-off setup: install the packages this notebook relies on, then attach
# the ones used directly in the cells below.
# "rpart.plot" was previously misspelled as "raprt.plot", which is not an
# existing package name, so that install step could never succeed.
install.packages("dplyr")
install.packages("readr")
install.packages("tidyverse")
install.packages("tidyr")
install.packages("dbplyr")
install.packages("pacman")
install.packages("ggplot2")
install.packages("reshape2")
install.packages("rpart")
install.packages("rpart.plot")
install.packages("forecast")
install.packages("caret", dependencies = TRUE)

library("tidyverse")
library("readr")
library("dplyr")
library("tidyr")
library("stringi")
library("ggplot2")
library("reshape2")
pacman::p_load("datasets", "rio", "pacman")

Installing package into ‘/srv/rlibs’
(as ‘lib’ is unspecified)



In [None]:
# Read the source CSV into `rawdatabolt`; the path is machine-specific.
rawdatabolt <- read_csv(file = "C:/Users/Guest/Documents/Homework/test.csv")

In [None]:
# Cleaning the data: drop rows with a missing price.
# Both prices are required on every row because the comparison between the
# upfront and the metered price decides which pricing is used.
# (The raw data is loaded into `rawdatabolt`; the previous code read from an
# undefined object `Test`.)
boltdata <- rawdatabolt %>%
  filter(!is.na(upfront_price), !is.na(metered_price))

# Diff is the upfront price expressed as a percentage of the metered price,
# so 100 means both prices are equal. It is kept in this form because the
# later figures plot this column directly.
boltdata <- boltdata %>%
  mutate(Diff = (upfront_price * 100) / metered_price)

# Pricing rule: the upfront price is used when it deviates from the metered
# price by less than 20%; otherwise the metered price is used.
# The deviation is the distance of Diff from 100. Comparing Diff itself with
# 20 (as before) only labelled a ride "Upfront" when the upfront price was
# below 20% of the metered price, which contradicts the rule.
# Upfront is marked as "1", metered as "0". A deviation of exactly 20 counts
# as metered; rows where Diff is NaN (0 / 0) stay NA in both columns.
boltdata <- boltdata %>%
  mutate(
    pricingdummy = if_else(abs(Diff - 100) < 20, "1", "0"),
    pricingused = if_else(pricingdummy == "1", "Upfront", "Metered")
  )

In [None]:
# Graph V1 - metered vs upfront fee usage in the final pricing
V1 <- boltdata %>%
  ggplot(aes(pricingused))
V1 +
  geom_bar(fill = "#31D287") +
  theme() +
  labs(
    title = "Figure 1 - Pricing method used as the final price",
    x = NULL,
    y = "Count of rides"
  )

# Share of rides that ended up on the metered price.
# The denominator comes from the data instead of the hard-coded 3409, so the
# share stays correct when the input file or the filtering changes.
# na.rm = TRUE keeps unlabeled rides (NA) from turning the whole sum into NA;
# they still count in the denominator.
sum(boltdata$pricingused == "Metered", na.rm = TRUE) / nrow(boltdata)

In [None]:
# Graph 2 and 3 - the Diff column (upfront vs metered price) against the
# metered price.
# Columns are referenced by bare name inside aes(): ggplot2 looks them up in
# the data passed to ggplot(), whereas `boltdata$...` bypasses that lookup
# and triggers a "use of `boltdata$Diff` is discouraged" warning.
ggplot(boltdata) +
  geom_point(aes(x = Diff, y = metered_price), color = "#31D287") +
  theme() +
  labs(
    title = "Figure 2 - % of price difference between Upfront and Metered price",
    x = "% of difference",
    y = "Metered Price"
  )


# Same scatter plot, zoomed in with coord_cartesian() so the points outside
# the window are only hidden from view, not removed from the data.
ggplot(boltdata) +
  geom_point(aes(x = Diff, y = metered_price), color = "#31D287") +
  theme() +
  labs(
    title = "Figure 3 - % of price difference between Upfront and Metered price",
    subtitle = "Excluding outliers over 50,000 in Metered price and 200% of price difference",
    x = "% of difference",
    y = "Metered Price"
  ) +
  coord_cartesian(xlim = c(0, 200), ylim = c(0, 50000))

In [None]:
# Graph 4 and 5 - upfront and metered prices by hour of day.
# Show the type of calc_created before deriving the time columns from it.
class(boltdata$calc_created)

# Derive the calendar date, the "HH:MM" time and the hour from calc_created.
boltdata <- boltdata %>%
  mutate(
    date_created = as.Date(calc_created),
    time_created = format(as.POSIXct(calc_created), format = "%H:%M"),
    hour_created = format(as.POSIXct(calc_created), format = "%H")
  )

# One row per ride with both prices, then reshaped to long form so that the
# price type becomes the `variable` column used for the fill colour.
boltdataG4 <- with(
  boltdata,
  data.frame(Hour = hour_created, Metered = metered_price, Upfront = upfront_price)
)
boltdataG4_long <- melt(boltdataG4)

V4 <- ggplot(
  data = boltdataG4_long,
  mapping = aes(x = Hour, y = value, z = variable, fill = variable)
)

# Figure 4: full value range.
V4 +
  geom_boxplot() +
  scale_fill_manual(values = c("#31D287", "#b175ff")) +
  theme() +
  labs(title = "Figure 4 - Upfront vs Metered prices per hour",
       x = "Hour",
       y = "Price value")

# Figure 5: same plot, view limited to price values up to 10,000.
V4 +
  geom_boxplot() +
  scale_fill_manual(values = c("#31D287", "#b175ff")) +
  theme() +
  labs(title = "Figure 5 - Upfront vs Metered prices per hour",
       subtitle = "Excluding outliers over 10,000 in price value",
       x = "Hour",
       y = "Price value") +
  coord_cartesian(ylim = c(0, 10000))

In [None]:
# Graph 6, 7 and 8 - Price variation on different weekdays
# weekdays() returns day names in the session locale, so translating the
# Estonian names only worked on an Estonian-locale machine and produced NA
# everywhere else. POSIXlt's `wday` field is a locale-independent number
# (0 = Sunday ... 6 = Saturday), which is mapped to English names here.
weekday_by_wday <- c("Sunday", "Monday", "Tuesday", "Wednesday",
                     "Thursday", "Friday", "Saturday")
boltdata$weekday <- weekday_by_wday[as.POSIXlt(boltdata$date_created)$wday + 1L]

# Display order for the x axis: Monday first, Sunday last.
weekday_levels <- c("Monday", "Tuesday", "Wednesday", "Thursday",
                    "Friday", "Saturday", "Sunday")

# One row per ride with both prices, reshaped to long form so that the price
# type becomes the `variable` column used for the fill colour.
boltdataG5 <- data.frame(
  weekday = boltdata$weekday,
  Metered = boltdata$metered_price,
  Upfront = boltdata$upfront_price
)
boltdataG5_long <- melt(boltdataG5, id.vars = "weekday")

# The stray positional `y` that used to sit inside aes() is removed, and
# `weekday` is referenced by bare name so ggplot2 looks it up in the data.
V5 <- ggplot(
  data = boltdataG5_long,
  mapping = aes(
    x = ordered(weekday, levels = weekday_levels),
    y = value,
    z = variable,
    fill = variable
  )
)

# Figure 6: full value range.
V5 +
  geom_boxplot() +
  scale_fill_manual(values = c("#31D287", "#b175ff")) +
  theme() +
  labs(
    title = "Figure 6 - Upfront vs Metered prices per weekday",
    x = "Weekday",
    y = "Price value"
  )

# Figure 7: view limited to price values up to 10,000.
V5 +
  geom_boxplot() +
  scale_fill_manual(values = c("#31D287", "#b175ff")) +
  theme() +
  labs(
    title = "Figure 7 - Upfront vs Metered prices per weekday",
    subtitle = "Excluding outliers over 10,000 in price value",
    x = "Weekday",
    y = "Price value"
  ) +
  coord_cartesian(ylim = c(0, 10000))

# Figure 8: view limited to price values up to 50.
V5 +
  geom_boxplot() +
  scale_fill_manual(values = c("#31D287", "#b175ff")) +
  theme() +
  labs(
    title = "Figure 8 - Upfront vs Metered prices per weekday",
    subtitle = "Excluding outliers over 50 in price value",
    x = "Weekday",
    y = "Price value"
  ) +
  coord_cartesian(ylim = c(0, 50))

In [None]:
# Regression analysis

# Build the modelling table from boltdata in one pass. The steps run in the
# order written, so operation_system is derived from the full device name
# before device_name is cut down to its first word.
boltR1data <- boltdata %>%
  mutate(
    # A missing fraud score is treated as 0.
    fraud_score = replace(fraud_score, is.na(fraud_score), 0),
    # "1" when a non-zero fraud score exists, "0" otherwise.
    fraud_dummy = if_else(fraud_score != 0, "1", "0"),
    # Device names starting with "iPhone" are IOS, everything else Android.
    operation_system = if_else(
      str_sub(device_name, start = 1, end = 6) == "iPhone",
      "IOS",
      "Android"
    ),
    # A missing price-change reason becomes the explicit level "none".
    change_reason_pricing = replace(
      change_reason_pricing, is.na(change_reason_pricing), "none"
    ),
    # Keep only the 4th character of each app version string.
    driver_app_version = str_sub(driver_app_version, start = 4, end = 4),
    rider_app_version = str_sub(rider_app_version, start = 4, end = 4),
    # Keep only the text before the first space (the device brand).
    device_name = sub(" .*$", "", device_name)
  )

# Copy of the table without the columns listed in `drop`.
drop <- c("device_token", "b_state", "order_try_state", "prediction_price_type")
df <- boltR1data[, setdiff(names(boltR1data), drop)]

# Linear model of the pricing flag on the ride, app, device and time
# attributes.
boltR1 <- lm(
  formula = pricingdummy ~
    gps_confidence + entered_by + dest_change_number +
    change_reason_pricing + rider_app_version + order_state +
    driver_app_version + device_name + operation_system +
    eu_indicator + overpaid_ride_ticket + fraud_dummy +
    hour_created + weekday,
  data = boltR1data
)

# Print the summary with fixed rather than scientific notation.
options(scipen = 999)
summary(boltR1)

In [None]:
# Graph 9 - Diff by fraud flag (fraud_dummy)

# Two-column plotting table: the fraud flag and the Diff value of each ride.
boltdataG6 <- with(boltR1data, data.frame(fraud = fraud_dummy, diff = Diff))
V6 <- boltdataG6 %>%
  ggplot(aes(x = fraud, y = diff))

# Boxplot per fraud flag, with the view limited to Diff values from 0 to 400.
V6 +
  geom_boxplot(fill = "#31D287") +
  theme() +
  labs(title = "Figure 9 - Fraud score vs price differences",
       x = "Fraud Score existence",
       y = "Price diffrenrence in %") +
  coord_cartesian(ylim = c(0, 400))

In [None]:
# Graph 10 - phone operating system vs % of price difference

# The figure compares operating systems, so it reads the `operation_system`
# column derived from device_name during the regression preparation.
# It previously read `phone_provider`, which no visible code creates.
# The plotting column keeps the name `provider` so that existing references
# to boltdataG7$provider continue to work.
boltdataG7 <- data.frame(
  provider = boltR1data$operation_system,
  diff = boltR1data$Diff
)
V7 <- ggplot(boltdataG7, aes(x = provider, y = diff))

# Boxplot per operating system, with the view limited to Diff values 0-200.
V7 +
  geom_boxplot(fill = "#31D287") +
  theme() +
  labs(
    title = "Figure 10 - Phone Operation System vs price differences",
    x = NULL, y = "Price diffrenrence in %") +
  coord_cartesian(ylim = c(0, 200))

In [None]:
# Decision tree model: random forest classifier for the pricing flag

# randomForest is installed and attached here rather than in the setup cell.
install.packages("randomForest")
library(randomForest)

# RFdata collects the candidate predictors in a single data frame.
# data.frame() uses its default name checking here, so the two quoted names
# are turned into syntactic names and the unnamed columns get names derived
# from their expressions.
# NOTE(review): RFdata is not referenced again in the visible code; the model
# below is fitted on boltR1data directly. Confirm whether it is still needed.
RFdata <- data.frame("GPS connection" = boltR1data$gps_confidence, 
                     "Address entered by" = boltR1data$entered_by,
                     boltR1data$dest_change_number,
                     boltR1data$change_reason_pricing,
                     boltR1data$rider_app_version,
                     boltR1data$order_state,
                     boltR1data$driver_app_version,
                     boltR1data$device_name,
                     boltR1data$operation_system,
                     boltR1data$eu_indicator,
                     boltR1data$overpaid_ride_ticket,
                     boltR1data$fraud_dummy,
                     boltR1data$hour_created,
                     boltR1data$weekday)

# Classification forest: the response is pricingdummy converted to a factor,
# with the same predictors as the linear model. Settings: 1000 trees, one
# candidate variable per split (mtry = 1), minimum terminal node size of 5,
# and importance = TRUE so that variable importance is computed.
# NOTE(review): several predictors (e.g. hour_created, weekday, fraud_dummy)
# are character columns at this point; confirm randomForest treats them as
# categorical as intended, or convert them to factors first - TODO confirm.
# NOTE(review): no random seed is set in the visible code, so results may
# differ between runs - confirm whether reproducibility is required.
random_forest <- randomForest(as.factor(pricingdummy) ~ 
                                gps_confidence +
                                entered_by +
                                dest_change_number +
                                change_reason_pricing +
                                rider_app_version +
                                order_state +
                                driver_app_version +
                                device_name +
                                operation_system +
                                eu_indicator +
                                overpaid_ride_ticket +
                                fraud_dummy +
                                hour_created +
                                weekday,
                              data = boltR1data, ntree = 1000,
                              mtry = 1, nodesize = 5, importance = TRUE)

# Plot variable importance; type = 1 selects the mean-decrease-in-accuracy
# measure.
varImpPlot(random_forest, type = 1)

In [None]:
# Comparing predicted vs real distance and duration against the price ratio

# Per-ride comparison table. Each *_diff column is the predicted (or upfront)
# value expressed as a percentage of the real (or metered) value.
boltdata2 <- with(
  boltdata,
  data.frame(
    ID = order_id_new,
    Upfront_price = upfront_price,
    Distance_predicted = predicted_distance,
    Duration_predicted = predicted_duration,
    Metered_price = metered_price,
    Distance_real = distance,
    Duration_real = duration,
    Price_diff = (upfront_price * 100) / metered_price,
    Distance_diff = (predicted_distance * 100) / distance,
    Duration_diff = (predicted_duration * 100) / duration
  )
)

# Figure 11: real and predicted distance along Price_diff, with the view
# limited to Price_diff 0-1000 and distance 0-25000.
boltdata2 %>%
  ggplot() +
  geom_line(
    aes(x = Price_diff, y = Distance_real, group = 1, colour = "Real distance"),
    size = 0.5
  ) +
  geom_line(
    aes(x = Price_diff, y = Distance_predicted, colour = "Predicted distance"),
    size = 0.5
  ) +
  labs(
    title = "Figure 11 - Price difference vs Distance",
    y = "Distance",
    x = "% of Price Difference"
  ) +
  coord_cartesian(xlim = c(0, 1000), ylim = c(0, 25000))

# Figure 12: real and predicted duration along Price_diff, with the view
# limited to Price_diff 0-200 and duration 0-10000.
boltdata2 %>%
  ggplot() +
  geom_line(
    aes(x = Price_diff, y = Duration_real, colour = "Duration_real"),
    size = 0.5
  ) +
  geom_line(
    aes(x = Price_diff, y = Duration_predicted, colour = "Duration_predicted"),
    size = 0.5
  ) +
  labs(
    title = "Figure 12 - Price difference vs Duration",
    y = "Duration",
    x = "% of Price Difference"
  ) +
  coord_cartesian(xlim = c(0, 200), ylim = c(0, 10000))