In [53]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import statsmodels.api as sm

In [41]:
conn_url = 'postgresql://postgres:123@localhost/5310_Project'
engine = create_engine(conn_url)
connection = engine.connect()

What factors are the most important that affects surge multiplier based on current data?

In [26]:
q6 = """
SELECT 
    date AS date,
    foo.windSpeed AS windSpeed,
    foo.precipIntensity AS precipIntensity,
    foo.precipProbability AS precipProbability,
    foo.cloudCover AS cloudCover,
    foo.ozone AS ozone,
    foo.uvIndex AS uvIndex,
    foo.humidity AS humidity
FROM(
    SELECT
        CAST(t.datetime AS date) AS date,
        w.humidity AS humidity,
        w."windSpeed" AS windSpeed,
        w."precipIntensity" AS precipIntensity,
        w."precipProbability" AS precipProbability,
        w.ozone AS ozone,
        w."cloudCover" AS cloudCover,
        w."uvIndex" AS uvIndex
    FROM 
        timestamp t
        JOIN trip tr ON t.timestamp = tr.timestamp
        JOIN order_ o ON tr.id = o.id
        LEFT JOIN "climateSummary" c ON o.id = c.id
        LEFT JOIN weather w ON c.weather_id = w.weather_id
    GROUP BY
        t.datetime,
        c.weather_id, 
        humidity, 
        windSpeed, 
        precipIntensity, 
        precipProbability, 
        cloudCover, 
        ozone, 
        uvIndex
) AS foo;
"""

order_weather = pd.read_sql_query(q6, con=engine)
order_weather


Unnamed: 0,date,windspeed,precipintensity,precipprobability,cloudcover,ozone,uvindex,humidity
0,2018-11-25,0.51,0.0,0.0,1.00,317.9,0,0.92
1,2018-11-25,0.51,0.0,0.0,1.00,317.9,0,0.92
2,2018-11-25,0.51,0.0,0.0,1.00,317.9,0,0.92
3,2018-11-25,0.45,0.0,0.0,1.00,313.0,0,0.90
4,2018-11-25,0.45,0.0,0.0,1.00,313.0,0,0.90
...,...,...,...,...,...,...,...,...
36174,2018-12-18,12.52,0.0,0.0,0.01,372.6,0,0.38
36175,2018-12-18,12.52,0.0,0.0,0.01,372.6,0,0.38
36176,2018-12-18,12.52,0.0,0.0,0.01,372.6,0,0.38
36177,2018-12-18,12.52,0.0,0.0,0.01,372.6,0,0.38


In [49]:
df_count = order_weather
df_count['order'] = 1
df_mean = df_count.groupby(pd.Grouper(key='date')).agg({'windspeed': 'mean', 'precipintensity': 'mean', 'precipprobability': 'mean', 'cloudcover': 'mean', 'ozone': 'mean', 'uvindex': 'mean', 'humidity': 'mean', 'order': 'sum'})
df_mean = df_mean.reset_index().drop('date', axis=1)
df_mean


Unnamed: 0,windspeed,precipintensity,precipprobability,cloudcover,ozone,uvindex,humidity,order
0,0.458571,0.0,0.0,1.0,313.7,0.0,0.902857,21
1,6.5075,0.04037,0.464331,1.0,296.003482,0.183844,0.90257,1436
2,8.752875,0.027461,0.428443,0.824931,309.443435,0.242659,0.827767,1805
3,7.762225,0.0,0.0,0.725509,350.258846,0.260061,0.681025,2634
4,9.144346,0.0,0.0,0.338546,319.299343,0.21366,0.611654,2738
5,3.182639,0.0,0.0,0.515969,281.589167,0.327431,0.707104,2880
6,3.163255,0.0,0.0,0.515803,277.795308,0.412264,0.725534,2707
7,4.08438,0.025783,0.53963,1.0,279.181613,0.256382,0.923996,2703
8,5.311358,0.0,0.0,0.682496,309.293833,0.254405,0.732794,2724
9,7.439021,0.0,0.0,0.864639,340.359794,0.0,0.64,194


In [50]:
x = df_mean[["windspeed", "precipintensity", "precipprobability", "cloudcover", "ozone", "uvindex", "humidity"]]
y = df_mean["order"]
model = sm.OLS(y, x).fit()
print(model.summary())


                                 OLS Regression Results                                
Dep. Variable:                  order   R-squared (uncentered):                   0.935
Model:                            OLS   Adj. R-squared (uncentered):              0.894
Method:                 Least Squares   F-statistic:                              22.74
Date:                Tue, 18 Apr 2023   Prob (F-statistic):                    1.05e-05
Time:                        16:09:05   Log-Likelihood:                         -140.07
No. Observations:                  18   AIC:                                      294.1
Df Residuals:                      11   BIC:                                      300.4
Df Model:                           7                                                  
Covariance Type:            nonrobust                                                  
                        coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------

  "anyway, n=%i" % int(n))


Does ozone level and other weather information increases the number of rides.

In [104]:
q7="""
SELECT
    s.surge_multiplier,
    p.source,
    p.destination,
    t.datetime,
    c.weather_id,
    w.temperature_id,
    a."apparentTemperature_id"
FROM
    surge s
    JOIN price p ON s.surge_id = p.surge_id
    LEFT JOIN timestamp t ON p.timestamp = t.timestamp
    LEFT JOIN "climateSummary" c ON p.id = c.id
    LEFT JOIN weather w ON c.weather_id = w.weather_id
    LEFT JOIN temperature te ON w.temperature_id = te.temperature_id
    LEFT JOIN "apparentTemperature" a ON te."apparentTemperature_id" = a."apparentTemperature_id";
"""
q7_df = pd.read_sql_query(q7, con=engine)
q7_df['hour'] = q7_df['datetime'].dt.hour
q7_df['weekday'] = q7_df['datetime'].dt.day_name()
q7_df = q7_df.drop(columns=["datetime"])

q7_df


Unnamed: 0,surge_multiplier,source,destination,weather_id,temperature_id,apparentTemperature_id,hour,weekday
0,1.0,Beacon Hill,South Station,28,28,28,8,Tuesday
1,1.0,North Station,Northeastern University,32,32,32,22,Monday
2,1.0,North End,West End,60,60,60,2,Saturday
3,1.0,North End,Beacon Hill,63,63,63,5,Tuesday
4,1.0,Boston University,North Station,72,72,72,3,Thursday
...,...,...,...,...,...,...,...,...
693066,1.5,Theatre District,North End,146,146,146,14,Monday
693067,1.5,Theatre District,North End,292,292,292,22,Thursday
693068,1.5,Boston University,Beacon Hill,71,71,71,8,Saturday
693069,1.5,Boston University,Beacon Hill,135,135,135,12,Wednesday


In [105]:
q7_df.to_csv('/Users/zhangjiaming/Desktop/5310_q7.csv', index=False)
