In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import text

In [3]:
# Take the connection URL from DATABASE_URL when it is set so that real
# credentials do not have to live in the notebook; the fallback is the local
# development database this notebook was originally written against.
# NOTE(review): the fallback still embeds a password -- drop it once every
# environment that runs this notebook exports DATABASE_URL.
conn_url = os.environ.get(
    'DATABASE_URL',
    'postgresql://postgres:123@localhost/5310_Project',
)
engine = create_engine(conn_url)
# Shared connection used by the query cells in this notebook.
connection = engine.connect()

In [97]:
# Where are the most popular pick up points? 
# Top 10 values of trip.source ranked by how many trip rows carry them.
queryCmd = """SELECT source, COUNT(*) as num_pickup
FROM trip
GROUP BY source
ORDER BY num_pickup DESC
LIMIT 10;"""

# SQLAlchemy 2.x no longer accepts a plain string in Connection.execute();
# text() wraps the SQL as an executable statement and also works on 1.x.
result = connection.execute(text(queryCmd))
results = result.fetchall()
for row in results:
    # row[0] is the pickup location, row[1] its trip count.
    print("The  popular pick up points are:", row[0], "with", row[1], "occurrences.")


The  popular pick up points are: Financial District with 58857 occurrences.
The  popular pick up points are: Theatre District with 57813 occurrences.
The  popular pick up points are: Back Bay with 57792 occurrences.
The  popular pick up points are: Boston University with 57764 occurrences.
The  popular pick up points are: North End with 57763 occurrences.
The  popular pick up points are: Fenway with 57757 occurrences.
The  popular pick up points are: Northeastern University with 57756 occurrences.
The  popular pick up points are: South Station with 57750 occurrences.
The  popular pick up points are: Haymarket Square with 57736 occurrences.
The  popular pick up points are: West End with 57562 occurrences.


In [98]:
# Where are the most popular destinations? 
# Top 10 values of trip.destination ranked by how many trip rows carry them.
queryCmd = """SELECT destination, COUNT(*) as num_des
FROM trip
GROUP BY destination
ORDER BY num_des DESC
LIMIT 10;"""

# SQLAlchemy 2.x no longer accepts a plain string in Connection.execute();
# text() wraps the SQL as an executable statement and also works on 1.x.
result = connection.execute(text(queryCmd))
results = result.fetchall()
for row in results:
    # row[0] is the destination, row[1] its trip count.
    print("The popular destinations are:", row[0], "with", row[1], "occurrences.")

The popular destinations are: Financial District with 58851 occurrences.
The popular destinations are: Theatre District with 57798 occurrences.
The popular destinations are: Back Bay with 57780 occurrences.
The popular destinations are: Haymarket Square with 57764 occurrences.
The popular destinations are: Boston University with 57764 occurrences.
The popular destinations are: Fenway with 57757 occurrences.
The popular destinations are: North End with 57756 occurrences.
The popular destinations are: Northeastern University with 57755 occurrences.
The popular destinations are: South Station with 57749 occurrences.
The popular destinations are: West End with 57575 occurrences.


In [None]:
# How does weather affect number of rides? 
# Count rows per short_summary and express each count as a percentage of all
# rows in the table.
# The table name is mixed-case, so it has to be double-quoted: PostgreSQL
# folds unquoted identifiers to lower case and would look for
# "climatesummary" instead.
queryCmd = """SELECT short_summary, COUNT(*) as count, (COUNT(*) * 100.0 / (SELECT COUNT(*) FROM "climateSummary")) as percentage
FROM "climateSummary"
GROUP BY short_summary
ORDER BY count DESC;
"""

df = pd.read_sql(queryCmd, connection)
print(df)


In [40]:
# Rows per weather summary plus each summary's share of the whole table,
# most frequent first. The clauses are joined with single spaces into one
# statement.
weather_sql = " ".join([
    'SELECT short_summary, COUNT(*) as count, (COUNT(*) * 100.0 / (SELECT COUNT(*) FROM "climateSummary")) as percentage',
    'FROM "climateSummary"',
    'GROUP BY short_summary',
    'ORDER BY count DESC',
])
df = pd.read_sql_query(sql=weather_sql, con=connection)

print(df)

        short_summary   count  percentage
0           Overcast   218895   31.583344
1      Mostly Cloudy   146210   21.095963
2      Partly Cloudy   127224   18.356561
3              Clear    87126   12.571006
4         Light Rain    54912    7.922998
5               Rain    23712    3.421294
6   Possible Drizzle    18636    2.688902
7              Foggy     9060    1.307225
8            Drizzle     7296    1.052706


In [81]:
# Are there more orders on rainy days or sunny days or humid days or windy days etc? 
# Buckets every "climateSummary" row into a coarse weather category based on
# its short_summary text, then reports the row count per category and that
# count's share of all rows.
#   * The CASE branches are checked top to bottom, so a summary matching
#     several patterns lands in the first one listed.
#   * Summaries matching no pattern yield NULL from the CASE and are labelled
#     'other' by COALESCE.
#   * SUM(COUNT(*)) OVER () totals the per-category counts across the whole
#     result, which is the denominator for the percentage.
queryCmd = """
SELECT
  COALESCE(
    CASE 
      WHEN short_summary LIKE '%Rain%' THEN 'Rain'
      WHEN short_summary LIKE '%Drizzle%' THEN 'Drizzle'
      WHEN short_summary LIKE '%Cloudy%' THEN 'Cloudy'
	  WHEN short_summary LIKE '%Overcast%' THEN 'Overcast'
	  WHEN short_summary LIKE '%Clear%' THEN 'Clear'
	  WHEN short_summary LIKE '%Foggy%' THEN 'Foggy'
    END,
    'other'
  ) AS category,
  COUNT(*) AS count,
  COUNT(*) * 100.0 / SUM(COUNT(*)) OVER () AS percentage
FROM
  "climateSummary"
GROUP BY
  category
ORDER BY
  count DESC;
"""

In [93]:
import psycopg2

# Direct psycopg2 connection. Settings are read from the standard libpq
# environment variables when they are set, so credentials need not be stored
# in the notebook; the defaults are the original local development values.
# NOTE(review): the password default is still a literal -- remove it once
# PGPASSWORD is exported wherever this notebook runs.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    database=os.environ.get("PGDATABASE", "5310_Project"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "123"),
)


In [94]:
# Execute the category query on a psycopg2 cursor and pull every row into
# memory. Using the cursor as a context manager closes it as soon as the rows
# have been fetched instead of leaving it open for the life of the kernel.
with conn.cursor() as cursor:
    cursor.execute(queryCmd)
    result = cursor.fetchall()

In [95]:
# Label the fetched tuples with the query's three output columns, in the
# order they appear in the SELECT list.
summary_columns = ['category', 'count', 'percentage']
df = pd.DataFrame(data=result, columns=summary_columns)

In [96]:
# Plain-text dump of the category / count / percentage frame.
print(df)

   category   count           percentage
0    Cloudy  273434  39.4525236231208635
1  Overcast  218895  31.5833442749732711
2     Clear   87126  12.5710064336842834
3      Rain   78624  11.3442922875145548
4   Drizzle   25932   3.7416080026433078
5     Foggy    9060   1.3072253780637193
