In [11]:
from pymongo import MongoClient
from matplotlib import pyplot as plt

In [12]:
class BlueBikeMongoConnection:
    """Query helper bound to a single MongoDB collection of Blue Bikes rides.

    The queries in this class reference the following document fields:
    ``start_time.year``, ``start_time.month``, ``subscribed`` (compared
    against 1 and 0), and ``name`` / ``municipality`` under both
    ``start_station`` and ``end_station``.
    """

    # Document count hard-coded by the original author so that construction
    # does not have to run count_documents({}), which they noted as slow.
    KNOWN_DOC_COUNT = 15622370

    # Municipalities on each side of the river, used by crosses_river().
    NORTH_OF_RIVER = ("Arlington", "Cambridge", "Chelsea", "Everett", "Malden",
                      "Medford", "Revere", "Salem", "Somerville", "Watertown")
    SOUTH_OF_RIVER = ("Boston", "Brookline", "Newton")

    def __init__(self, db_name, collection_name, doc_count=KNOWN_DOC_COUNT):
        """Connect with a default ``MongoClient()`` and bind to one collection.

        Original author's note: construction takes about 10 seconds.

        Args:
            db_name: Name of the database to open.
            collection_name: Name of the collection inside ``db_name``.
            doc_count: Document count to report. Defaults to the hand-recorded
                ``KNOWN_DOC_COUNT``. Pass ``None`` to ask the server for
                ``estimated_document_count()`` instead, which avoids the full
                ``count_documents({})`` scan.
        """
        client = MongoClient()
        db = client[db_name]
        self.collection = db[collection_name]
        if doc_count is None:
            # Only the default count is specific to one dataset; any other
            # collection should be sized by the server.
            doc_count = self.collection.estimated_document_count()
        self.docs = doc_count
        print(f'Connection established with {db_name}.{collection_name} '
              f'({self.docs} documents)')

    # ------ Shared pipeline fragments
    @staticmethod
    def _project_year_month_subscribed():
        """Build the ``$project`` stage exposing year, month and subscribed."""
        return {
            "$project": {
                "year": "$start_time.year",
                "month": "$start_time.month",
                "subscribed": "$subscribed"
            }
        }

    @staticmethod
    def _rides_where_subscribed_is(flag):
        """Build a ``$sum`` accumulator counting docs whose ``subscribed`` equals ``flag``."""
        return {"$sum": {"$cond": [{"$eq": ["$subscribed", flag]}, 1, 0]}}

    # ------ Simple accessors
    def count_documents(self):  # Fast
        """Return the document count stored at construction time (no query)."""
        return self.docs

    def find_one(self):  # Fast
        """Return whatever ``collection.find_one()`` yields (one document or None)."""
        return self.collection.find_one()

    def get_distinct(self, criterion):  # Varies
        """Return the distinct values of the field named by ``criterion``."""
        return self.collection.distinct(criterion)

    def get_municipalities(self):  # 10 seconds
        """Return the distinct values of ``start_station.municipality``."""
        return self.collection.distinct('start_station.municipality')

    def custom_pipeline(self, pipeline):
        """Run a caller-supplied aggregation pipeline and return its cursor."""
        return self.collection.aggregate(pipeline)

    # ------ Time-based aggregations
    def rides_per_month(self):  # 15 seconds
        """Count total, subscriber and non-subscriber rides per (year, month).

        Returns:
            A list of dicts ordered by year then month, each with the keys
            ``'id'`` (position in the result), ``'month'``, ``'year'``,
            ``'sub rides'``, ``'non sub rides'`` and ``'number of rides'``.
        """
        pipeline = [
            self._project_year_month_subscribed(),
            {
                "$group": {
                    "_id": {
                        "year": "$year",
                        "month": "$month"
                    },
                    "total_rides": {"$sum": 1},
                    "sub_rides": self._rides_where_subscribed_is(1),
                    "non_sub_rides": self._rides_where_subscribed_is(0)
                }
            },
            {
                "$sort": {"_id.year": 1, "_id.month": 1}
            }
        ]

        return [
            {'id': idx,
             'month': entry['_id']['month'],
             'year': entry['_id']['year'],
             'sub rides': entry['sub_rides'],
             'non sub rides': entry['non_sub_rides'],
             'number of rides': entry['total_rides']}
            for idx, entry in enumerate(self.collection.aggregate(pipeline))
        ]

    def seasonals_proportion_per_year(self):
        """Aggregate ride counts per (year, season).

        Months map to seasons as 3-5 ``a_spring``, 6-8 ``b_summer``,
        9-11 ``c_fall``, 12/1/2 ``d_winter``; the letter prefixes make the
        final sort on season name come out in calendar order.

        Returns:
            A list of dicts ordered by year then season, each with the keys
            ``'id'``, ``'season'``, ``'year'``, ``'avg sub rides'``,
            ``'avg non sub rides'`` and ``'avg number of rides'``.
        """
        pipeline = [
            self._project_year_month_subscribed(),
            {
                "$group": {
                    "_id": {
                        "year": "$year",
                        "season": {
                            "$switch": {
                                "branches": [
                                    {"case": {"$in": ["$month", [3, 4, 5]]}, "then": "a_spring"},
                                    {"case": {"$in": ["$month", [6, 7, 8]]}, "then": "b_summer"},
                                    {"case": {"$in": ["$month", [9, 10, 11]]}, "then": "c_fall"},
                                    {"case": {"$in": ["$month", [12, 1, 2]]}, "then": "d_winter"}
                                ],
                                "default": "other"
                            }
                        }
                    },
                    "total_rides": {"$sum": 1},
                    "total_subscriber_rides": self._rides_where_subscribed_is(1),
                    "total_non_subscriber_rides": self._rides_where_subscribed_is(0)
                }
            },
            {
                # NOTE(review): this stage groups by the same (year, season)
                # key as the stage above, so every $avg sees exactly one input
                # and the "averages" equal the totals. Kept as-is because the
                # intended averaging unit is not clear from the code.
                "$group": {
                    "_id": {"year": "$_id.year", "season": "$_id.season"},
                    "average_total_rides": {"$avg": "$total_rides"},
                    "average_sub_rides": {"$avg": "$total_subscriber_rides"},
                    "average_non_sub_rides": {"$avg": "$total_non_subscriber_rides"}
                }
            },
            {
                "$sort": {"_id.year": 1, "_id.season": 1}
            }
        ]

        return [
            {'id': idx,
             'season': entry['_id']['season'],
             'year': entry['_id']['year'],
             'avg sub rides': entry['average_sub_rides'],
             'avg non sub rides': entry['average_non_sub_rides'],
             'avg number of rides': entry['average_total_rides']}
            for idx, entry in enumerate(self.collection.aggregate(pipeline))
        ]

    # ------ MQ revamped queries!
    def popular_routes(self):
        """Count rides per (start station name, end station name), busiest first.

        Returns:
            The aggregation cursor; each result has ``_id.start``, ``_id.end``
            and ``num``.
        """
        pipeline = [
            {
                "$group": {
                    "_id": {
                        "start": "$start_station.name",
                        "end": "$end_station.name"
                    },
                    "num": {"$sum": 1}
                }
            },
            {
                "$sort": {"num": -1}
            }
        ]
        return self.collection.aggregate(pipeline)

    def circular_routes(self):
        """Return a cursor over rides whose start and end station names are equal."""
        pipeline = [
            {
                "$match": {
                    "$expr": {
                        "$eq": ["$start_station.name", "$end_station.name"]
                    }
                }
            }
        ]
        return self.collection.aggregate(pipeline)

    def _municipality_flows(self, side, municipality):
        """Count rides per (start, end) municipality pair, keeping one side fixed.

        Args:
            side: ``"start"`` or ``"end"`` - which half of the pair must equal
                ``municipality``.
            municipality: Municipality name to keep.

        Returns:
            The aggregation cursor, sorted by ride count descending.
        """
        pipeline = [
            {
                "$group": {
                    "_id": {
                        "start": "$start_station.municipality",
                        "end": "$end_station.municipality"
                    },
                    "num": {"$sum": 1}
                }
            },
            {
                "$match": {
                    "$expr": {
                        "$eq": [f"$_id.{side}", municipality]
                    }
                }
            },
            {
                "$sort": {"num": -1}
            }
        ]
        return self.collection.aggregate(pipeline)

    def ending_in_boston(self):
        """Count rides per municipality pair whose END municipality is Boston.

        This class used to define ``ending_in_boston`` twice; the second
        definition filtered on the start municipality and silently replaced
        this one. That query now lives in ``starting_in_boston``.
        """
        return self._municipality_flows("end", "Boston")

    def starting_in_boston(self):
        """Count rides per municipality pair whose START municipality is Boston."""
        return self._municipality_flows("start", "Boston")
    # --------

    def crosses_river(self):
        """Count, per (year, month), rides that start and end on opposite banks.

        A ride qualifies when one of its station municipalities is in
        ``NORTH_OF_RIVER`` and the other is in ``SOUTH_OF_RIVER``, in either
        direction.

        Returns:
            The aggregation cursor ordered by year then month; each result has
            ``_id.year``, ``_id.month`` and ``crosses_river_rides``.
        """
        north = list(self.NORTH_OF_RIVER)
        south = list(self.SOUTH_OF_RIVER)
        pipeline = [
            {
                "$match": {
                    "$or": [
                        {
                            # North -> south
                            "$and": [
                                {"start_station.municipality": {"$in": north}},
                                {"end_station.municipality": {"$in": south}}
                            ]
                        },
                        {
                            # South -> north
                            "$and": [
                                {"end_station.municipality": {"$in": north}},
                                {"start_station.municipality": {"$in": south}}
                            ]
                        }
                    ]
                }
            },
            {
                "$group": {
                    "_id": {
                        "year": "$start_time.year",
                        "month": "$start_time.month"
                    },
                    "crosses_river_rides": {"$sum": 1}
                }
            },
            {
                "$sort": {"_id.year": 1, "_id.month": 1}
            }
        ]

        return self.collection.aggregate(pipeline)

    

In [13]:
# Open the connection that the cells below query through.
db_name, collection_name = 'DS4300', 'bluebikes'
conn = BlueBikeMongoConnection(db_name, collection_name)

Connection established with DS4300.bluebikes (15622370 documents)


In [14]:
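# # Seasonal visualization (currently disabled): plots 'avg number of rides'
# # for each entry returned by conn.seasonals_proportion_per_year().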
# data = conn.seasonals_proportion_per_year()
# x = []
# y = []
# idx = 1
# for season in data:
#     y.append(season['avg number of rides'])
#     x.append(idx)
#     idx += 1

# plt.plot(x, y)
# for e in x:
#     if e % 4 == 0:
#         plt.axvline(x=e, color='r', linestyle='--', label="winter")
#     elif e % 2 == 0:
#         plt.axvline(x=e, color='g', linestyle='--', label="summer")

# plt.legend(["rides", "summer","winter"])
# plt.xlabel()  # NOTE: xlabel() requires a label string; supply one before re-enabling this cell
# plt.show()

In [15]:
# Monthly counts of rides that start on one side of the river and end on the other.
# North side: 'Arlington', 'Cambridge', 'Chelsea', 'Everett', 'Malden', 'Medford',
#             'Revere', 'Salem', 'Somerville', 'Watertown'
# South side: 'Boston', 'Brookline', 'Newton'
monthly_crossings = conn.crosses_river()
for each in monthly_crossings:
    print(each)

{'_id': {'year': 2019, 'month': 1}, 'crosses_river_rides': 12387}
{'_id': {'year': 2019, 'month': 2}, 'crosses_river_rides': 14597}
{'_id': {'year': 2019, 'month': 3}, 'crosses_river_rides': 20347}
{'_id': {'year': 2019, 'month': 4}, 'crosses_river_rides': 33955}
{'_id': {'year': 2019, 'month': 5}, 'crosses_river_rides': 46153}
{'_id': {'year': 2019, 'month': 6}, 'crosses_river_rides': 59801}
{'_id': {'year': 2019, 'month': 7}, 'crosses_river_rides': 67358}
{'_id': {'year': 2019, 'month': 8}, 'crosses_river_rides': 70680}
{'_id': {'year': 2019, 'month': 9}, 'crosses_river_rides': 68776}
{'_id': {'year': 2019, 'month': 10}, 'crosses_river_rides': 58212}
{'_id': {'year': 2019, 'month': 11}, 'crosses_river_rides': 36227}
{'_id': {'year': 2019, 'month': 12}, 'crosses_river_rides': 17606}
{'_id': {'year': 2020, 'month': 1}, 'crosses_river_rides': 25386}
{'_id': {'year': 2020, 'month': 2}, 'crosses_river_rides': 26272}
{'_id': {'year': 2020, 'month': 3}, 'crosses_river_rides': 20596}
{'_id':