# **BDA - Phase 1**

### **MongoDB**

### **1. Load the data**

In [67]:
import pandas as pd
from pymongo import MongoClient
import time

In [68]:
host_df = pd.read_csv(
    filepath_or_buffer='../data/clean-data/non-relational/host.csv',
    dtype={
        'id': 'int64',
        'url': 'string',
        'name': 'string',
        'about': 'string',
        'is_superhost': 'bool',
        'thumbnail_url': 'string',
        'picture_url': 'string',
        'verifications': 'string',
        'location': 'string',
        'neighbourhood': 'string',
        "response_time": "string",
        "response_rate": "float32",
        "acceptance_rate": "float32",
        "listings_count": "int16",
        "total_listings_count": "int16",
        "calculated_host_listings_count": "int16",
        "calculated_host_listings_count_entire_homes": "int16",
        "calculated_host_listings_count_private_rooms": "int16",
        "calculated_host_listings_count_shared_rooms": "int16"
    },
    parse_dates=["since"]
)

listings_df = pd.read_csv(
    filepath_or_buffer='../data/clean-data/non-relational/listings.csv', 
    dtype={
        "id": "int32",
        "listing_url": "string",
        "name": "string",
        "description": "string",
        "neighborhood_overview": "string",
        "picture_url": "string",
        "host_id": "int32",
        "neighbourhood": "string",
        "latitude": "float32",
        "longitude": "float32",
        "property_type": "string",
        "room_type": "string",
        "accommodates": "int16",
        "bathrooms": "float32",
        "bathrooms_text": "string",
        "bedrooms": "float32",
        "beds": "float32",
        "amenities": "string",
        "base_price": "float32",
        "minimum_nights": "int16",
        "maximum_nights": "int16",
        "has_availability": "bool",
        "instant_bookable": "bool",
        "minimum_minimum_nights": "int16",
        "maximum_minimum_nights": "int16",
        "minimum_maximum_nights": "int16",
        "maximum_maximum_nights": "int16",
        "minimum_nights_avg_ntm": "float32",
        "maximum_nights_avg_ntm": "float32",
        "availability_30": "int16",
        "availability_60": "int16",
        "availability_90": "int16",
        "availability_365": "int16",
        "number_of_reviews": "int16",
        "number_of_reviews_ltm": "int16",
        "number_of_reviews_l30d": "int16",
        "review_scores_rating": "float32",
        "review_scores_accuracy": "float32",
        "review_scores_cleanliness": "float32",
        "review_scores_checkin": "float32",
        "review_scores_communication": "float32",
        "review_scores_location": "float32",
        "review_scores_value": "float32",
        "reviews_per_month": "float32"
    },
    parse_dates=["last_scraped", "first_review", "last_review"]
)

calendar_df = pd.read_csv(
    filepath_or_buffer='../data/clean-data/non-relational/calendar.csv',
    dtype={
        "id": "int32",
        "listing_id": "int32",
        "available": "bool",
        "price": "float32"
    },
    parse_dates=["date"]
)

reviews_df = pd.read_csv(
    filepath_or_buffer='../data/clean-data/non-relational/reviews.csv',
    dtype={
        "id": "int32",
        "listing_id": "int32",
        "reviewer_id": "int32",
        "reviewer_name": "string",
        "comments": "string"
    },
    parse_dates=["date"]
)

### **2. Create the DB**

In [69]:
client =MongoClient('mongodb+srv://dbUser:password@somecluster.rrxnu.mongodb.net/?retryWrites=true&w=majority&appName=NORA')

client.drop_database("Project_DB")
print("Database dropped")
db = client["Project_DB"]

Database dropped


In [70]:
db['Hosts'].drop()
print("Collection dropped")
host=db.create_collection("Hosts",validator={
    '$jsonSchema':{
        'bsonType':'object',
        'required':['id','url','name','is_superhost','verifications','location',
                    'response_time','response_rate','acceptance_rate','listings_count'],
        'properties':{
            'id':{
                'bsonType':'int',
                'description':'must be a valid 16-bit integer (required)'
            },
            'url':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            },
            'name':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            },
            'about':{
                'bsonType':'string',
                'description':'must be a valid string'
            },
            'is_superhost':{
                'bsonType':'bool',
                'description':'must be a valid boolean (required)'
            },
            'thumbnail_url':{
                'bsonType':'string',
                'description':'must be a valid string'
            },
            'picture_url':{
                'bsonType':'string',
                'description':'must be a valid string'
            },
            'verifications':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            },
            'location':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            },
            'neighbourhood':{
                'bsonType':'string',
                'description':'must be a valid string'
            },
            'since':{
                'bsonType':'date',
                'description':'must be a valid date'
            },
            'response_time':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            },
            'response_rate':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double (required)'
            },
            'listings_count':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer (required)'
            },
            'total_listings_count':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer'
            },
            'calculated_host_listings_count':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer'
            },
            'calculated_host_listings_count_entire_homes':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer'
            },
            'calculated_host_listings_count_private_rooms':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer'
            },
            'calculated_host_listings_count_shared_rooms':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer'
            }     
        }
    }    
})

Collection dropped


In [71]:
db['Listings'].drop()
print("Collection dropped")
listings=db.create_collection("Listings",validator={
    '$jsonSchema':{
        'bsonType':'object',
        'required':['id','base_price','instant_bookable','has_availability',
                    'minimum_nights','maximum_nights','name','description',
                    'neighborhood_overview','neighbourhood',
                    'latitude','longitude','bathrooms','property_type',
                    'room_type','accommodates','bedrooms','beds','amenities','availability_30','availability_60',
                    'availability_90','availability_365',
                    'number_of_reviews','review_scores_value'
                    ],
        'properties':{
            'id':{
                'bsonType':'number',
                'description':'must be a valid numerical value (required)'
            },
            'listing_url':{
                'bsonType':'string',
                'description':'must be a valid string'
            },
            'last_scraped':{
                'bsonType':'date',
                'description':'must be a valid date'
            },
            'name':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            },
            'description':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            },
            'neighborhood_overview':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            },
            'picture_url':{
                'bsonType':'string',
                'description':'must be a valid string'
            },
            'neighbourhood':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            },
            'latitude':{
                'bsonType':'double',
                'description':'must be a valid double (required)'
            },
            'longitude':{
                'bsonType':'double',
                'description':'must be a valid double (required)'
            },
            'property_type':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            },
            'room_type':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            },
            'accomodates':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer (required)'
            },
            'bathrooms':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double (required)'
            },
            'bathrooms_text':{
                'bsonType':'string',
                'description':'must be a valid string'
            },
            'bedrooms':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double (required)'
            },
            'beds':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double (required)'
            },
            'amenities':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            },
            'base_price':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double (required)'
            },
            'minimum_nights':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer (required)'
            },
            'maximum_nights':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer (required)'
            },
            'has_availability':{
                'bsonType':'bool',
                'description':'must be a valid boolean (required)'
            },
            'instant_bookable':{
                'bsonType':'bool',
                'description':'must be a valid boolean (required)'
            },
            'minimum_minimum_nights':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a 16-bit valid integer'
            },
            'maximum_minimum_nights':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer'
            },
            'minimum_maximum_nights':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer'
            },
            'maximum_maximum_nights':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer'
            },
            'minimum_nights_avg_ntm':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double'
            },
            'maximum_nights_avg_ntm':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double'
            },
            'availability_30':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer (required)'
            },
            'availability_60':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer (required)'
            },
            'availability_90':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer (required)'
            },
            'availability_365':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer (required)'
            },
            'number_of_reviews':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer (required)'
            },
            'number_of_reviews_ltm':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer'
            },
            'number_of_reviews_l30d':{
                'bsonType':'int',
                'minimum':0,
                'description':'must be a valid 16-bit integer'
            },
            'first_review':{
                'bsonType':'date',
                'description':'must be a valid date'
            },
            'last_review':{
                'bsonType':'date',
                'description':'must be a valid date'
            },
            'review_scores_rating':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double'
            },
            'review_scores_accuracy':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double'
            },
            'review_scores_cleanliness':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double'
            },
            'review_scores_checkin':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double'
            },
            'review_scores_communication':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double'
            },
            'review_scores_location':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double'
            },
            'review_scores_value':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double'
            },
            'reviews_per_month':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double'
            }

        }
    }    
})

Collection dropped


In [72]:
db['Calendar'].drop()
print("Collection dropped")

calendar=db.create_collection("Calendar",validator={
    '$jsonSchema':{
        'bsonType':'object',
        'required':['date','available','price'],
        'properties':{
            'id':{
                'bsonType':'int',
                'description':'must be a valid 16-bit integer'
            },
            'date:':{
                'bsonType':'string',
                'description':'must be a valid string'
            },
            'available':{
                'bsonType':'bool',
                'description':'must be a valid boolean (required)'
            },
            'price':{
                'bsonType':'double',
                'minimum':0,
                'description':'must be a valid double (required)'
            }
        }
    }
})

Collection dropped


In [73]:
db['Reviews'].drop()
print("Collection dropped")

reviews=db.create_collection("Reviews",validator={
    '$jsonSchema':{
        'bsonType':'object',
        'required':['date','reviewer_name','comments'],
        'properties':{
            'id':{
                'bsonType':'number',
                'description':'must be a valid numerical value'
            },
            'date:':{
                'bsonType':'string',
                'description':'must be a valid string'
            },
            'reviewer_id':{
                'bsonType':'int',
                'description':'must be a valid 16-bit integer'
            },
            'reviewer_name':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            },
            'comments':{
                'bsonType':'string',
                'description':'must be a valid string (required)'
            }

        }
    }
})

Collection dropped


In [74]:
def insert_dataframe_into_collection(collection, dataframe):
    """
    Inserts data from a Pandas DataFrame into a MongoDB collection.
    """
    try:
        data = dataframe.to_dict(orient='records')
        
        result = collection.insert_many(data)
        
        print(f"Inserted {len(result.inserted_ids)} documents into the collection.")
        return result
    except Exception as e:
        print(f"An error occurred while inserting data into the collection: {e}")
        return None

In [75]:
insert_dataframe_into_collection(host, host_df)
insert_dataframe_into_collection(listings, listings_df)
insert_dataframe_into_collection(calendar, calendar_df)
insert_dataframe_into_collection(reviews, reviews_df)

Inserted 426 documents into the collection.
Inserted 426 documents into the collection.
Inserted 155490 documents into the collection.
Inserted 24752 documents into the collection.


InsertManyResult([ObjectId('675e159e25ce11ea6ebb1fee'), ObjectId('675e159e25ce11ea6ebb1fef'), ObjectId('675e159e25ce11ea6ebb1ff0'), ObjectId('675e159e25ce11ea6ebb1ff1'), ObjectId('675e159e25ce11ea6ebb1ff2'), ObjectId('675e159e25ce11ea6ebb1ff3'), ObjectId('675e159e25ce11ea6ebb1ff4'), ObjectId('675e159e25ce11ea6ebb1ff5'), ObjectId('675e159e25ce11ea6ebb1ff6'), ObjectId('675e159e25ce11ea6ebb1ff7'), ObjectId('675e159e25ce11ea6ebb1ff8'), ObjectId('675e159e25ce11ea6ebb1ff9'), ObjectId('675e159e25ce11ea6ebb1ffa'), ObjectId('675e159e25ce11ea6ebb1ffb'), ObjectId('675e159e25ce11ea6ebb1ffc'), ObjectId('675e159e25ce11ea6ebb1ffd'), ObjectId('675e159e25ce11ea6ebb1ffe'), ObjectId('675e159e25ce11ea6ebb1fff'), ObjectId('675e159e25ce11ea6ebb2000'), ObjectId('675e159e25ce11ea6ebb2001'), ObjectId('675e159e25ce11ea6ebb2002'), ObjectId('675e159e25ce11ea6ebb2003'), ObjectId('675e159e25ce11ea6ebb2004'), ObjectId('675e159e25ce11ea6ebb2005'), ObjectId('675e159e25ce11ea6ebb2006'), ObjectId('675e159e25ce11ea6ebb20

In [76]:
db.Listings.aggregate([
    {'$lookup': {
        'from': 'Calendar',
        'localField': 'id',
        'foreignField': 'listing_id',
        'as': 'Calendar'
    }},
    {'$lookup': {
        'from': 'Reviews',
        'localField': 'id',
        'foreignField': 'listing_id',
        'as': 'Reviews'
    }},
    {'$out': "Listings"}
])

db.Hosts.aggregate([
    {'$lookup': {
        'from': 'Listings',
        'localField': 'id',
        'foreignField': 'host_id',
        'as': 'Listings'
    }},
    {'$out': "Hosts"}
])

<pymongo.synchronous.command_cursor.CommandCursor at 0x15c64ad80>

### **3. Queries**

**Simple Queries:**

In [12]:
# available 2 bedrooms airbnbs with price lower then 100

query = {
    "bedrooms": {"$eq": 2},
    "base_price": {"$lt": 100},
    "has_availability": True
}

try:
    result = db.Listings.find(query)
    explain_output = result.explain()

    execution_time = explain_output['executionStats']['executionTimeMillis']
    num_results = explain_output['executionStats']['nReturned']

    print(f"{num_results} results found in {execution_time} ms")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

17 results found in 0 ms


In [13]:
# all hosts in Albany, NY with more than one listing

query = {
    'location': 'Albany, NY',
    'listings_count': {'$gt': 1}
}

try:
    result = db.Hosts.find(query)
    explain_output = result.explain()
    
    execution_time = explain_output['executionStats']['executionTimeMillis']
    num_results = explain_output['executionStats']['nReturned']

    print(f"{num_results} results found in {execution_time} ms")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

262 results found in 39 ms


### **3.1 Complex Queries:**

**Complex Query 1: Hosts em New York**

In [122]:
results_collection = db['RESULTS']

pipeline = [
    {
        '$match': {
            'location': 'New York, NY'
        }
    },

    {
        '$lookup': {
            'from': 'Listings',  
            'localField': 'id',  
            'foreignField': 'host_id',  
            'as': 'listings'
        }
    },

    {
        '$addFields': {
            'listings_available': {
                '$filter': {
                    'input': '$listings',
                    'as': 'listing',
                    'cond': {'$eq': ['$$listing.has_availability', True]}  
                }
            }
        }
    },

    {
        '$addFields': {
            'average_price': {
                '$cond': [
                    {'$gt': [{'$size': '$listings_available'}, 0]},  
                    {
                        '$avg': {
                            '$map': {
                                'input': '$listings_available',
                                'as': 'listing',
                                'in': '$$listing.base_price'  
                            }
                        }
                    },
                    None  
                ]
            }
        }
    },
    {
        '$addFields': {
            'listings_count': {'$size': '$listings'}  
        }
    },
    {
        '$match': {
            'listings_count': {'$gt': 5},  
            'average_price': {'$lt': 150} 
        }
    },
    {
        '$project': {
            'host_id': '$id',
            'host_name': '$name',
            'host_location': '$location',
            'average_price': 1,
            'listings_count': 1
        }
    },
    {
        '$sort': {'average_price': 1}
    }
]

In [123]:
# without indexes

try:
    start_time = time.time()

    results_list = list(db.Hosts.aggregate(pipeline))

    if results_list:
        data_to_insert = [
            {
                "host_id": result_["host_id"],
                "host_name": result_["host_name"],
                "host_location": result_["host_location"],
                "average_price": result_["average_price"],
                "total_listings": result_["listings_count"]
            }
            for result_ in results_list
        ]
        
        results_collection.insert_many(data_to_insert)
        print(f"{len(data_to_insert)} records inserted into RESULTS.")

        end_time = time.time()

        print(f"{len(results_list)} results found and inserted in {end_time - start_time:.4f} seconds")

    else:
        print("No results to insert.")

except Exception as e:
    print(f"An error occurred: {e}")

6 records inserted into RESULTS.
6 results found and inserted in 0.2067 seconds


In [111]:
try:
    print("Existing indexes:")
    print("Hosts:", db.Hosts.index_information())
    print("Listings:", db.Listings.index_information())
    print("Calendar:", db.Calendar.index_information())
    print("Reviews:", db.Reviews.index_information())

    print("\nCreating indexes")
    db.Hosts.create_index([("location", 1)])  # $match operation filtering by location
    print("Index on 'location' created for Hosts.")

    db.Listings.create_index([("has_availability", 1)])  # filtering by availability
    db.Listings.create_index([("host_id", 1)])  # $lookup joining Listings with Hosts
    print("Indexes on 'has_availability' and 'host_id' created for Listings.")

    print("\nUpdated indexes:")
    print("Hosts:", db.Hosts.index_information())
    print("Listings:", db.Listings.index_information())
    print("Calendar:", db.Calendar.index_information())
    print("Reviews:", db.Reviews.index_information())

    start_time = time.time()
    result = db.Hosts.aggregate(pipeline)

    results_list = list(result)

    if results_list:
        data_to_insert = [
            {
                "host_id": result_["host_id"],
                "host_name": result_["host_name"],
                "host_location": result_["host_location"],
                "average_price": result_["average_price"],
                "total_listings": result_["listings_count"]
            }
            for result_ in results_list
        ]
        
        results_collection = db['RESULTS']
        results_collection.insert_many(data_to_insert)
        print(f"{len(data_to_insert)} records inserted into RESULTS.")

        end_time = time.time()
        print(f"\n{len(results_list)} results found and inserted in {end_time - start_time:.4f} seconds")
    else:
        print("No results to insert.")

    print("\nRemoving indexes")
    db.Hosts.drop_index("location_1")
    db.Listings.drop_index("has_availability_1")
    db.Listings.drop_index("host_id_1")
    print("Indexes removed.")

except Exception as e:
    print(f"Error querying: {e}")

Existing indexes:
Hosts: {'_id_': {'v': 2, 'key': [('_id', 1)]}}
Listings: {'_id_': {'v': 2, 'key': [('_id', 1)]}}
Calendar: {'_id_': {'v': 2, 'key': [('_id', 1)]}}
Reviews: {'_id_': {'v': 2, 'key': [('_id', 1)]}}

Creating indexes
Index on 'location' created for Hosts.
Indexes on 'has_availability' and 'host_id' created for Listings.

Updated indexes:
Hosts: {'_id_': {'v': 2, 'key': [('_id', 1)]}, 'location_1': {'v': 2, 'key': [('location', 1)]}}
Listings: {'_id_': {'v': 2, 'key': [('_id', 1)]}, 'has_availability_1': {'v': 2, 'key': [('has_availability', 1)]}, 'host_id_1': {'v': 2, 'key': [('host_id', 1)]}}
Calendar: {'_id_': {'v': 2, 'key': [('_id', 1)]}}
Reviews: {'_id_': {'v': 2, 'key': [('_id', 1)]}}
6 records inserted into RESULTS.

6 results found and inserted in 0.1728 seconds

Removing indexes
Indexes removed.


In [86]:
try:
    count = results_collection.count_documents({})
    print(f"The RESULTS collection contains {count} rows.")

    results = results_collection.find()

    print("Contents of the RESULTS collection:")
    for result in results:
        print(result)

except Exception as e:
    print(f"An error occurred while retrieving data from RESULTS: {e}")

The RESULTS collection contains 6 rows.
Contents of the RESULTS collection:
{'_id': ObjectId('675e173225ce11ea6ebb80aa'), 'host_id': 42708277, 'host_name': 'Rodney', 'host_location': 'New York, NY', 'average_price': 102.16666666666667, 'total_listings': 6}
{'_id': ObjectId('675e173225ce11ea6ebb80ab'), 'host_id': 42708277, 'host_name': 'Rodney', 'host_location': 'New York, NY', 'average_price': 102.16666666666667, 'total_listings': 6}
{'_id': ObjectId('675e173225ce11ea6ebb80ac'), 'host_id': 42708277, 'host_name': 'Rodney', 'host_location': 'New York, NY', 'average_price': 102.16666666666667, 'total_listings': 6}
{'_id': ObjectId('675e173225ce11ea6ebb80ad'), 'host_id': 42708277, 'host_name': 'Rodney', 'host_location': 'New York, NY', 'average_price': 102.16666666666667, 'total_listings': 6}
{'_id': ObjectId('675e173225ce11ea6ebb80ae'), 'host_id': 42708277, 'host_name': 'Rodney', 'host_location': 'New York, NY', 'average_price': 102.16666666666667, 'total_listings': 6}
{'_id': ObjectId('6

In [83]:
try:
    result = results_collection.delete_many({})
    print(f"Deleted {result.deleted_count} documents from the RESULTS collection.")

except Exception as e:
    print(f"An error occurred while clearing the RESULTS collection: {e}")

Deleted 12 documents from the RESULTS collection.


In [None]:
try:
    start_time = time.time()

    high_reviewed_listing_ids = [
        doc['id'] for doc in db['Listings'].find(
            {
                'review_scores_value': {'$gt': 4.5},
                'number_of_reviews': {'$gt': 100}
            },
            {'id': 1, '_id': 0}
        )
    ]

    active_host_pipeline = [
        {
            '$lookup': {
                'from': 'Hosts',
                'localField': 'host_id',
                'foreignField': 'id',
                'as': 'host_details'
            }
        },
        {
            '$unwind': '$host_details'
        },
        {
            '$match': {
                'host_details.location': {'$eq': 'Albany, NY'}
            }
        },
        {
            '$group': {
                '_id': '$host_id',
                'max_listings_count': {'$max': '$host_details.total_listings_count'},
                'available_listings': {
                    '$sum': {'$cond': [{'$eq': ['$has_availability', True]}, 1, 0]}
                }
            }
        },
        {
            '$match': {
                'max_listings_count': {'$gt': 5},  
                'available_listings': {'$gt': 3}  
            }
        },
        {
            '$project': {'_id': 1}
        }
    ]

    active_host_ids = [
        doc['_id'] for doc in db['Listings'].aggregate(active_host_pipeline)
    ]

    update_result = db['Listings'].update_many(
        {
            'has_availability': True,                
            'base_price': {'$lt': 300},              
            'host_id': {'$in': active_host_ids},     
            'id': {'$in': high_reviewed_listing_ids} 
        },
        {
            '$mul': {'base_price': 1.1}              
        }
    )

    end_time = time.time()
    print(f"{update_result.modified_count} results changed in {end_time - start_time:.4f} seconds")

except Exception as e:
    print(f"Error: {e}")

24 results changed in 0.4827 seconds


In [18]:
try:
    print("Existing indexes:")
    print(db.Hosts.index_information())
    print(db.Listings.index_information())

    print("\nCreating indexes...")

    db.Listings.create_index([("review_scores_value", 1), ("number_of_reviews", 1)])
    print("Compound index on 'review_scores_value' and 'number_of_reviews' in Listings created.")

    db.Listings.create_index([("host_id", 1)])
    print("Simple index for host_id in Listings created.")

    db.Hosts.create_index([("id", 1)])
    print("Simple index for id in Hosts created.")
    
    db.Hosts.create_index([("location", 1), ("total_listings_count", 1)])
    print("Compound index on location and 'total_listings_count' in Hosts created.")
    
    db.Listings.create_index([("has_availability", 1),("base_price", 1),("id", 1)])
    print("Compound index on 'has_availability','base_price' and 'id' in Listings created.")

    start_time = time.time()

    high_reviewed_listing_ids = [
        doc['id'] for doc in db.Listings.find(
            {
                'review_scores_value': {'$gt': 4.5},
                'number_of_reviews': {'$gt': 100}
            },
            {'id': 1, '_id': 0}
        )
    ]

    active_host_pipeline = [
        {
            '$lookup': {
                'from': 'Hosts',
                'localField': 'host_id',
                'foreignField': 'id',
                'as': 'host_details'
            }
        },
        {
            '$unwind': '$host_details'
        },
        {
            '$match': {
                'host_details.location': {'$eq': 'Albany, NY'}
            }
        },
        {
            '$group': {
                '_id': '$host_id',
                'max_listings_count': {'$max': '$host_details.total_listings_count'},
                'available_listings': {
                    '$sum': {'$cond': [{'$eq': ['$has_availability', True]}, 1, 0]}
                }
            }
        },
        {
            '$match': {
                'max_listings_count': {'$gt': 5},
                'available_listings': {'$gt': 3} 
            }
        },
        {
            '$project': {'_id': 1}
        }
    ]

    active_host_ids = [
        doc['_id'] for doc in db.Listings.aggregate(active_host_pipeline)
    ]

    
    update_result = db.Listings.update_many(
        {
            'has_availability': True,                
            'base_price': {'$lt': 300},              
            'host_id': {'$in': active_host_ids},     
            'id': {'$in': high_reviewed_listing_ids} 
        },
        {
            '$mul': {'base_price': 1.1}             
        }
    )

   
    end_time = time.time()
    print(f"{update_result.modified_count} results changed in {end_time - start_time:.4f} seconds")


except Exception as e:
    print(f"Error: {e}")
finally:
    try:
        print("\nRemoving indexes...")
        db.Listings.drop_index("review_scores_value_1_number_of_reviews_1")
        db.Listings.drop_index("host_id_1")
        db.Hosts.drop_index("id_1")
        db.Hosts.drop_index("location_1_total_listings_count_1")
        db.Listings.drop_index("has_availability_1_base_price_1_id_1")
        print("Indexes removed.")
    except Exception as e:
        print(f"Error while removing indexes: {e}")

Existing indexes:
{'_id_': {'v': 2, 'key': [('_id', 1)]}}
{'_id_': {'v': 2, 'key': [('_id', 1)]}}

Creating indexes...
Compound index on 'review_scores_value' and 'number_of_reviews' in Listings created.
Simple index for host_id in Lisitings created.
Simple index for id in Hosts created.
Compound index on location and 'total_listings_count' in Hosts created.
Compound index on 'has_availability','base_price' in Listings created.
Simple index for id in Listings created.
24 results changed in 0.3843 seconds

Removing indexes...
Indexes removed.
