# **BDA - Phase 1**

**MySQL**

In [2]:
import pandas as pd
import mysql.connector
from sqlalchemy import create_engine, text
import time

In [3]:
host_df = pd.read_csv('../data/clean-data/host.csv')
host_statistics_df = pd.read_csv('../data/clean-data/host_statistics.csv')
listings_df = pd.read_csv('../data/clean-data/listings.csv')
calendar_df = pd.read_csv('../data/clean-data/calendar.csv')
reviews_df = pd.read_csv('../data/clean-data/reviews.csv')
availability_statistics_df = pd.read_csv('../data/clean-data/availability_statistics.csv')
reviews_statistics_df = pd.read_csv('../data/clean-data/reviews_statistics.csv')

In [3]:
mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  passwd="password",
  port = '3306',
)

database_name = 'Project_DB'

with mydb.cursor() as cursor:
    cursor.execute(f"CREATE DATABASE IF NOT EXISTS {database_name}")
    cursor.execute(f"USE {database_name}")

# Load the SQL schema file and execute it
try:
    with open('../schemas/db_schema.sql', 'r') as f:
        sql_script = f.read()
    with mydb.cursor() as cursor:
        # Split the script into statements
        statements = sql_script.split(';')
        for statement in statements:
            if statement.strip():  # Skip empty statements
                try:
                    cursor.execute(statement)
                    first_line = statement.strip().split('\n')[0]
                    print(f"Executed: {first_line}...")
                except Exception as e:
                    first_line = statement.strip().split('\n')[0]
                    print(f"Error with statement: {first_line}...\n{e}")
        
    
    mydb.commit()
    print("Schema successfully loaded into the database!")

    db_url = f'mysql+mysqlconnector://root:password@localhost:3306/{database_name}'
    engine = create_engine(db_url)

    # Insert DataFrames into the database
    host_statistics_df.to_sql('host_statistics', con=engine, if_exists='replace', index=False, method='multi')
    calendar_df.to_sql('calendar', con=engine, if_exists='replace', index=False, method='multi')
    reviews_df.to_sql('reviews', con=engine, if_exists='replace', index=False, method='multi')
    availability_statistics_df.to_sql('availability_statistics', con=engine, if_exists='replace', index=False, method='multi')
    reviews_statistics_df.to_sql('reviews_statistics', con=engine, if_exists='replace', index=False, method='multi')
    listings_df.to_sql('listings', con=engine, if_exists='replace', index=False, method='multi')
    host_df.to_sql('host', con=engine, if_exists='replace', index=False, method='multi')
    

    print("All data inserted successfully!")
except Exception as e:
    print(f"An error occurred: {e}")
finally:
    mydb.close()
    print("Database connection closed.")

Executed: -- Create the HOST table...
Executed: -- Create the HOST_STATISTICS table...
Executed: -- Create the LISTINGS table...
Executed: -- Create the CALENDAR table...
Executed: -- Create the REVIEWS table...
Executed: -- Create the AVAILABILITY_STATISTICS table...
Executed: -- Create the REVIEWS_STATISTICS table...
Schema successfully loaded into the database!
All data inserted successfully!
Database connection closed.


**Queries**

In [4]:
host = 'localhost'
username = 'root'
password = 'password'
port = '3306'
database_name = 'Project_DB'

db_url = f'mysql+mysqlconnector://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(db_url)

with engine.connect() as connection:
    start_time = time.time()
    simple_query = """
    SELECT *
    FROM listings
    WHERE has_availability = TRUE
    AND bedrooms = 2
    AND base_price < 100;
    """
    result = connection.execute(text(simple_query))
    results = result.fetchall()
    end_time = time.time()
    print(f"{len(results)} results found in {end_time - start_time:.4f} seconds")

17 results found in 0.0010 seconds


In [5]:
host = 'localhost'
username = 'root'
password = 'password'
port = '3306'
database_name = 'Project_DB'

db_url = f'mysql+mysqlconnector://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(db_url)

with engine.connect() as connection:
    start_time = time.time()
    simple_query = """

WITH unique_host_statistics AS (
    SELECT host_id, MAX(listings_count) AS listings_count
    FROM host_statistics
    WHERE listings_count > 1
    GROUP BY host_id
)
SELECT h.id, h.location
FROM host h
JOIN unique_host_statistics uhs ON h.id = uhs.host_id
WHERE h.location = 'Albany, NY';
    """
    result = connection.execute(text(simple_query))
    results = result.fetchall()
    end_time = time.time()
    print(f"{len(results)} results found in {end_time - start_time:.4f} seconds")

262 results found in 0.0050 seconds


In [None]:
#Increase the base_price for listings with high review ratings, a high number of reviews and a high average price.
with engine.connect() as connection:    
    start_time = time.time()
    reduce_response_time_query = """
    UPDATE LISTINGS l
    JOIN REVIEWS_STATISTICS rs ON l.id = rs.listing_id
    SET 
        l.base_price = l.base_price * 1.1 -- Increase base price by 10% for qualifying listings
    WHERE 
        l.has_availability = true -- Only consider listings with availability
        AND rs.number_of_reviews > 10 -- Only consider listings with more than 10 reviews
        AND rs.review_scores_rating > 4.5 -- Only consider listings with an average score above 4.5
        AND l.base_price < 300 -- Only increase base price for listings with base price below 300
    """
    result = connection.execute(text(reduce_response_time_query))
    connection.commit()
    end_time = time.time()
    affected_rows = result.rowcount
    print(f"{affected_rows} results changed in {end_time - start_time:.4f} seconds")


233 results changed in 0.3559 seconds


In [7]:
host = 'localhost'
username = 'root'
password = 'password'
port = '3306'
database_name = 'Project_DB'

db_url = f'mysql+mysqlconnector://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(db_url)

complex_query = """
WITH average_prices AS (
    SELECT 
        l.host_id,
        AVG(l.base_price) AS avg_price
    FROM listings l
    WHERE l.has_availability = TRUE
    GROUP BY l.host_id
),
host_with_high_listings AS (
    SELECT 
        hs.host_id,
        MAX(hs.listings_count) as listings_count
    FROM host_statistics hs
    WHERE listings_count > 5
    GROUP BY hs.host_id
)
SELECT 
    h.id AS host_id,
    h.name AS host_name,
    h.location AS host_location,
    apl.avg_price AS average_price,
    hw.listings_count AS total_listings
FROM host h
JOIN average_prices apl ON h.id = apl.host_id
JOIN host_with_high_listings hw ON h.id = hw.host_id
WHERE apl.avg_price < 150
  AND h.location = 'New York, NY'
ORDER BY apl.avg_price ASC;
"""

with engine.connect() as connection:
    start_time = time.time()
    result = connection.execute(text(complex_query))
    results = result.fetchall()
    end_time = time.time()

    print(f"{len(results)} results found in {end_time - start_time:.4f} seconds")
    for row in results:
        print(row)


9 results found in 0.0082 seconds
(42708277, 'Rodney', 'New York, NY', 102.16666666666667, 8)
(42708277, 'Rodney', 'New York, NY', 102.16666666666667, 8)
(42708277, 'Rodney', 'New York, NY', 102.16666666666667, 8)
(42708277, 'Rodney', 'New York, NY', 102.16666666666667, 8)
(42708277, 'Rodney', 'New York, NY', 102.16666666666667, 8)
(42708277, 'Rodney', 'New York, NY', 102.16666666666667, 8)
(110453341, 'Alexis', 'New York, NY', 103.0, 7)
(385664127, 'Dillon', 'New York, NY', 104.5, 7)
(385664127, 'Dillon', 'New York, NY', 104.5, 7)
