#### Explore Airbnb Dataset (Main)

Start refreshable spark session

In [8]:
# Nuclear Spark Reset - Complete Clean Start
# This completely wipes all Spark references and starts from scratch

import subprocess
import time
import os
import sys

print("=" * 60)
print("NUCLEAR SPARK RESET - COMPLETE SYSTEM CLEANUP")
print("=" * 60)

# Step 1: Kill ALL Java processes (nuclear option)
print("\nSTEP 1: TERMINATING ALL JAVA PROCESSES")
print("-" * 40)
try:
    subprocess.run(['pkill', '-f', 'java'], capture_output=True)
    time.sleep(5)  # Longer wait for complete cleanup
    print("STATUS: All Java processes terminated")
except:
    print("STATUS: Java process cleanup completed")

# Step 2: Clear ALL PySpark imports from memory
print("\nSTEP 2: CLEARING PYSPARK FROM MEMORY")
print("-" * 40)
modules_to_remove = []
for module_name in sys.modules:
    if 'pyspark' in module_name or 'py4j' in module_name:
        modules_to_remove.append(module_name)

for module_name in modules_to_remove:
    if module_name in sys.modules:
        del sys.modules[module_name]
        
print(f"STATUS: Removed {len(modules_to_remove)} Spark-related modules from memory")

# Step 3: Set environment variables for clean start
print("\nSTEP 3: CONFIGURING CLEAN ENVIRONMENT")
print("-" * 40)
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
print("STATUS: Environment variables configured")

# Step 4: Now import fresh PySpark and create session
print("\nSTEP 4: CREATING FRESH SPARK SESSION")
print("-" * 40)
from pyspark.sql import SparkSession
from pyspark import SparkContext

# Create session with unique app name
spark = SparkSession.builder \
    .appName(f"BerlinAirbnb_{int(time.time())}") \
    .master("local[1]") \
    .config("spark.driver.memory", "2g") \
    .config("spark.sql.shuffle.partitions", "4") \
    .getOrCreate()

# Verify it works
print("\nRESULTS:")
print("-" * 40)
print("STATUS: Nuclear reset successful")
print(f"Spark UI: {spark.sparkContext.uiWebUrl}")
print(f"App ID: {spark.sparkContext.applicationId}")
print("=" * 60)
print("SPARK SESSION READY FOR USE")
print("=" * 60)

NUCLEAR SPARK RESET - COMPLETE SYSTEM CLEANUP

STEP 1: TERMINATING ALL JAVA PROCESSES
----------------------------------------
STATUS: All Java processes terminated

STEP 2: CLEARING PYSPARK FROM MEMORY
----------------------------------------
STATUS: Removed 102 Spark-related modules from memory

STEP 3: CONFIGURING CLEAN ENVIRONMENT
----------------------------------------
STATUS: Environment variables configured

STEP 4: CREATING FRESH SPARK SESSION
----------------------------------------
STATUS: All Java processes terminated

STEP 2: CLEARING PYSPARK FROM MEMORY
----------------------------------------
STATUS: Removed 102 Spark-related modules from memory

STEP 3: CONFIGURING CLEAN ENVIRONMENT
----------------------------------------
STATUS: Environment variables configured

STEP 4: CREATING FRESH SPARK SESSION
----------------------------------------


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/04 14:30:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/04 14:30:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable



RESULTS:
----------------------------------------
STATUS: Nuclear reset successful
Spark UI: http://192.168.1.137:4040
App ID: local-1754307021925
SPARK SESSION READY FOR USE


### Explore Airbnb Data

In [9]:
# Enhanced Airbnb Data Analysis - Business Question Assessment
# Load all datasets and analyze their potential for answering business questions

listings_df = spark.read.csv('./sample_data/listings.csv', header=True, inferSchema=True)
reviews_df = spark.read.csv('./sample_data/reviews.csv', header=True, inferSchema=True)
calendar_df = spark.read.csv('./sample_data/calendar.csv', header=True, inferSchema=True)
neighborhoods_df = spark.read.csv('./sample_data/neighbourhoods.csv', header=True, inferSchema=True)

dataframes = [listings_df, reviews_df, calendar_df, neighborhoods_df]
df_names = ['listings', 'reviews', 'calendar', 'neighbourhoods']

print("=" * 80)
print("COMPREHENSIVE DATA ASSESSMENT FOR BUSINESS QUESTIONS")
print("=" * 80)

for df, name in zip(dataframes, df_names):
    print(f"\n{name.upper()} DATASET ANALYSIS:")
    print("-" * 50)
    
    # Basic dimensions
    row_count = df.count()
    col_count = len(df.columns)
    print(f"Dimensions: {row_count:,} rows × {col_count} columns")
    
    # Data types analysis
    print("\nDATA TYPES BREAKDOWN:")
    type_counts = {}
    for col_name, col_type in df.dtypes:
        type_counts[col_type] = type_counts.get(col_type, 0) + 1
    
    for data_type, count in type_counts.items():
        print(f"  {data_type}: {count} columns")
    
    # Sample a few rows to see actual data
    print("\nSAMPLE DATA (first 3 rows):")
    df.show(3, truncate=False)

COMPREHENSIVE DATA ASSESSMENT FOR BUSINESS QUESTIONS

LISTINGS DATASET ANALYSIS:
--------------------------------------------------
Dimensions: 226 rows × 79 columns

DATA TYPES BREAKDOWN:
  string: 79 columns

SAMPLE DATA (first 3 rows):
Dimensions: 226 rows × 79 columns

DATA TYPES BREAKDOWN:
  string: 79 columns

SAMPLE DATA (first 3 rows):


25/08/04 14:33:15 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+-----------+------------+---------------+-----------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

#### Top 5 Core Business Questions for Ideal Airbnb Selection

##### 1. Which listings offer the best balance between price, availability, and guest satisfaction?
Identify listings that consistently meet criteria for affordability (`price`), booking flexibility (`availability_365`), and strong guest feedback (`review_scores_rating`, `reviews_per_month`).

##### 2. How do minimum and maximum night restrictions impact suitability for short-term stays?
Analyze `minimum_nights` and `maximum_nights` to filter out listings that are either too restrictive or overly optimized for long-term stays.

##### 3. What types of listings (room type, property type) align most with traveler preferences in Berlin?
Explore trends in `room_type`, `property_type`, and booking volume to uncover which types of spaces are most commonly chosen by short-term guests.

##### 4. Which hosts exhibit the highest levels of reliability and responsiveness?
Evaluate `host_is_superhost`, `host_response_rate`, and `host_acceptance_rate` to find hosts who maintain strong performance and communication standards.

##### 5. What amenities consistently appear in top-rated listings?
Examine the `amenities` field across listings with high `review_scores_*` to determine which features contribute most to positive guest experiences.

---


### Explore Open Map Data 

In [10]:
# Quick Berlin Museums API - Fixed version with restored API key
# This cell executes fast and gets Berlin museum data efficiently

import requests

# Restore the API key (the cell above might be missing this variable)
API_KEY = "5ae2e3f221c38a28845f05b67c57ad8bd2113f143d1bcda448913ae2"

print("=" * 60)
print("BERLIN MUSEUMS API LOOKUP")
print("=" * 60)

# Simple function to get museums fast
def get_museums_fast():
    """Get Berlin museums with minimal processing for speed"""
    
    url = "https://api.opentripmap.com/0.1/en/places/bbox"
    params = {
        'lon_min': '13.35',
        'lat_min': '52.48', 
        'lon_max': '13.45',
        'lat_max': '52.54',
        'kinds': 'museums',
        'limit': '10',  # Smaller limit for faster response
        'format': 'json',
        'apikey': API_KEY
    }
    
    try:
        # Quick request with short timeout
        response = requests.get(url, params=params, timeout=5)
        
        if response.status_code == 200:
            return response.json()
        else:
            print(f"ERROR: Request failed with status code {response.status_code}")
            return []
            
    except requests.exceptions.Timeout:
        print("ERROR: Request timed out - API might be slow")
        return []
    except Exception as e:
        print(f"ERROR: {e}")
        return []

# Get museums quickly
print("Fetching museum data...")
print("-" * 30)
museums = get_museums_fast()

# Display results simply
if museums:
    print(f"STATUS: Found {len(museums)} museums")
    print("\nMUSEUM LIST:")
    print("-" * 30)
    for i, museum in enumerate(museums[:5], 1):  # Show only first 5
        name = museum.get('name', 'Unnamed')
        print(f"{i}. {name}")
else:
    print("STATUS: No museums found")

print("\n" + "=" * 60)
print("MUSEUMS API LOOKUP COMPLETE")
print("=" * 60)

BERLIN MUSEUMS API LOOKUP
Fetching museum data...
------------------------------
STATUS: Found 10 museums

MUSEUM LIST:
------------------------------
1. ehem. Kunsthaus Tacheles
2. Schwerbelastungskörper
3. Fichtebunker
4. Gropius-Bau
5. Bunker

MUSEUMS API LOOKUP COMPLETE
STATUS: Found 10 museums

MUSEUM LIST:
------------------------------
1. ehem. Kunsthaus Tacheles
2. Schwerbelastungskörper
3. Fichtebunker
4. Gropius-Bau
5. Bunker

MUSEUMS API LOOKUP COMPLETE


##### 1. How does proximity to diverse attractions influence listing pricing patterns?
Investigate whether listings near high-density or multi-category POI zones (e.g., culture + nightlife + landmarks) have noticeably different price behavior compared to isolated or suburban listings.

##### 2. Which areas of Berlin offer the highest guest-rated listings with walkable access to major points of interest?
Identify clusters of listings with strong `review_scores_location` and `review_scores_rating` that are also within short walking distances to top-rated attractions.

##### 3. Can we classify listings by attraction context to better match different traveler profiles (e.g., cultural explorer, nightlife seeker, quiet retreat)?
Use nearby POI categories to tag listings by travel persona, supporting tailored recommendation or filtering (e.g., “best for architecture lovers” or “best for clubbing”).

---


### Explore Weather API

In [None]:
# Simple BrightSky Weather API Test for Berlin
# Quick test to get current Berlin weather data

import requests
from datetime import datetime, timedelta

print("=" * 60)
print("BERLIN WEATHER API TEST")
print("=" * 60)

# Get yesterday's weather data (simple 1-day test)
yesterday = (datetime.now() - timedelta(days=1)).strftime('%Y-%m-%d')

print(f"Fetching weather data for: {yesterday}")
print("-" * 40)

# Simple API request for Berlin weather
url = "https://api.brightsky.dev/weather"
params = {
    'lat': 52.52,        # Berlin latitude
    'lon': 13.405,       # Berlin longitude
    'date': yesterday    # Just get yesterday's data
}

try:
    # Make quick API request
    response = requests.get(url, params=params, timeout=5)
    
    if response.status_code == 200:
        data = response.json()
        weather_records = data.get('weather', [])
        
        if weather_records:
            # Just show first weather record
            weather = weather_records[0]
            temp = weather.get('temperature', 'N/A')
            condition = weather.get('condition', 'Unknown')
            
            print("STATUS: Weather data retrieved successfully")
            print("\nWEATHER DETAILS:")
            print("-" * 30)
            print(f"Date: {yesterday}")
            print(f"Temperature: {temp}°C")
            print(f"Condition: {condition}")
        else:
            print("ERROR: No weather data found in response")
    else:
        print(f"ERROR: API request failed with status code {response.status_code}")
        
except Exception as e:
    print(f"ERROR: {e}")

print("\n" + "=" * 60)
print("WEATHER API TEST COMPLETE")
print("=" * 60)

🌤️ Testing Berlin weather API...
✅ Berlin weather for 2025-08-03:
   🌡️ Temperature: 15.8°C
   ☁️ Condition: dry
🎯 Weather API test done!
✅ Berlin weather for 2025-08-03:
   🌡️ Temperature: 15.8°C
   ☁️ Condition: dry
🎯 Weather API test done!
