In [4]:
# Install and setup PySpark
!pip install pyspark



In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
import os

In [6]:
# Initialize Spark Session
spark = SparkSession.builder \
    .appName("Lab3-Formatting-Zone") \
    .master("local[*]") \
    .config("spark.sql.adaptive.enabled", "true") \
    .getOrCreate()

print("Spark Session Created Successfully!")
print(f"Spark Version: {spark.version}")
spark.sparkContext.setLogLevel("ERROR")

Spark Session Created Successfully!
Spark Version: 3.5.1


In [7]:
spark

In [8]:
#Display all the configurations
print(f"Python version = {spark.sparkContext.pythonVer}")
print(f"Spark version = {spark.sparkContext.version}")
print(spark.sparkContext.getConf().getAll())

Python version = 3.11
Spark version = 3.5.1
[('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false'), ('spark.app.startTime', '1750700285804'), ('spark.app.submitT

In [9]:
# Retrieve the level of parallelism configured (equal to the number of cores is obtained with "*")
print("Master: ",spark.conf.get("spark.master"))
print("Parallelism: ",spark.sparkContext.defaultParallelism)
print("Minimum number of partitions: ",spark.sparkContext.defaultMinPartitions)

Master:  local[*]
Parallelism:  2
Minimum number of partitions:  2


In [10]:
### Formatting Zone

In [11]:
from google.colab import drive
import os
from datetime import datetime

# Mount Google Drive
print("Mounting Google Drive...")
drive.mount('/content/drive')

# Auto-generate project name or let user customize
def setup_project():
    """Interactive setup for the data pipeline project"""
    print("\n" + "="*60)
    print("DATA PIPELINE PROJECT SETUP")
    print("="*60)

    # Option 1: Use default name with timestamp
    default_name = f"data-pipeline-{datetime.now().strftime('%Y%m%d')}"

    print(f"\nDefault project name: {default_name}")
    user_input = input("Press Enter to use default, or type a custom project name: ").strip()

    project_name = user_input if user_input else default_name

    return project_name

# Get project name
PROJECT_NAME = setup_project()

# Set up all paths
BASE_DRIVE_PATH = "/content/drive/MyDrive"
DRIVE_PATH = f"{BASE_DRIVE_PATH}/{PROJECT_NAME}"
LANDING_ZONE = f"{DRIVE_PATH}/landing_zone"
FORMATTED_ZONE = f"{DRIVE_PATH}/formatted_zone"
EXPLOITATION_ZONE = f"{DRIVE_PATH}/exploitation_zone"

# Create directories
def create_project_structure():
    """Create the complete project folder structure"""
    directories = {
        "Project Root": DRIVE_PATH,
        "Landing Zone": LANDING_ZONE,
        "Formatted Zone": FORMATTED_ZONE,
        "Exploitation Zone": EXPLOITATION_ZONE
    }

    print(f"\nCreating project structure in: {DRIVE_PATH}")
    print("-" * 40)

    for name, path in directories.items():
        if not os.path.exists(path):
            os.makedirs(path)
            print(f"✓ Created {name}: {path}")
        else:
            print(f"✓ {name} exists: {path}")

# Run the setup
create_project_structure()

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

DATA PIPELINE PROJECT SETUP

Default project name: data-pipeline-20250623
Press Enter to use default, or type a custom project name: dbm-lab3

Creating project structure in: /content/drive/MyDrive/dbm-lab3
----------------------------------------
✓ Project Root exists: /content/drive/MyDrive/dbm-lab3
✓ Landing Zone exists: /content/drive/MyDrive/dbm-lab3/landing_zone
✓ Formatted Zone exists: /content/drive/MyDrive/dbm-lab3/formatted_zone
✓ Exploitation Zone exists: /content/drive/MyDrive/dbm-lab3/exploitation_zone


For our formatting notebook, we implement the following steps:

1. Read raw CSVs/JSON files from the Landing Zone
2. Perform the required joins/unions (if necessary) to combine files
3. Standardize column names
4. Standardize column data types
5. Check for duplicates and missing values
6. Impute missing values with the mean
7. Store formatted data in the formatting_zone as parquet/delta

### Step 1: Format Income Data

In [12]:
def format_income_data():
    """
    WHAT: Read raw income CSVs, union with year column, standardize schema
    - Read 5 CSV files from your Drive landing_zone/income/
    - Union all years with year column
    - Standardize column names
    - Save as Parquet in formatted_zone (without partitioning)
    """
    print("\nFORMATTING INCOME DATA...")

    # Check what files are actually in your income folder
    income_folder = f"{LANDING_ZONE}/income"
    if os.path.exists(income_folder):
        income_files = [f for f in os.listdir(income_folder) if f.endswith('.csv')]
        print(f"Found income files: {income_files}")

        # Read and union all years
        all_income_dfs = []

        for filename in income_files:
            filepath = f"{income_folder}/{filename}"
            # Extract year from filename
            year = None
            for y in [2013, 2014, 2015, 2016, 2017]:
                if str(y) in filename:
                    year = y
                    break

            if year:
                df = spark.read.csv(filepath, header=True, inferSchema=True)
                # Add year column for each DataFrame
                df = df.withColumn("year", lit(year))
                all_income_dfs.append(df)
                print(f"Loaded {year}: {df.count()} rows")

        if all_income_dfs:
            # Union all DataFrames
            combined_income = all_income_dfs[0]
            for df in all_income_dfs[1:]:
                combined_income = combined_income.union(df)

            print(f"Combined dataset: {combined_income.count()} rows")
            print(f"Column names: {combined_income.columns}")

            # Standardize column names
            try:
                combined_income = combined_income.select(
                    col("Codi_Districte").alias("district_code"),
                    col("Nom_Districte").alias("district_name"),
                    col("Codi_Barri").alias("neighborhood_code"),
                    col("Nom_Barri").alias("neighborhood_name"),
                    col("Població").alias("population"),
                    col("Índex RFD Barcelona = 100").alias("rfd_index"),
                    col("year")  # Keep the year column
                )
                print("Column names standardized")
            except:
                print("Column names different than expected. Using original names.")
                combined_income = combined_income

            # Check data types
            print("\nData types:")
            combined_income.printSchema()

            # Fix data types if needed
            combined_income = combined_income.withColumn("district_code", col("district_code").cast("integer")) \
                                           .withColumn("neighborhood_code", col("neighborhood_code").cast("integer")) \
                                           .withColumn("population", col("population").cast("integer")) \
                                           .withColumn("rfd_index", col("rfd_index").cast("double")) \
                                           .withColumn("year", col("year").cast("integer"))

            print("Data types after casting:")
            combined_income.printSchema()

            # Check for missing values
            print("\nMissing values check:")
            missing_counts = combined_income.select([
                sum(col(c).isNull().cast("int")).alias(c)
                for c in combined_income.columns
            ])
            missing_counts.show()

            # Simple mean imputation for missing RFD values
            if combined_income.filter(col("rfd_index").isNull()).count() > 0:
                print("\nMean imputation for missing RFD values...")
                mean_rfd = combined_income.agg(avg("rfd_index")).collect()[0][0]
                print(f"Mean RFD index: {mean_rfd:.2f}")
                combined_income = combined_income.fillna({"rfd_index": mean_rfd})
                print("Mean imputation completed")

            # Check for duplicates
            print("\nDuplicate check:")
            original_count = combined_income.count()
            distinct_count = combined_income.distinct().count()
            duplicates = original_count - distinct_count
            print(f"Original rows: {original_count}")
            print(f"Distinct rows: {distinct_count}")
            print(f"Duplicate rows: {duplicates}")

            if duplicates > 0:
                print("Removing duplicates...")
                combined_income = combined_income.distinct()
                print(f"Rows after removing duplicates: {combined_income.count()}")

            # Check for business key duplicates
            business_key_dups = combined_income.groupBy("district_code", "neighborhood_code", "year").count().filter(col("count") > 1)
            dup_count = business_key_dups.count()

            if dup_count > 0:
                print(f"Found {dup_count} business key duplicates (district + neighborhood + year):")
                business_key_dups.show()
            else:
                print("No business key duplicates found")

            # Data validation
            negative_pop = combined_income.filter(col("population") < 0).count()
            print(f"Negative population values: {negative_pop}")

            # Show basic statistics
            print("\nBasic statistics:")
            combined_income.describe().show()

            # Remove rows where district_name or neighborhood_name is "No consta"
            print(f"Rows before filtering: {combined_income.count()}")
            combined_income = combined_income.filter(col("district_name") != "No consta")
            combined_income = combined_income.filter(col("neighborhood_name") != "No consta")
            print(f"Rows after filtering: {combined_income.count()}")

            # Save as Parquet WITHOUT partitioning
            output_path = f"{FORMATTED_ZONE}/income"
            combined_income.write.mode("overwrite").parquet(output_path)
            print(f"Saved formatted income data: {combined_income.count()} total rows")
            print(f"Saved to: {output_path}")

            return combined_income

        else:
            print("No income files found")
            return None
    else:
        print(f"Income folder not found: {income_folder}")
        return None

In [13]:
income = format_income_data()
income.show()


FORMATTING INCOME DATA...
Found income files: ['2014_Distribucio_territorial_renda_familiar.csv', '2007_Distribucio_territorial_renda_familiar.csv', '2010_Distribucio_territorial_renda_familiar.csv', '2008_Distribucio_territorial_renda_familiar.csv', '2015_Distribucio_territorial_renda_familiar.csv', '2011_Distribucio_territorial_renda_familiar.csv', '2009_Distribucio_territorial_renda_familiar.csv', '2013_Distribucio_territorial_renda_familiar.csv', '2012_Distribucio_territorial_renda_familiar.csv', '2016_Distribucio_territorial_renda_familiar.csv', '2017_Distribució_territorial_renda_familiar.csv']
Loaded 2014: 74 rows
Loaded 2015: 74 rows
Loaded 2013: 74 rows
Loaded 2016: 73 rows
Loaded 2017: 73 rows
Combined dataset: 368 rows
Column names: ['Any', 'Codi_Districte', 'Nom_Districte', 'Codi_Barri', 'Nom_Barri', 'Població', 'Índex RFD Barcelona = 100', 'year']
Column names standardized

Data types:
root
 |-- district_code: integer (nullable = true)
 |-- district_name: string (nullabl

### Step 2: Format Cultural Sites Data

In [14]:
def format_cultural_sites_data():
  """
  WHAT: Read raw cultural sites CSV, standardize schema
  - Standardize column names
  - Standardize data types
  - Handle missing/duplicate values
  - Select relevant columns
  - Save as Parquet in formatted_zone
  """
  print("\nFORMATTING CULTURAL SITES DATA...")

  # Define the cultural sites folder path
  cultural_folder = f"{LANDING_ZONE}/cultural-sites"

  df = spark.read.csv(cultural_folder, header=True, inferSchema=True)

  # Step 1: Drop columns
  columns_to_keep = ['addresses_zip_code',
                    'values_attribute_name',
                    # 'values_category',
                    'addresses_district_name',
                    'name',
                    'addresses_neighborhood_name',
                    'geo_epgs_4326_y',
                    'geo_epgs_4326_x',
                    'created']



  df = df.select(
      col("addresses_district_name").alias("district_name"),
      col("addresses_neighborhood_name").alias("neighborhood_name"),
      col("name").alias("cultural_site_name"),
      col("geo_epgs_4326_y").alias("cultural_site_latitude"),
      col("geo_epgs_4326_x").alias("cultural_site_longitude"),
      col("created").cast("date").alias("creation_date")
  )
  print("Column names standardized")

  # Check data types
  print("\nData types:")
  df.printSchema()

  df = df.withColumn(
    "neighborhood_name",
    when(col("neighborhood_name") == "el Poble-sec", "el Poble Sec")
    .otherwise(col("neighborhood_name"))
  )

  # Save as Parquet partitioned by year
  output_path = f"{FORMATTED_ZONE}/cultural-sites"
  df.write.mode("overwrite").parquet(output_path)
  print(f"Saved formatted income data: {df.count()} total rows")
  print(f"Saved to: {output_path}")

  return df


In [15]:
cultural_sites = format_cultural_sites_data()
cultural_sites.show()


FORMATTING CULTURAL SITES DATA...
Column names standardized

Data types:
root
 |-- district_name: string (nullable = true)
 |-- neighborhood_name: string (nullable = true)
 |-- cultural_site_name: string (nullable = true)
 |-- cultural_site_latitude: double (nullable = true)
 |-- cultural_site_longitude: double (nullable = true)
 |-- creation_date: date (nullable = true)

Saved formatted income data: 871 total rows
Saved to: /content/drive/MyDrive/dbm-lab3/formatted_zone/cultural-sites
+-------------------+--------------------+--------------------+----------------------+-----------------------+-------------+
|      district_name|   neighborhood_name|  cultural_site_name|cultural_site_latitude|cultural_site_longitude|creation_date|
+-------------------+--------------------+--------------------+----------------------+-----------------------+-------------+
|             Gràcia|Vallcarca i els P...|Parc Sanitari Per...|     2.140265906627508|      41.41495508215849|   1988-03-14|
|     Ho

In [16]:
cultural_sites.select("district_name").distinct().collect()

[Row(district_name='Gràcia'),
 Row(district_name='Sant Martí'),
 Row(district_name='Horta-Guinardó'),
 Row(district_name='Les Corts'),
 Row(district_name='Sants-Montjuïc'),
 Row(district_name='Nou Barris'),
 Row(district_name='Sarrià-Sant Gervasi'),
 Row(district_name='Eixample'),
 Row(district_name='Sant Andreu'),
 Row(district_name='Ciutat Vella')]

In [17]:
income.select("district_name").distinct().collect()

[Row(district_name='Gràcia'),
 Row(district_name='Sant Martí'),
 Row(district_name='Horta-Guinardó'),
 Row(district_name='Les Corts'),
 Row(district_name='Sants-Montjuïc'),
 Row(district_name='Nou Barris'),
 Row(district_name='Sarrià-Sant Gervasi'),
 Row(district_name='Eixample'),
 Row(district_name='Sant Andreu'),
 Row(district_name='Ciutat Vella')]

In [18]:
cultural_sites.select("neighborhood_name").distinct().collect()

[Row(neighborhood_name='el Poblenou'),
 Row(neighborhood_name='la Vila de Gràcia'),
 Row(neighborhood_name='el Besòs i el Maresme'),
 Row(neighborhood_name='la Guineueta'),
 Row(neighborhood_name="la Dreta de l'Eixample"),
 Row(neighborhood_name='el Barri Gòtic'),
 Row(neighborhood_name='el Guinardó'),
 Row(neighborhood_name='Vallbona'),
 Row(neighborhood_name='Provençals del Poblenou'),
 Row(neighborhood_name='la Verneda i la Pau'),
 Row(neighborhood_name='Vilapicina i la Torre Llobeta'),
 Row(neighborhood_name="l'Antiga Esquerra de l'Eixample"),
 Row(neighborhood_name='la Marina de Port'),
 Row(neighborhood_name='Sarrià'),
 Row(neighborhood_name='la Marina del Prat Vermell'),
 Row(neighborhood_name='Torre Baró'),
 Row(neighborhood_name='la Trinitat Nova'),
 Row(neighborhood_name='Sant Antoni'),
 Row(neighborhood_name='Baró de Viver'),
 Row(neighborhood_name='Sant Gervasi - Galvany'),
 Row(neighborhood_name='el Congrés i els Indians'),
 Row(neighborhood_name='Sant Pere, Santa Caterina

In [19]:
income.select("neighborhood_name").distinct().collect()

[Row(neighborhood_name='el Poblenou'),
 Row(neighborhood_name='la Vila de Gràcia'),
 Row(neighborhood_name='el Besòs i el Maresme'),
 Row(neighborhood_name='la Guineueta'),
 Row(neighborhood_name='la Teixonera'),
 Row(neighborhood_name="la Dreta de l'Eixample"),
 Row(neighborhood_name='el Barri Gòtic'),
 Row(neighborhood_name='el Guinardó'),
 Row(neighborhood_name='Vallbona'),
 Row(neighborhood_name='Canyelles'),
 Row(neighborhood_name='Provençals del Poblenou'),
 Row(neighborhood_name='la Verneda i la Pau'),
 Row(neighborhood_name='Vilapicina i la Torre Llobeta'),
 Row(neighborhood_name="l'Antiga Esquerra de l'Eixample"),
 Row(neighborhood_name='Navas'),
 Row(neighborhood_name='la Marina de Port'),
 Row(neighborhood_name='la Marina del Prat Vermell'),
 Row(neighborhood_name='Sarrià'),
 Row(neighborhood_name='Torre Baró'),
 Row(neighborhood_name='la Trinitat Nova'),
 Row(neighborhood_name='Sant Antoni'),
 Row(neighborhood_name='Baró de Viver'),
 Row(neighborhood_name='Sant Gervasi - Ga

We want to compare the names to ensure we have matches across the datasets. We will do so by creating sets and compare the unique values in the sets.

In [20]:
cultural_neighborhoods = {row["neighborhood_name"] for row in cultural_sites.select("neighborhood_name").distinct().collect()}
income_neighborhoods = {row["neighborhood_name"] for row in income.select("neighborhood_name").distinct().collect()}

# Check if they're the same
are_same = cultural_neighborhoods == income_neighborhoods
print(f"Same neighborhoods: {are_same}")

# Find differences
only_in_cultural = cultural_neighborhoods - income_neighborhoods
only_in_income = income_neighborhoods - cultural_neighborhoods

print(f"Only in cultural: {only_in_cultural}")
print(f"Only in income: {only_in_income}")
print(f"Common neighborhoods: {len(cultural_neighborhoods & income_neighborhoods)}")

Same neighborhoods: False
Only in cultural: set()
Only in income: {'les Roquetes', 'Canyelles', 'Navas', 'la Teixonera', 'la Prosperitat'}
Common neighborhoods: 68


### Part 3: Format Price-opendata JSON format

In [21]:
def format_prices_data():
    """
    WHAT: Read raw JSON, flatten nested structure, standardize schema
    - Read JSON from your Drive landing_zone/prices/
    - Flatten nested 'info' array structure
    - Standardize column names
    - Save as Parquet in formatted_zone
    """
    print("\nFORMATTING PRICES DATA...")

    # Check what files are in prices folder
    prices_folder = f"{LANDING_ZONE}/price_opendata"

    json_file = f"{prices_folder}/price_opendata_neighborhood.json"

    # Read JSON
    prices_df = spark.read.json(json_file)
    print(f"Original data: {prices_df.count()} rows")
    print("Schema:")
    prices_df.printSchema()

    # Flatten the nested structure
    flattened = prices_df.select(
        col("neigh_name ").alias("neighborhood_name"),
        col("district_name"),
        explode(col("info")).alias("year_info")
    ).select(
        col("neighborhood_name"),
        col("district_name"),
        col("year_info.year").alias("year"),
        col("year_info.Amount").alias("amount"),
        col("year_info.usedAmount").alias("used_amount"),
        col("year_info.PerMeter").alias("per_meter"),
        col("year_info.usedPerMeter").alias("used_per_meter")
    )

    print(f"After flattening: {flattened.count()} rows")

    # Clean neighborhood names
    flattened = flattened.withColumn("neighborhood_name", trim(col("neighborhood_name")))

    # Check data types and cast appropriately
    print("Data types:")
    flattened.printSchema()

    # Cast to appropriate types
    flattened = flattened.withColumn("year", col("year").cast("integer")) \
                          .withColumn("amount", col("amount").cast("double")) \
                          .withColumn("used_amount", col("used_amount").cast("double")) \
                          .withColumn("per_meter", col("per_meter").cast("double")) \
                          .withColumn("used_per_meter", col("used_per_meter").cast("double"))

    # Check for missing values
    print("\nMissing values check:")
    missing_counts = flattened.select([
        sum(col(c).isNull().cast("int")).alias(c)
        for c in flattened.columns
    ])
    missing_counts.show()

    # Check for duplicates
    print("\nDuplicate check:")
    original_count = flattened.count()
    distinct_count = flattened.distinct().count()
    duplicates = original_count - distinct_count
    print(f"Original rows: {original_count}")
    print(f"Distinct rows: {distinct_count}")
    print(f"Duplicate rows: {duplicates}")

    if duplicates > 0:
        flattened = flattened.distinct()
        print(f"Rows after removing duplicates: {flattened.count()}")

    # Data validation
    print("\nData validation:")
    print("Year range:")
    flattened.select(min("year"), max("year")).show()

    print("Price statistics:")
    flattened.select("per_meter").describe().show()

    # Save as Parquet partitioned by year
    output_path = f"{FORMATTED_ZONE}/prices"
    flattened.write.mode("overwrite").partitionBy("year").parquet(output_path)
    print(f"Saved formatted prices data: {flattened.count()} rows")
    print(f"Saved to: {output_path}")

    return flattened

In [22]:
prices = format_prices_data()
prices.show()


FORMATTING PRICES DATA...
Original data: 73 rows
Schema:
root
 |-- _id: long (nullable = true)
 |-- district_id: long (nullable = true)
 |-- district_name: string (nullable = true)
 |-- info: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Amount: double (nullable = true)
 |    |    |-- PerMeter: double (nullable = true)
 |    |    |-- diffAmount: double (nullable = true)
 |    |    |-- diffPerMeter: double (nullable = true)
 |    |    |-- usedAmount: double (nullable = true)
 |    |    |-- usedPerMeter: double (nullable = true)
 |    |    |-- year: long (nullable = true)
 |-- neigh_name : string (nullable = true)

After flattening: 359 rows
Data types:
root
 |-- neighborhood_name: string (nullable = true)
 |-- district_name: string (nullable = true)
 |-- year: long (nullable = true)
 |-- amount: double (nullable = true)
 |-- used_amount: double (nullable = true)
 |-- per_meter: double (nullable = true)
 |-- used_per_meter: double (nullable = tru

In [23]:
prices.select("neighborhood_name").distinct().collect()

[Row(neighborhood_name='el Poblenou'),
 Row(neighborhood_name='la Vila de Gràcia'),
 Row(neighborhood_name='el Besòs i el Maresme'),
 Row(neighborhood_name='la Guineueta'),
 Row(neighborhood_name='la Teixonera'),
 Row(neighborhood_name="la Dreta de l'Eixample"),
 Row(neighborhood_name='el Barri Gòtic'),
 Row(neighborhood_name='el Guinardó'),
 Row(neighborhood_name='Vallbona'),
 Row(neighborhood_name='Canyelles'),
 Row(neighborhood_name='Provençals del Poblenou'),
 Row(neighborhood_name='la Verneda i la Pau'),
 Row(neighborhood_name='Vilapicina i la Torre Llobeta'),
 Row(neighborhood_name="l'Antiga Esquerra de l'Eixample"),
 Row(neighborhood_name='Navas'),
 Row(neighborhood_name='la Marina de Port'),
 Row(neighborhood_name='Sarrià'),
 Row(neighborhood_name='la Marina del Prat Vermell'),
 Row(neighborhood_name='Torre Baró'),
 Row(neighborhood_name='la Trinitat Nova'),
 Row(neighborhood_name='Sant Antoni'),
 Row(neighborhood_name='Baró de Viver'),
 Row(neighborhood_name='Sant Gervasi - Ga

In [24]:
prices.select("district_name").distinct().collect()

[Row(district_name='Gràcia'),
 Row(district_name='Sant Martí'),
 Row(district_name='Horta-Guinardó'),
 Row(district_name='Les Corts'),
 Row(district_name='Sants-Montjuïc'),
 Row(district_name='Nou Barris'),
 Row(district_name='Sarrià-Sant Gervasi'),
 Row(district_name='Eixample'),
 Row(district_name='Sant Andreu'),
 Row(district_name='Ciutat Vella')]

In [25]:
cultural_neighborhoods = {row["neighborhood_name"] for row in cultural_sites.select("neighborhood_name").distinct().collect()}
income_neighborhoods = {row["neighborhood_name"] for row in income.select("neighborhood_name").distinct().collect()}
prices_neighborhoods = {row["neighborhood_name"] for row in prices.select("neighborhood_name").distinct().collect()}

# Check if they're the same
are_same = cultural_neighborhoods == income_neighborhoods == prices_neighborhoods
print(f"Same neighborhoods: {are_same}")

# Find differences - what's in one but not in the others
only_in_cultural = cultural_neighborhoods - income_neighborhoods - prices_neighborhoods
only_in_income = income_neighborhoods - cultural_neighborhoods - prices_neighborhoods
only_in_prices = prices_neighborhoods - cultural_neighborhoods - income_neighborhoods

# What's in cultural but not in income (regardless of prices)
cultural_not_income = cultural_neighborhoods - income_neighborhoods
income_not_cultural = income_neighborhoods - cultural_neighborhoods

# What's in prices but not in income (regardless of cultural)
prices_not_income = prices_neighborhoods - income_neighborhoods
income_not_prices = income_neighborhoods - prices_neighborhoods

print(f"Only in cultural (not in either other): {only_in_cultural}")
print(f"Only in income (not in either other): {only_in_income}")
print(f"Only in prices (not in either other): {only_in_prices}")

print(f"In cultural but not income: {cultural_not_income}")
print(f"In income but not cultural: {income_not_cultural}")
print(f"In prices but not income: {prices_not_income}")
print(f"In income but not prices: {income_not_prices}")

print(f"Common to all three: {len(cultural_neighborhoods & income_neighborhoods & prices_neighborhoods)}")

Same neighborhoods: False
Only in cultural (not in either other): set()
Only in income (not in either other): set()
Only in prices (not in either other): set()
In cultural but not income: set()
In income but not cultural: {'les Roquetes', 'Canyelles', 'Navas', 'la Teixonera', 'la Prosperitat'}
In prices but not income: set()
In income but not prices: set()
Common to all three: 68


In [26]:
# End session
spark.stop()