In this file we load in the raw data to pyspark, deal with outliers, drop irrelevant columns, rename variables to standardized names, and convert variables to the correct format.

In [87]:
import pandas as pd
import glob
import re

from pyspark.sql import SparkSession
from pyspark import  SparkConf
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from pyspark.sql.functions import lit
from pyspark.sql.functions import to_date
from pyspark.sql.functions import col, explode
from pyspark.ml.feature import Imputer
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, input_file_name, regexp_extract

# Set up pyspark session

In [88]:
appName = "app"
master = "local[*]" # Spark will use all cores (*) available
if not 'spark' in globals(): # This 'trick' makes sure the SparkContext sc is initialized exactly once
  conf = SparkConf().setAppName(appName).setMaster(master)
  spark = SparkSession.builder \
        .config(conf=conf) \
        .getOrCreate()

In [89]:
#Display the information kept in the spark variable
spark

# Idealista

In [90]:
# List of JSON files
json_files = glob.glob('../datasets/idealista/*.json')

# Read the first JSON file to infer the schema
first_file = '../datasets/idealista/2020_08_06_idealista.json'
first_df = spark.read.json(first_file)
schema = first_df.schema

# Load JSON files into Spark DataFrames using inferred schema
idealista_dfs = []
for file in json_files:
    # Extract date from filename 
    date_match = re.search(r'(\d{4}_\d{2}_\d{2})', file)
    date_str = date_match.group(1)  # Extracting the date string from the filename
    date_column = to_date(lit(date_str), 'yyyy_MM_dd')  # Converting date string to date format
    idealista_df = spark.read.schema(schema).json(file).withColumn('date', date_column)  # Adding the 'date' column
    idealista_dfs.append(idealista_df)

# Union all the DataFrames
idealista = idealista_dfs[0]
for df in idealista_dfs[1:]:
    idealista = idealista.union(df)

In [91]:
idealista.show(5)

+--------------------+---------+-------+------------+--------+--------------+--------+-----------------+-----+------+---------+-------+-------+----------+--------+----------+---------+------------+--------------------+--------------+---------+---------+------------+--------+-----------+------------+------------+---------+-----+-----------+----+------+--------------------+--------------------+-----------------+--------------------+----------+
|             address|bathrooms|country|detailedType|distance|      district|exterior|externalReference|floor|has360|has3DTour|hasLift|hasPlan|hasStaging|hasVideo|  latitude|longitude|municipality|        neighborhood|newDevelopment|numPhotos|operation|parkingSpace|   price|priceByArea|propertyCode|propertyType| province|rooms|showAddress|size|status|      suggestedTexts|           thumbnail|topNewDevelopment|                 url|      date|
+--------------------+---------+-------+------------+--------+--------------+--------+-----------------+----

24/06/06 12:49:50 WARN DAGScheduler: Broadcasting large task binary with size 3.0 MiB


In [92]:
# drop duplicates
idealista = idealista.dropDuplicates()

In [93]:
# Invetigate
idealista.describe().show()

24/06/06 12:49:55 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
24/06/06 12:49:58 WARN DAGScheduler: Broadcasting large task binary with size 1818.8 KiB
[Stage 307:>                                                        (0 + 7) / 7]

+-------+----------------+------------------+-------+------------------+--------------------+-----------------+-----------------+-------------------+-------------------+------------+--------------------+------------------+---------+------------------+------------------+-------------------+------------+---------+------------------+-----------------+------+--------------------+--------------------+
|summary|         address|         bathrooms|country|          distance|            district|externalReference|            floor|           latitude|          longitude|municipality|        neighborhood|         numPhotos|operation|             price|       priceByArea|       propertyCode|propertyType| province|             rooms|             size|status|           thumbnail|                 url|
+-------+----------------+------------------+-------+------------------+--------------------+-----------------+-----------------+-------------------+-------------------+------------+------------------

24/06/06 12:49:59 WARN DAGScheduler: Broadcasting large task binary with size 1958.9 KiB
                                                                                

In [94]:
# show schema
idealista.printSchema()

root
 |-- address: string (nullable = true)
 |-- bathrooms: long (nullable = true)
 |-- country: string (nullable = true)
 |-- detailedType: struct (nullable = true)
 |    |-- subTypology: string (nullable = true)
 |    |-- typology: string (nullable = true)
 |-- distance: string (nullable = true)
 |-- district: string (nullable = true)
 |-- exterior: boolean (nullable = true)
 |-- externalReference: string (nullable = true)
 |-- floor: string (nullable = true)
 |-- has360: boolean (nullable = true)
 |-- has3DTour: boolean (nullable = true)
 |-- hasLift: boolean (nullable = true)
 |-- hasPlan: boolean (nullable = true)
 |-- hasStaging: boolean (nullable = true)
 |-- hasVideo: boolean (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- municipality: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- newDevelopment: boolean (nullable = true)
 |-- numPhotos: long (nullable = true)
 |-- operation: string (nullable =

In [95]:
# Split the 'parkingSpace' struct column into three separate columns
idealista  = idealista  \
    .select(
        col("*"),
        col("parkingSpace.hasParkingSpace").alias("hasParkingSpace"),
        col("parkingSpace.isParkingSpaceIncludedInPrice").alias("isParkingSpaceIncludedInPrice"),
        col("parkingSpace.parkingSpacePrice").alias("parkingSpacePrice")
    ) \
    .drop("parkingSpace")

In [96]:
# Split topology into two columns
idealista  = idealista  \
    .select(
        col("*"),
        col("detailedType.subTypology").alias("subTypology"),
        col("detailedType.typology").alias("typology")
    ) \
    .drop("detailedType")

In [97]:
# drop columns which dont contain useful information
idealista = idealista.drop('thumbnail', 'url', 'country' 'suggestedTexts', 'externalReference', 'subTypology', 'suggestedTexts', 'status')

# replace floor col with 0 when its st 
idealista = idealista.withColumn('floor', F.when(F.col('floor') == 'st', 0).otherwise(F.col('floor')))
idealista = idealista.withColumn('floor', idealista['floor'].cast(IntegerType()))

# Convert to interger
idealista = idealista \
    .withColumn("distance", idealista["distance"].cast("double")) \
    .withColumn("rooms", idealista["rooms"].cast("integer")) \
    .withColumn("bathrooms", idealista["bathrooms"].cast("integer")) 
    
row_count = idealista.count()
print("Number of rows:", row_count)

24/06/06 12:50:04 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB

Number of rows: 20040


24/06/06 12:50:07 WARN DAGScheduler: Broadcasting large task binary with size 1738.8 KiB
                                                                                

In [98]:
# filll in missing parking space values with false
idealista = idealista.fillna({'hasParkingSpace': False, 'isParkingSpaceIncludedInPrice': False, 'parkingSpacePrice': 0})

In [99]:
# Drop if missing values in district and neighborhood
idealista = idealista.dropna(subset=['district', 'neighborhood'])

In [100]:
# Check for missing values
#for col_name in idealista.columns:
#    # Check if the column is not of numeric type
#    if "int" in str(idealista.schema[col_name].dataType) or "double" in str(idealista.schema[col_name].dataType):
#        missing_count = idealista.filter(F.col(col_name).isNull() | F.isnan(col_name)).count()
#    else:
#        missing_count = idealista.filter(F.col(col_name).isNull()).count()
#    print(f"Missing values for column {col_name}: {missing_count}")

In [101]:
# Fill floor
floor_imputer = Imputer(strategy='mode', inputCols=['floor'], outputCols=['floor'])
idealista = floor_imputer.fit(idealista).transform(idealista)

# Fill has lift with false if missing
idealista = idealista.fillna({'hasLift': False})

24/06/06 12:50:12 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/06/06 12:50:14 WARN DAGScheduler: Broadcasting large task binary with size 1841.6 KiB
24/06/06 12:50:15 WARN DAGScheduler: Broadcasting large task binary with size 1823.9 KiB
24/06/06 12:50:15 WARN DAGScheduler: Broadcasting large task binary with size 1824.2 KiB
                                                                                

In [102]:
idealista.show(20)

24/06/06 12:50:20 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB

+--------------------+---------+-------+--------+-------------------+--------+-----+------+---------+-------+-------+----------+--------+----------+---------+------------+--------------------+--------------+---------+---------+---------+-----------+------------+------------+---------+-----+-----------+-----+-----------------+----------+---------------+-----------------------------+-----------------+--------+
|             address|bathrooms|country|distance|           district|exterior|floor|has360|has3DTour|hasLift|hasPlan|hasStaging|hasVideo|  latitude|longitude|municipality|        neighborhood|newDevelopment|numPhotos|operation|    price|priceByArea|propertyCode|propertyType| province|rooms|showAddress| size|topNewDevelopment|      date|hasParkingSpace|isParkingSpaceIncludedInPrice|parkingSpacePrice|typology|
+--------------------+---------+-------+--------+-------------------+--------+-----+------+---------+-------+-------+----------+--------+----------+---------+------------+-----

24/06/06 12:50:22 WARN DAGScheduler: Broadcasting large task binary with size 1843.9 KiB
                                                                                

In [103]:
def find_outliers(df, column_names):

    for column in column_names:

        less_Q1 = 'less_Q1_{}'.format(column)
        more_Q3 = 'more_Q3_{}'.format(column)
        Q1 = 'Q1_{}'.format(column)
        Q3 = 'Q3_{}'.format(column)

        # Q1 : First Quartile ., Q3 : Third Quartile
        Q1 = df.approxQuantile(column,[0.25],relativeError=0)
        Q3 = df.approxQuantile(column,[0.75],relativeError=0)
        
        # IQR : Inter Quantile Range
        IQR = Q3[0] - Q1[0]
        
        #selecting the data, with -2.5*IQR to + 2.5*IQR
        less_Q1 =  Q1[0] - 2.5*IQR
        more_Q3 =  Q3[0] + 2.5*IQR
        
        isOutlierCol = 'is_outlier_{}'.format(column)
        
        df = df.withColumn(isOutlierCol,F.when((df[column] > more_Q3) | (df[column] < less_Q1), 1).otherwise(0))
    

    # Selecting the specific columns which we have added above, to check if there are any outliers
    selected_columns = [column for column in df.columns if column.startswith("is_outlier")]

    # Adding all the outlier columns into a new colum "total_outliers", to see the total number of outliers
    df = df.withColumn('total_outliers',sum(df[column] for column in selected_columns))

    # Dropping the extra columns created above, just to create nice dataframe., without extra columns
    df = df.drop(*[column for column in df.columns if column.startswith("is_outlier")])

    # drop total_outliers column 
    df = df.drop('total_outliers')

    return df
    

In [104]:
idealista = find_outliers(idealista, ['price' ,'priceByArea', 'size'])

24/06/06 12:50:27 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
24/06/06 12:50:29 WARN DAGScheduler: Broadcasting large task binary with size 1685.6 KiB
24/06/06 12:50:31 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
24/06/06 12:50:33 WARN DAGScheduler: Broadcasting large task binary with size 1685.6 KiB
24/06/06 12:50:35 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
24/06/06 12:50:37 WARN DAGScheduler: Broadcasting large task binary with size 1685.6 KiB
24/06/06 12:50:39 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
24/06/06 12:50:41 WARN DAGScheduler: Broadcasting large task binary with size 1685.6 KiB
24/06/06 12:50:42 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
24/06/06 12:50:44 WARN DAGScheduler: Broadcasting large task binary with size 1685.6 KiB
24/06/06 12:50:46 WARN DAGScheduler: Broadcasting large task binary with size 3.3 MiB
24/06/06 12:50:48 WARN DAGScheduler: Br

In [105]:
row_count = idealista.count()
print("Number of rows:", row_count)

24/06/06 12:50:50 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB

Number of rows: 12267


24/06/06 12:50:52 WARN DAGScheduler: Broadcasting large task binary with size 1818.7 KiB
                                                                                

In [106]:
# Create year and month columns
idealista = idealista.withColumn('year', F.year(F.to_timestamp('date', 'yyyy-MM-dd')))

In [107]:
# Rename columns to lowercase
idealista = idealista.toDF(*[c.lower() for c in idealista.columns])

In [109]:
# write data to  Parquet format, partitioned by year
idealista.write.partitionBy('year').mode('overwrite').parquet("../formatted_zone/idealista.parquet")

24/06/06 12:51:10 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/06/06 12:51:15 WARN DAGScheduler: Broadcasting large task binary with size 2045.6 KiB
                                                                                

In [110]:
idealista.printSchema()

root
 |-- address: string (nullable = true)
 |-- bathrooms: integer (nullable = true)
 |-- country: string (nullable = true)
 |-- distance: double (nullable = true)
 |-- district: string (nullable = true)
 |-- exterior: boolean (nullable = true)
 |-- floor: integer (nullable = true)
 |-- has360: boolean (nullable = true)
 |-- has3dtour: boolean (nullable = true)
 |-- haslift: boolean (nullable = false)
 |-- hasplan: boolean (nullable = true)
 |-- hasstaging: boolean (nullable = true)
 |-- hasvideo: boolean (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- municipality: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- newdevelopment: boolean (nullable = true)
 |-- numphotos: long (nullable = true)
 |-- operation: string (nullable = true)
 |-- price: double (nullable = true)
 |-- pricebyarea: double (nullable = true)
 |-- propertycode: string (nullable = true)
 |-- propertytype: string (nullable = true)
 |-- p

# Prices data

In [111]:
prices_file = "../datasets/price_opendata/price_opendata_neighborhood.json"
prices_df = spark.read.json(prices_file)

In [112]:
prices_df.printSchema()

root
 |-- _id: long (nullable = true)
 |-- district_id: long (nullable = true)
 |-- district_name: string (nullable = true)
 |-- info: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- Amount: double (nullable = true)
 |    |    |-- PerMeter: double (nullable = true)
 |    |    |-- diffAmount: double (nullable = true)
 |    |    |-- diffPerMeter: double (nullable = true)
 |    |    |-- usedAmount: double (nullable = true)
 |    |    |-- usedPerMeter: double (nullable = true)
 |    |    |-- year: long (nullable = true)
 |-- neigh_name : string (nullable = true)



In [113]:
# Flatten the nested structure
prices_df = prices_df.withColumn("info", explode("info"))
prices_df.printSchema()

root
 |-- _id: long (nullable = true)
 |-- district_id: long (nullable = true)
 |-- district_name: string (nullable = true)
 |-- info: struct (nullable = true)
 |    |-- Amount: double (nullable = true)
 |    |-- PerMeter: double (nullable = true)
 |    |-- diffAmount: double (nullable = true)
 |    |-- diffPerMeter: double (nullable = true)
 |    |-- usedAmount: double (nullable = true)
 |    |-- usedPerMeter: double (nullable = true)
 |    |-- year: long (nullable = true)
 |-- neigh_name : string (nullable = true)



In [114]:
# Split into individual columns
prices_df  = prices_df  \
    .select(
        col("*"),
        col("info.Amount").alias("amount"),
        col("info.PerMeter").alias("per_meter"),
        col("info.diffAmount").alias("diff_amount"),
        col("info.diffPerMeter").alias("diff_per_meter"),
        col("info.usedAmount").alias("used_amount"),
        col("info.usedPerMeter").alias("used_per_meter"),
        col("info.year").alias("year")
    ) \
    .drop("info")

In [115]:
prices_df.printSchema()

root
 |-- _id: long (nullable = true)
 |-- district_id: long (nullable = true)
 |-- district_name: string (nullable = true)
 |-- neigh_name : string (nullable = true)
 |-- amount: double (nullable = true)
 |-- per_meter: double (nullable = true)
 |-- diff_amount: double (nullable = true)
 |-- diff_per_meter: double (nullable = true)
 |-- used_amount: double (nullable = true)
 |-- used_per_meter: double (nullable = true)
 |-- year: long (nullable = true)



In [116]:
prices_df = prices_df.withColumn('year', prices_df['year'].cast(IntegerType()))
prices_df.show(5)

+---+-----------+-------------+-----------+------+---------+-----------+--------------+-----------+--------------+----+
|_id|district_id|district_name|neigh_name |amount|per_meter|diff_amount|diff_per_meter|used_amount|used_per_meter|year|
+---+-----------+-------------+-----------+------+---------+-----------+--------------+-----------+--------------+----+
|  1|          1| Ciutat Vella|   el Raval|  97.0|   1726.5|       NULL|          NULL|       97.0|        1726.5|2013|
|  1|          1| Ciutat Vella|   el Raval| 141.7|   2087.6|       99.1|        2534.3|      143.1|        2073.5|2014|
|  1|          1| Ciutat Vella|   el Raval| 193.8|   2401.9|       NULL|          NULL|      193.8|        2401.9|2015|
|  1|          1| Ciutat Vella|   el Raval| 181.0|   2805.2|       NULL|          NULL|      180.7|        2798.6|2016|
|  1|          1| Ciutat Vella|   el Raval| 240.3|   3469.9|      292.5|        3633.1|      240.0|        3468.9|2017|
+---+-----------+-------------+---------

In [117]:
# drop duplicates
prices_df = prices_df.dropDuplicates()

In [118]:
# Check for missing values
for col_name in prices_df.columns:
    # Check if the column is not of numeric type
    if "int" in str(prices_df.schema[col_name].dataType) or "double" in str(prices_df.schema[col_name].dataType):
        missing_count = prices_df.filter(F.col(col_name).isNull() | F.isnan(col_name)).count()
    else:
        missing_count = prices_df.filter(F.col(col_name).isNull()).count()
    print(f"Missing values for column {col_name}: {missing_count}")

Missing values for column _id: 0
Missing values for column district_id: 0
Missing values for column district_name: 0
Missing values for column neigh_name : 0
Missing values for column amount: 0
Missing values for column per_meter: 0
Missing values for column diff_amount: 149
Missing values for column diff_per_meter: 148
Missing values for column used_amount: 2
Missing values for column used_per_meter: 2
Missing values for column year: 0


In [119]:
row_count = prices_df.count()
print("Number of rows:", row_count)

Number of rows: 359


In [120]:
# drop diff_amount and diff_per_meter as mostly missing values
prices_df = prices_df.drop('diff_amount', 'diff_per_meter')

In [121]:
# Create an Imputer object to fill missing values with mean, median, etc.

imputer = Imputer(inputCols=['used_amount', 'used_per_meter'], outputCols=['used_amount', 'used_per_meter'])

# Fit the imputer to the data and transform
prices_df = imputer.fit(prices_df).transform(prices_df)

In [122]:
# Check for missing values
for col_name in prices_df.columns:
    # Check if the column is not of numeric type
    if "int" in str(prices_df.schema[col_name].dataType) or "double" in str(prices_df.schema[col_name].dataType):
        missing_count = prices_df.filter(F.col(col_name).isNull() | F.isnan(col_name)).count()
    else:
        missing_count = prices_df.filter(F.col(col_name).isNull()).count()
    print(f"Missing values for column {col_name}: {missing_count}")

Missing values for column _id: 0
Missing values for column district_id: 0
Missing values for column district_name: 0
Missing values for column neigh_name : 0
Missing values for column amount: 0
Missing values for column per_meter: 0
Missing values for column used_amount: 0
Missing values for column used_per_meter: 0
Missing values for column year: 0


In [123]:
# Rename columns to lowercase
prices_df = prices_df.toDF(*[c.lower() for c in prices_df.columns])

# renmae district_name to district
prices_df = prices_df.withColumnRenamed('district_name', 'district')

# rename neigh_name to neighborhood
prices_df = prices_df.withColumnRenamed('neigh_name ', 'neighborhood')

In [124]:
# write data to  Parquet format, partitioned by year
prices_df.write.partitionBy('year').mode('overwrite').parquet("../formatted_zone/prices.parquet")

# Income data

In [125]:
# Read all CSV files into a single DataFrame and add the file name as a column
income = spark.read.format("csv").option("header", "true").load("../datasets/income/*.csv") \
    .withColumn("filename", input_file_name())


income.printSchema()


root
 |-- Any: string (nullable = true)
 |-- Codi_Districte: string (nullable = true)
 |-- Nom_Districte: string (nullable = true)
 |-- Codi_Barri: string (nullable = true)
 |-- Nom_Barri: string (nullable = true)
 |-- Població: string (nullable = true)
 |-- Índex RFD Barcelona = 100: string (nullable = true)
 |-- filename: string (nullable = false)



In [126]:
income.show(5)

+----+--------------+-------------+----------+--------------------+--------+-------------------------+--------------------+
| Any|Codi_Districte|Nom_Districte|Codi_Barri|           Nom_Barri|Població|Índex RFD Barcelona = 100|            filename|
+----+--------------+-------------+----------+--------------------+--------+-------------------------+--------------------+
|2007|             1| Ciutat Vella|         1|            el Raval|   46595|                     64.7|file:///Users/jon...|
|2007|             1| Ciutat Vella|         2|      el Barri Gòtic|   27946|                     86.5|file:///Users/jon...|
|2007|             1| Ciutat Vella|         3|      la Barceloneta|   15921|                     66.7|file:///Users/jon...|
|2007|             1| Ciutat Vella|         4|Sant Pere, Santa ...|   22572|                     80.2|file:///Users/jon...|
|2007|             2|     Eixample|         5|       el Fort Pienc|   31521|                    107.9|file:///Users/jon...|
+----+--

In [127]:
# drop filename column
income = income.drop('filename')

# rename any column year
income = income.withColumnRenamed('Any', 'year')

# rename Nom_Districte to district and Nom_Barri to neighborhood
income = income.withColumnRenamed('Nom_Districte', 'district').withColumnRenamed('Nom_Barri', 'neighborhood')
income = income.withColumnRenamed('Codi_Districte', 'district_id')
#rename Index RFD to index_rfd
income = income.withColumnRenamed('Índex RFD Barcelona = 100', 'index_rfd')

# rename all columns to lowercase
income = income.toDF(*[c.lower() for c in income.columns])

In [128]:
# Check for missing values
for col_name in income.columns:
    # Check if the column is not of numeric type
    if "int" in str(income.schema[col_name].dataType) or "double" in str(income.schema[col_name].dataType):
        missing_count = income.filter(F.col(col_name).isNull() | F.isnan(col_name)).count()
    else:
        missing_count = income.filter(F.col(col_name).isNull()).count()
    print(f"Missing values for column {col_name}: {missing_count}")

Missing values for column year: 0
Missing values for column district_id: 0
Missing values for column district: 0
Missing values for column codi_barri: 0
Missing values for column neighborhood: 0
Missing values for column població: 0
Missing values for column index_rfd: 0


In [129]:
# write data to  Parquet format, partitioned by year
income.write.partitionBy('year').mode('overwrite').parquet("../formatted_zone/income.parquet")

# Lookup data

In [130]:
idealista_lookup_file = "../datasets/lookup_tables/idealista_extended.csv"
idealista_lookup_df = spark.read.csv(idealista_lookup_file, header=True)

In [131]:
idealista_lookup_df.printSchema()

root
 |-- district: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- district_n_reconciled: string (nullable = true)
 |-- district_n: string (nullable = true)
 |-- district_id: string (nullable = true)
 |-- neighborhood_n_reconciled: string (nullable = true)
 |-- neighborhood_n: string (nullable = true)
 |-- neighborhood_id: string (nullable = true)



In [132]:
idealista_lookup_df.show(5)

+--------------+--------------------+---------------------+--------------+-----------+-------------------------+--------------------+---------------+
|      district|        neighborhood|district_n_reconciled|    district_n|district_id|neighborhood_n_reconciled|      neighborhood_n|neighborhood_id|
+--------------+--------------------+---------------------+--------------+-----------+-------------------------+--------------------+---------------+
|Sants-Montjuïc|  La Marina del Port|       Sants-Montjuïc|sants montjuic|    Q753075|        La Marina de Port|  la marina del port|       Q3751076|
|Horta Guinardó|    El Baix Guinardó|       Horta-Guinardó|horta guinardo|   Q1771488|         El Baix Guinardó|    el baix guinardo|       Q3297875|
|      Eixample|La Dreta de l'Eix...|             Eixample|      eixample|     Q64124|      Dreta de l'Eixample|la dreta de l eix...|       Q1904302|
|   Sant Andreu|El Congrés i els ...|          Sant Andreu|   sant andreu|   Q1650230|     El Congré

In [133]:
# Check for missing values
for col_name in idealista_lookup_df.columns:
    # Check if the column is not of numeric type
    if "int" in str(idealista_lookup_df.schema[col_name].dataType) or "double" in str(idealista_lookup_df.schema[col_name].dataType):
        missing_count = idealista_lookup_df.filter(F.col(col_name).isNull() | F.isnan(col_name)).count()
    else:
        missing_count = idealista_lookup_df.filter(F.col(col_name).isNull()).count()
    print(f"Missing values for column {col_name}: {missing_count}")

Missing values for column district: 0
Missing values for column neighborhood: 0
Missing values for column district_n_reconciled: 0
Missing values for column district_n: 0
Missing values for column district_id: 0
Missing values for column neighborhood_n_reconciled: 0
Missing values for column neighborhood_n: 0
Missing values for column neighborhood_id: 0


In [134]:
# write data to  Parquet format
idealista_lookup_df.write.mode('overwrite').parquet("../formatted_zone/idealista_lookup.parquet")

In [135]:
income_lookup_file = "../datasets/lookup_tables/income_opendatabcn_extended.csv"
income_lookup_df = spark.read.csv(income_lookup_file, header=True)
income_lookup_df.printSchema()

root
 |-- district: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- district_n_reconciled: string (nullable = true)
 |-- district_n: string (nullable = true)
 |-- district_id: string (nullable = true)
 |-- neighborhood_n_reconciled: string (nullable = true)
 |-- neighborhood_n: string (nullable = true)
 |-- neighborhood_id: string (nullable = true)



In [136]:
# Check for missing values
for col_name in income_lookup_df.columns:
    # Check if the column is not of numeric type
    if "int" in str(income_lookup_df.schema[col_name].dataType) or "double" in str(income_lookup_df.schema[col_name].dataType):
        missing_count = income_lookup_df.filter(F.col(col_name).isNull() | F.isnan(col_name)).count()
    else:
        missing_count = income_lookup_df.filter(F.col(col_name).isNull()).count()
    print(f"Missing values for column {col_name}: {missing_count}")

Missing values for column district: 0
Missing values for column neighborhood: 0
Missing values for column district_n_reconciled: 0
Missing values for column district_n: 0
Missing values for column district_id: 0
Missing values for column neighborhood_n_reconciled: 0
Missing values for column neighborhood_n: 0
Missing values for column neighborhood_id: 0


In [137]:
income_lookup_df.show(5)

+-------------------+--------------------+---------------------+-------------------+-----------+-------------------------+--------------------+---------------+
|           district|        neighborhood|district_n_reconciled|         district_n|district_id|neighborhood_n_reconciled|      neighborhood_n|neighborhood_id|
+-------------------+--------------------+---------------------+-------------------+-----------+-------------------------+--------------------+---------------+
|       Ciutat Vella|      el Barri Gòtic|         Ciutat Vella|       ciutat vella|    Q941385|           Gothic Quarter|      el barri gotic|         Q17154|
|         Nou Barris|         Can Peguera|           Nou Barris|         nou barris|   Q1641049|              Can Peguera|         can peguera|       Q3320716|
|         Nou Barris|    la Trinitat Nova|           Nou Barris|         nou barris|   Q1641049|         La Trinitat Nova|    la trinitat nova|       Q3750932|
|Sarrià-Sant Gervasi|Sant Gervasi - la..

In [141]:
# write data to  Parquet format
income_lookup_df.write.mode('overwrite').parquet("../formatted_zone/income_lookup.parquet")

In [139]:
# save to csv
idealista.toPandas().to_csv('../test/idealista.csv', index=False)
prices_df.toPandas().to_csv('../test/prices.csv', index=False)
income.toPandas().to_csv('../test/income.csv', index=False)
idealista_lookup_df.toPandas().to_csv('../test/idealista_lookup.csv', index=False)
income_lookup_df.toPandas().to_csv('../test/income_lookup.csv', index=False)

24/06/06 12:51:23 WARN DAGScheduler: Broadcasting large task binary with size 3.4 MiB
24/06/06 12:51:26 WARN DAGScheduler: Broadcasting large task binary with size 1838.3 KiB
                                                                                