In [None]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (created on first use).
spark = (
    SparkSession.builder
    .appName('DataEngineering')
    .getOrCreate()
)

# Read the CSV, treating the first row as the header and letting Spark
# infer each column's type from the data.
# (For Parquet input use: spark.read.parquet('path/to/parquet'))
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/sample_data/california_housing_test.csv')
)

# Tip: df.printSchema() prints the inferred schema directly.
df.show(10)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -122.05|   37.37|              27.0|     3885.0|         661.0|    1537.0|     606.0|       6.6085|          344700.0|
|   -118.3|   34.26|              43.0|     1510.0|         310.0|     809.0|     277.0|        3.599|          176500.0|
|  -117.81|   33.78|              27.0|     3589.0|         507.0|    1484.0|     495.0|       5.7934|          270500.0|
|  -118.36|   33.82|              28.0|       67.0|          15.0|      49.0|      11.0|       6.1359|          330000.0|
|  -119.67|   36.33|              19.0|     1241.0|         244.0|     850.0|     237.0|       2.9375|           81700.0|
|  -119.56|   36.51|    

In [None]:
from pyspark.sql.functions import col

# Build a cleaned projection of `df`:
#   1. keep housing_median_age, median_house_value and total_rooms (cast to int)
#   2. keep only rows whose median_house_value is above 100000
#   3. drop rows that contain nulls
#   4. keep a single row per housing_median_age
#   5. derive duplex_room = total_rooms * 2
#   6. sort by housing_median_age, descending
#
# The sort is deliberately the LAST step: de-duplication is not guaranteed to
# preserve an earlier sort, so ordering first does not reliably order the
# result. Because every row in a duplicate group shares the same
# housing_median_age, sorting on that column beforehand never influenced which
# row was kept either.
# NOTE: which row survives for a given housing_median_age is not controlled
# here; add an explicit tie-break if a specific row is required.
df_clean = (
    df.select(
        'housing_median_age',
        'median_house_value',
        col('total_rooms').cast('int'),
    )
    .filter(col('median_house_value') > 100000)
    .dropna()
    .dropDuplicates(['housing_median_age'])
    .withColumn('duplex_room', col('total_rooms') * 2)
    .orderBy(col('housing_median_age').desc())
)
df_clean.show(10)

+------------------+------------------+-----------+-----------+
|housing_median_age|median_house_value|total_rooms|duplex_room|
+------------------+------------------+-----------+-----------+
|              52.0|          250000.0|       3587|       7174|
|              51.0|          162500.0|       1005|       2010|
|              50.0|          295200.0|       2935|       5870|
|              49.0|          116800.0|       2120|       4240|
|              48.0|          214800.0|       1308|       2616|
|              47.0|          241500.0|       2994|       5988|
|              46.0|          137500.0|        860|       1720|
|              45.0|          125000.0|        972|       1944|
|              44.0|          292200.0|       1449|       2898|
|              43.0|          176500.0|       1510|       3020|
+------------------+------------------+-----------+-----------+
only showing top 10 rows



In [None]:
# Import Spark's aggregate under an alias so Python's builtin sum() is not
# shadowed for the rest of the notebook.
from pyspark.sql.functions import col, sum as spark_sum

# Load the training CSV with the same options as the test CSV
# (header row present, column types inferred).
df_train = spark.read.csv(
    '/content/sample_data/california_housing_train.csv',
    header=True,
    inferSchema=True,
)

# Inner-join the two frames on identical (latitude, longitude) pairs.
# Both sides carry the same column names, so columns are always referenced
# through their parent DataFrame (df.<col> / df_train.<col>) to stay unambiguous.
df_joined = df.join(
    df_train,
    (df.latitude == df_train.latitude) & (df.longitude == df_train.longitude),
    'inner',
)

# Total the train-side median_house_value for each train-side housing_median_age.
agg_df = df_joined.groupBy(df_train.housing_median_age).agg(
    spark_sum(df_train.median_house_value).alias('sum_median_house_value')
)
agg_df.show(10)

+------------------+----------------------+
|housing_median_age|sum_median_house_value|
+------------------+----------------------+
|               8.0|             1687000.0|
|               7.0|              568000.0|
|              49.0|             8132700.0|
|              29.0|           1.5079601E7|
|              47.0|             1.10846E7|
|              42.0|           2.1801105E7|
|              44.0|           2.2057104E7|
|              35.0|           3.6042602E7|
|              18.0|           1.0158903E7|
|              39.0|           1.7574302E7|
+------------------+----------------------+
only showing top 10 rows



In [None]:
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

# Window over the joined rows: one partition per train-side housing_median_age,
# ordered by train-side median_house_value from highest to lowest.
windowSpec = (
    Window
    .partitionBy(df_train.housing_median_age)
    .orderBy(df_train.median_house_value.desc())
)

# 'rank' holds the row's 1-based position inside its partition (row_number).
df_ranked = df_joined.withColumn('rank', row_number().over(windowSpec))

# Projection list: every df_train column, followed by the new 'rank' column.
df_final = [df_train[name] for name in df_train.columns]
df_final.append(col('rank'))
df_ranked.select(*df_final).show(10)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+----+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|rank|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+----+
|  -122.43|   37.78|               2.0|     1205.0|         468.0|     577.0|     363.0|       3.6437|          275000.0|   1|
|  -122.43|   37.78|               2.0|     1205.0|         468.0|     577.0|     363.0|       3.6437|          275000.0|   2|
|  -117.26|   33.19|               2.0|     2629.0|         509.0|    1044.0|     522.0|       4.2361|          158500.0|   3|
|  -117.59|   33.65|               2.0|     4860.0|        1193.0|    2332.0|    1073.0|       4.5022|          151900.0|   4|
|  -117.82|   33.68|               3.0|     7105.0|        1459.0|    3068.0|    1241.0|       6.1395|         