In [4]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every cell of this lab (getOrCreate returns
# the already-running session when there is one).
spark = SparkSession.builder.appName("Lab3 - Optimisation Spark").getOrCreate()

# Bare last expression so the notebook renders the session summary.
spark


In [5]:
# Load the Parquet dataset under ./data/output/logs_hourly/, then inspect its
# schema and the first rows.
df_logs = spark.read.format("parquet").load("./data/output/logs_hourly/")
df_logs.printSchema()
df_logs.show(n=5)

root
 |-- hour: integer (nullable = true)
 |-- visits: long (nullable = true)

+----+------+
|hour|visits|
+----+------+
|   9|  1410|
+----+------+



In [6]:
# Execution-plan analysis: the extended form prints the parsed, analyzed and
# optimized logical plans in addition to the physical plan.
df_logs.explain(extended=True)

== Parsed Logical Plan ==
UnresolvedDataSource format: parquet, isStreaming: false, paths: 1 provided

== Analyzed Logical Plan ==
hour: int, visits: bigint
Relation [hour#0,visits#1L] parquet

== Optimized Logical Plan ==
Relation [hour#0,visits#1L] parquet

== Physical Plan ==
*(1) ColumnarToRow
+- FileScan parquet [hour#0,visits#1L] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/c:/Users/imene/Desktop/ecole/ecole/M2/traitement-distribuÃ©/data/..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<hour:int,visits:bigint>



In [7]:
import time

# Benchmark: for each partition count, repartition df_logs, cache it, and time
# the first count() (the action that runs the shuffle and fills the cache).
# Each (nb_partitions, seconds) pair is collected and shown as a DataFrame.
results = []

for n in [2, 4, 8]:
    print(f"\nðŸ”¹ Test avec {n} partitions")

    df_opt = df_logs.repartition(n)
    df_opt.cache()  # lazy: nothing is stored until an action runs

    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() is a wall clock with coarser resolution on some platforms.
    start = time.perf_counter()
    df_opt.count()  # action that triggers the cache
    duration = time.perf_counter() - start

    # Release this iteration's cached blocks: the cache is not read again in the
    # visible cells, and leaving every variant cached lets earlier runs compete
    # for executor memory with later ones.
    df_opt.unpersist()

    results.append((n, duration))
    print(f"DurÃ©e : {duration:.2f} sec")

results_df = spark.createDataFrame(results, ["nb_partitions", "temps_sec"])
results_df.show()



ðŸ”¹ Test avec 2 partitions
DurÃ©e : 1.30 sec

ðŸ”¹ Test avec 4 partitions
DurÃ©e : 0.55 sec

ðŸ”¹ Test avec 8 partitions
DurÃ©e : 0.59 sec
+-------------+------------------+
|nb_partitions|         temps_sec|
+-------------+------------------+
|            2| 1.295647382736206|
|            4|0.5467822551727295|
|            8|0.5938100814819336|
+-------------+------------------+



In [8]:
# df_opt is rebound on every iteration of the benchmark loop, so at this point it
# refers to the last variant only (df_logs.repartition(8)).
df_opt

DataFrame[hour: int, visits: bigint]

In [15]:
# Read the users CSV with an explicit schema rather than inferSchema=True:
# inference needs an extra pass over the file and makes the column types depend
# on the data. The types below are the ones Spark reported for this file in the
# join's analyzed plan (int ids/ages, double salary, string otherwise).
# NOTE(review): with an explicit schema, a value that does not parse becomes
# null instead of turning the whole column into a string column.
df_users = spark.read.csv(
    "./data/users_data.csv",
    header=True,
    schema=(
        "user_id INT, first_name STRING, last_name STRING, age INT, "
        "gender STRING, country STRING, salary DOUBLE"
    ),
)
df_users.show(5)

+-------+----------+---------+---+------+--------+------+
|user_id|first_name|last_name|age|gender| country|salary|
+-------+----------+---------+---+------+--------+------+
|      1|     Chloe|  Lefevre| 30|     F|  France|4100.8|
|      2|     Oscar|   Moreau| 41|     M|Belgique|5200.3|
|      3|      Maya|    Singh| 27|     F|    Inde|2300.5|
|      4|       Leo| Anderson| 33|     M|     USA|8900.0|
|      5|      Ines| Gonzalez| 36|     F| Espagne|6700.9|
+-------+----------+---------+---+------+--------+------+
only showing top 5 rows


In [12]:
from pyspark.sql import Row
from pyspark.sql.functions import broadcast

# Small made-up visits table to join against the users.
df_visits = spark.createDataFrame(
    [Row(user_id=uid, visits=count) for uid, count in [(1, 5), (2, 3), (3, 7)]]
)

# Join on user_id with a broadcast hint on df_users, then show both the result
# and the extended execution plan.
df_joined = df_visits.join(broadcast(df_users), on="user_id")
df_joined.show()
df_joined.explain(extended=True)


+-------+------+----------+---------+---+------+--------+------+
|user_id|visits|first_name|last_name|age|gender| country|salary|
+-------+------+----------+---------+---+------+--------+------+
|      1|     5|     Chloe|  Lefevre| 30|     F|  France|4100.8|
|      2|     3|     Oscar|   Moreau| 41|     M|Belgique|5200.3|
|      3|     7|      Maya|    Singh| 27|     F|    Inde|2300.5|
+-------+------+----------+---------+---+------+--------+------+

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [user_id])
:- LogicalRDD [user_id#271L, visits#272L], false
+- ResolvedHint (strategy=broadcast)
   +- Relation [user_id#234,first_name#235,last_name#236,age#237,gender#238,country#239,salary#240] csv

== Analyzed Logical Plan ==
user_id: bigint, visits: bigint, first_name: string, last_name: string, age: int, gender: string, country: string, salary: double
Project [user_id#271L, visits#272L, first_name#235, last_name#236, age#237, gender#238, country#239, salary#240]
+- Join Inner, (user_i

In [16]:
# Time a full count() over the broadcast join.
# perf_counter() is the clock intended for measuring elapsed intervals;
# time.time() is a wall clock and can be coarse or adjusted mid-measurement.
# Relies on `time` having been imported by the benchmark cell earlier on.
start = time.perf_counter()
df_joined.count()
print(f"Temps total (broadcast join): {time.perf_counter() - start:.2f} sec")

Temps total (broadcast join): 13.79 sec


In [17]:
# Directory where checkpoint data is written (path is relative to the working directory).
spark.sparkContext.setCheckpointDir("./data/checkpoints")

# Create the checkpoint. eager=True requests that it be materialized right away
# rather than on the first action; the count() that follows runs on the
# checkpointed DataFrame and its value is the cell's output.
df_joined_checkpointed = df_joined.checkpoint(eager=True)
df_joined_checkpointed.count()


3

In [18]:
# Re-display the (nb_partitions, temps_sec) table built by the partition benchmark cell.
results_df.show()

+-------------+------------------+
|nb_partitions|         temps_sec|
+-------------+------------------+
|            2| 1.295647382736206|
|            4|0.5467822551727295|
|            8|0.5938100814819336|
+-------------+------------------+

