## Lab 1 — User data ETL: load, clean, aggregate by country, export to Parquet

In [1]:
from pyspark.sql import SparkSession

# Configure the application name first, then obtain the session
# (getOrCreate reuses an already-running session if there is one).
session_builder = SparkSession.builder.appName("ETL-local")
spark = session_builder.getOrCreate()
print('Spark session created:', spark)

Spark session created: <pyspark.sql.session.SparkSession object at 0x00000199D2E83990>


In [2]:
# Load the users CSV: the first row holds the column names, and column
# types are inferred from the data rather than left as strings.
users_reader = spark.read.options(header=True, inferSchema=True)
df = users_reader.csv("./data/users_data.csv")
df.show(5)

+-------+----------+---------+---+------+--------+------+
|user_id|first_name|last_name|age|gender| country|salary|
+-------+----------+---------+---+------+--------+------+
|      1|     Chloe|  Lefevre| 30|     F|  France|4100.8|
|      2|     Oscar|   Moreau| 41|     M|Belgique|5200.3|
|      3|      Maya|    Singh| 27|     F|    Inde|2300.5|
|      4|       Leo| Anderson| 33|     M|     USA|8900.0|
|      5|      Ines| Gonzalez| 36|     F| Espagne|6700.9|
+-------+----------+---------+---+------+--------+------+
only showing top 5 rows


In [3]:
# Keep only rows that have an age AND whose country is not the "Unknown" placeholder.
has_age = df["age"].isNotNull()
known_country = df["country"] != "Unknown"
df_clean = df.filter(has_age & known_country)

In [4]:
# Per-country mean age and mean salary, computed on the cleaned rows.
rows_by_country = df_clean.groupBy("country")
df_country = rows_by_country.agg({"age": "avg", "salary": "avg"})
df_country.show()

+---------+-----------------+--------+
|  country|      avg(salary)|avg(age)|
+---------+-----------------+--------+
|     Inde|           2700.6|    29.0|
|  Sénégal|           2900.5|    37.0|
|   Russie|           5600.0|    29.0|
|   France|           4750.5|    34.5|
|  Espagne|6900.549999999999|    35.0|
|Argentine|           8800.9|    43.0|
|   Egypte|           3500.6|    40.0|
|   Italie|           4700.3|    28.0|
|      USA|           9250.4|    39.0|
|  Mexique|           2700.5|    23.0|
|    Maroc|           3300.1|    26.0|
|    Chine|           5800.4|    32.0|
|   Canada|           6850.5|    26.5|
| Belgique|           5200.3|    41.0|
|  Vietnam|           1250.4|    25.0|
+---------+-----------------+--------+



In [5]:
from pyspark.sql.functions import when, col
# Bucket each user by age: under 30 -> "jeune", 30 to 44 -> "adulte", 45+ -> "senior".
#
# The last branch is an explicit `>= 45` test instead of `.otherwise("senior")`:
# a null age makes every comparison evaluate to null, so with a catch-all
# `.otherwise` those rows would be silently labelled "senior". With no
# `.otherwise`, unmatched (null-age) rows get a null age_category instead.
#
# NOTE(review): this categorizes the raw `df`, not `df_clean`, so rows that the
# cleaning step excluded (e.g. country "Unknown") are still present here —
# confirm that is intended.
df = df.withColumn(
    "age_category",
    when(col("age") < 30, "jeune")
    .when(col("age") < 45, "adulte")
    .when(col("age") >= 45, "senior"),
)

In [6]:
# Mean salary for every (country, age bucket) combination.
salary_by_segment = df.groupBy("country", "age_category").agg({"salary": "avg"})
salary_by_segment.show()

+---------+------------+-----------------+
|  country|age_category|      avg(salary)|
+---------+------------+-----------------+
|Argentine|      adulte|           8800.9|
|   Egypte|      adulte|           3500.6|
|  Mexique|       jeune|           2700.5|
|   Canada|       jeune|           6850.5|
|   France|      adulte|           4750.5|
|      USA|      senior|           9600.8|
|  Espagne|      adulte|6900.549999999999|
|     Inde|      adulte|           3100.7|
| Belgique|      adulte|           5200.3|
|  Vietnam|       jeune|           1250.4|
|  Sénégal|      adulte|           2900.5|
|    Maroc|       jeune|           3300.1|
|   Italie|       jeune|           4700.3|
|   Russie|       jeune|           5600.0|
|     Inde|       jeune|           2300.5|
|      USA|      adulte|           8900.0|
|    Chine|      adulte|           5800.4|
+---------+------------+-----------------+



In [7]:
# Persist the per-country averages as Parquet, replacing the output of any
# previous run. The path is relative to the notebook's working directory (the
# same ./data tree the inputs are read from) rather than a user-specific
# absolute path, so the notebook can run on another machine unchanged.
df_country.write.mode("overwrite").parquet("./data/output/country_stats.parquet")

## Lab 2 — Web logs: extract URL path, count visits per hour, export to Parquet

In [11]:
# Load the web logs; the first row is the header. No schema inference is
# requested, so every column is read as a string.
df_logs = spark.read.csv("./data/logs_web.csv", header=True)

In [12]:
# Bare expression: the notebook renders the DataFrame's repr, which lists each
# column name with its type (a quick schema check; no rows are fetched or shown).
df_logs

DataFrame[timestamp: string, user_id: string, url: string, action: string, device: string, country: string]

In [13]:
from pyspark.sql.functions import split, col
# Split the URL on "/" and keep the element at index 1 as `url_path`.
# NOTE(review): presumably URLs start with "/" so index 1 is the first path
# segment — verify against the data.
url_segments = split(col("url"), "/")
df_logs = df_logs.withColumn("url_path", url_segments[1])

In [14]:
from pyspark.sql.functions import hour, count
# Count log rows per hour extracted from the timestamp column.
visit_hour = hour(col("timestamp")).alias("hour")
visit_count = count("*").alias("visits")
df_stats = df_logs.groupBy(visit_hour).agg(visit_count)

In [15]:
# Save the hourly visit counts as Parquet, overwriting any earlier output.
hourly_writer = df_stats.write.mode("overwrite")
hourly_writer.parquet("./data/output/logs_hourly/")