## Exercice 1: Afficher les cinq premières lignes du DataFrame.



In [7]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Read the CSV with a header row and let Spark infer the column types.
data = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('demographics_de.csv')
)

# Register the data as the temp view `demographics` and read it back as `df`,
# the DataFrame used by every exercise below.
data.createOrReplaceTempView('demographics')
df = spark.table('demographics')

# Exercise 1: display the first five rows.
df.show(n=5)

+------------------+------+---------+----------+
|             state|gender|age_group|population|
+------------------+------+---------+----------+
|Baden-Wuerttemberg|female|    00-04|    261674|
|Baden-Wuerttemberg|female|    05-14|    490822|
|Baden-Wuerttemberg|female|    15-34|   1293488|
|Baden-Wuerttemberg|female|    35-59|   1919649|
|Baden-Wuerttemberg|female|    60-79|   1182736|
+------------------+------+---------+----------+
only showing top 5 rows



## Exercice 2: Compter le nombre total de lignes dans le DataFrame.



In [24]:
# Exercise 2: total number of rows in the DataFrame.
row_count = df.count()
row_count

192

## Exercice 3: Trouver le nombre distinct d'états (states) présents dans le DataFrame.



In [14]:
# NOTE(review): this wildcard import replaces Python builtins such as `sum`,
# `max` and `round` with their Spark column counterparts. Cells further down
# call those names on columns, so the import is kept as-is here.
from pyspark.sql.functions import *

# Exercise 3: number of distinct values in the `state` column.
distinct_states = df.select('state').distinct()
distinct_states.count()

16

## Exercice 4: Calculer la somme de la population totale.



In [20]:
# Exercise 4: sum of the `population` column over all rows.
# `F.sum` is the Spark aggregate; referencing it through the `F` namespace
# avoids depending on a wildcard import having shadowed Python's builtin `sum`.
df.agg(F.sum('population').alias('population_totale')).show()

+-----------------+
|population_totale|
+-----------------+
|         83019213|
+-----------------+



## Exercice 5: Afficher la population maximale par groupe d'âge (age_group).



In [31]:
# Exercise 5: largest `population` value found within each age group.
# `F.max` is the Spark aggregate; the namespaced call does not rely on the
# builtin `max` having been shadowed by a wildcard import.
(
    df.groupBy('age_group')
      .agg(F.max('population').alias('popul_max_by_age_group'))
      .show()
)

+---------+----------------------+
|age_group|popul_max_by_age_group|
+---------+----------------------+
|    80-99|                729148|
|    05-14|                832251|
|    35-59|               3147565|
|    60-79|               2009976|
|    00-04|                440708|
|    15-34|               2188068|
+---------+----------------------+



## Exercice 6: Filtrer les lignes du DataFrame pour n'inclure que les états (states) dont la population dépasse 1 million.



In [32]:
# Exercise 6: keep only the states whose total population exceeds the threshold.
# A state's population is the sum of its rows, so aggregate first, then filter.
POPULATION_THRESHOLD = 1_000_000  # exclusive lower bound on a state's total

(
    df.groupBy('state')
      .agg(F.sum('population').alias('state_population'))
      .filter(F.col('state_population') > POPULATION_THRESHOLD)
      .orderBy(F.col('state_population').desc())
      # truncate=False prints long state names in full instead of cutting
      # them off with "..." as the default display does.
      .show(truncate=False)
)

+--------------------+----------------+
|               state|state_population|
+--------------------+----------------+
| Nordrhein-Westfalen|        17932651|
|              Bayern|        13076721|
|  Baden-Wuerttemberg|        11069533|
|       Niedersachsen|         7982448|
|              Hessen|         6265809|
|     Rheinland-Pfalz|         4084844|
|             Sachsen|         4077937|
|              Berlin|         3644826|
|  Schleswig-Holstein|         2896712|
|         Brandenburg|         2511917|
|      Sachsen-Anhalt|         2208321|
|          Thueringen|         2143145|
|             Hamburg|         1841179|
|Mecklenburg-Vorpo...|         1609675|
+--------------------+----------------+



## Exercice 7: Calculer la moyenne de la population pour chaque groupe d'âge (age_group).



In [35]:
# Exercise 7: mean of `population` within each age group.
mean_population = avg('population').alias('average_population')
(
    df.groupBy('age_group')
      .agg(mean_population)
      .show()
)

+---------+------------------+
|age_group|average_population|
+---------+------------------+
|    80-99|       168409.5625|
|    05-14|       230138.0625|
|    35-59|      910557.46875|
|    60-79|        562135.625|
|    00-04|      122699.90625|
|    15-34|      600409.78125|
+---------+------------------+



## Exercice 8: Trier le DataFrame par ordre décroissant de population.



In [48]:
  # Exercise 8: rows ordered from the largest to the smallest population.
  # NOTE(review): this cell keeps its original leading indentation; presumably
  # the notebook front end strips it, but a plain Python script would reject
  # it - confirm before exporting the notebook to a .py file.
  df.orderBy(df['population'].desc()).show()

+-------------------+------+---------+----------+
|              state|gender|age_group|population|
+-------------------+------+---------+----------+
|Nordrhein-Westfalen|female|    35-59|   3147565|
|Nordrhein-Westfalen|  male|    35-59|   3127984|
|             Bayern|  male|    35-59|   2348310|
|             Bayern|female|    35-59|   2310659|
|Nordrhein-Westfalen|  male|    15-34|   2188068|
|Nordrhein-Westfalen|female|    15-34|   2047533|
|Nordrhein-Westfalen|female|    60-79|   2009976|
| Baden-Wuerttemberg|  male|    35-59|   1955828|
| Baden-Wuerttemberg|female|    35-59|   1919649|
|Nordrhein-Westfalen|  male|    60-79|   1775275|
|             Bayern|  male|    15-34|   1639500|
|             Bayern|female|    15-34|   1507659|
| Baden-Wuerttemberg|  male|    15-34|   1423367|
|             Bayern|female|    60-79|   1421273|
|      Niedersachsen|female|    35-59|   1392647|
|      Niedersachsen|  male|    35-59|   1391992|
| Baden-Wuerttemberg|female|    15-34|   1293488|


## Exercice 9: Sélectionner les lignes du DataFrame où l'âge (age_group) est "18-24" et le sexe (gender) est "F".



In [65]:
# Exercise 9: rows for women whose age group lies inside the 18-24 range.
# Age range requested by the exercise statement ("18-24").
AGE_LOWER, AGE_UPPER = 18, 24

# `age_group` labels look like "15-34": split on the dash and cast each side to
# an integer. Splitting avoids positional substrings, which depend on a
# 1-based start index and on both bounds being exactly two characters wide.
# The helper columns are kept on `df`, so later cells still see them.
df = (
    df.withColumn('age_min', F.split(F.col('age_group'), '-').getItem(0).cast('int'))
      .withColumn('age_max', F.split(F.col('age_group'), '-').getItem(1).cast('int'))
)

# Gender is stored as "female"/"male" in the rows displayed above, while the
# exercise asks for "F"; lowercasing makes the prefix test case-insensitive.
# None of the age groups displayed earlier (00-04, 05-14, 15-34, 35-59, 60-79,
# 80-99) fits entirely inside 18-24, so an empty result is expected here.
(
    df.filter(
        F.lower(df.gender).startswith('f')
        & (df.age_min >= AGE_LOWER)
        & (df.age_max <= AGE_UPPER)
    )
    .show()
)

+-----+------+---------+----------+-------+-------+
|state|gender|age_group|population|age_min|age_max|
+-----+------+---------+----------+-------+-------+
+-----+------+---------+----------+-------+-------+



## Exercice 10: Ajouter une nouvelle colonne "population_percentage" qui représente la proportion de la population par rapport à la population totale.



In [76]:
# Exercise 10: share of each row's population in the overall total, in percent.
# The grand total is a one-row, one-column aggregate; fetch it as a Python
# value so it can be used as a constant in the column expression below.
# `F.sum` / `F.round` are the Spark functions; the namespaced calls do not
# rely on the builtins `sum` / `round` having been shadowed.
population_totale = (
    df.agg(F.sum('population').alias('population_totale'))
      .first()[0]
)

# Percentage of the total, rounded to two decimal places.
df = df.withColumn(
    'population_percentage',
    F.round(F.col('population') / population_totale * 100, 2),
)
df.show()

+------------------+------+---------+----------+-------+-------+---------------------+
|             state|gender|age_group|population|age_min|age_max|population_percentage|
+------------------+------+---------+----------+-------+-------+---------------------+
|Baden-Wuerttemberg|female|    00-04|    261674|      0|      4|                 0.32|
|Baden-Wuerttemberg|female|    05-14|    490822|      5|     14|                 0.59|
|Baden-Wuerttemberg|female|    15-34|   1293488|     15|     34|                 1.56|
|Baden-Wuerttemberg|female|    35-59|   1919649|     35|     59|                 2.31|
|Baden-Wuerttemberg|female|    60-79|   1182736|     60|     79|                 1.42|
|Baden-Wuerttemberg|female|    80-99|    419471|     80|     99|                 0.51|
|Baden-Wuerttemberg|  male|    00-04|    274882|      0|      4|                 0.33|
|Baden-Wuerttemberg|  male|    05-14|    517387|      5|     14|                 0.62|
|Baden-Wuerttemberg|  male|    15-34|   142