# Démographie 2019

__Auteur__: Kiril Isakov  
__Date de création__: 2023-07-25  
__Présentation__: Notebook pour le calcul de la population moyenne des villes françaises en 2019.

__Prérequis__: Un bucket gcp pour le stockage des données. (BUCKET_NAME)  

__Inputs__: fichier CSV

__Params__:
- `BUCKET_NAME`: nom du bucket GCP
- `INPUT_FILE_NAME`

In [1]:
BUCKET_NAME = 'data_m2i'
INPUT_FILE_NAME = 'data.csv'
input_file_uri = f'gs://{BUCKET_NAME}/input/{INPUT_FILE_NAME}'

In [2]:
import pyspark.sql.functions as sqlf

In [3]:
df = spark.read.csv(input_file_uri, header=False)

                                                                                

In [4]:
colnums_to_show = [1, 5, 8, 15, 16, 17]
pop_villes = df.select(*[f'_c{n}' for n in colnums_to_show])
pop_villes.show()

[Stage 1:>                                                          (0 + 1) / 1]

+---+--------------------+-----+----+----+----+
|_c1|                 _c5|  _c8|_c15|_c16|_c17|
+---+--------------------+-----+----+----+----+
| 01|                Ozan|01190| 469| 500|  93|
| 01|Cormoranche-sur-S...|01290| 903|1000| 107|
| 01|              Plagne|01130|  83| 100|  20|
| 01|             Tossiat|01250|1111|1400| 138|
| 01|            Pouillat|01250|  58| 100|  14|
| 01|             Torcieu|01230| 643| 700|  65|
| 01|           Replonges|01620|2841|3300| 210|
| 01|           Corcelles|01110| 222| 200|  17|
| 01|               Péron|01630|1578|1900|  82|
| 01|            Relevant|01990| 367| 400|  37|
| 01|          Chaveyriat|01660| 810| 900|  54|
| 01|       Vaux-en-Bugey|01150|1003|1100| 142|
| 01|             Maillat|01430| 664| 700|  59|
| 01|            Faramans|01800| 591| 600|  60|
| 01|                Béon|01350| 364| 400|  36|
| 01|       Saint-Bernard|01600|1281|1400| 436|
| 01|           Rossillon|01510| 147| 100|  18|
| 01|          Pont-d'Ain|01160|2309|250

                                                                                

In [5]:
pop_villes.agg(sqlf.avg('_c15').alias('population_moy_des_villes'))\
          .show()



+-------------------------+
|population_moy_des_villes|
+-------------------------+
|       1644.0646049046322|
+-------------------------+



                                                                                

In [6]:
pop_villes.withColumnRenamed('_c1', 'dép')\
          .groupby(sqlf.col('dép'))\
          .agg(sqlf.avg('_c15').alias('pop_moy_par_dép'))\
          .sort(sqlf.col('pop_moy_par_dép').desc())\
          .show()



+---+------------------+
|dép|   pop_moy_par_dép|
+---+------------------+
| 75|         2125851.0|
| 92|           39685.5|
| 93|           34573.2|
|974|29424.166666666668|
| 94| 26105.55319148936|
| 13|15423.588235294117|
|971|12710.970588235294|
|976|12508.529411764706|
|972|11215.441176470587|
|973| 7126.818181818182|
| 06| 6207.766871165644|
|975|            6080.0|
| 95| 5974.183783783784|
| 83| 5869.287581699346|
| 91|  5785.84693877551|
| 69| 5387.109215017065|
| 78| 5167.774809160305|
| 44|5133.4524886877825|
| 59|3845.2553846153846|
| 84|3309.0397350993376|
+---+------------------+
only showing top 20 rows



                                                                                

In [7]:
population_totale = pop_villes.agg(sqlf.sum('_c15'))\
                              .collect()[0][0]
pop_villes.withColumnRenamed('_c1', 'dép')\
          .groupby(sqlf.col('dép'))\
          .agg(sqlf.sum('_c15').alias('population_dép'))\
          .sort(sqlf.col('population_dép').desc())\
          .withColumn('part_population_totale',
                      sqlf.format_string('%.4f%%', sqlf.col('population_dép') / population_totale * 100)
                      )\
          .show()



+---+--------------+----------------------+
|dép|population_dép|part_population_totale|
+---+--------------+----------------------+
| 59|     2499416.0|               4.1424%|
| 75|     2125851.0|               3.5233%|
| 13|     1835407.0|               3.0419%|
| 69|     1578423.0|               2.6160%|
| 62|     1441422.0|               2.3889%|
| 92|     1428678.0|               2.3678%|
| 93|     1382928.0|               2.2920%|
| 78|     1353957.0|               2.2440%|
| 33|     1287532.0|               2.1339%|
| 76|     1239176.0|               2.0538%|
| 94|     1226961.0|               2.0335%|
| 77|     1193511.0|               1.9781%|
| 44|     1134493.0|               1.8803%|
| 91|     1134026.0|               1.8795%|
| 95|     1105224.0|               1.8317%|
| 38|     1093786.0|               1.8128%|
| 31|     1046532.0|               1.7345%|
| 67|     1026023.0|               1.7005%|
| 57|     1023199.0|               1.6958%|
| 06|     1011866.0|            

                                                                                

In [23]:
pop_médiane = pop_villes.withColumnRenamed('_c1', 'dép')\
            .groupby('dép')\
            .agg(
                sqlf.percentile_approx('_c15', 0.5, 10).alias('population_médiane')
            )\
            .select('dép', 'population_médiane')
pop_villes.join(pop_médiane,
                on=[sqlf.col('_c15') == pop_médiane['population_médiane'],
                    sqlf.col('_c1') == pop_médiane['dép']
                   ]
               )\
            .select('dép', 'population_médiane', '_c5', '_c8', '_c15')\
            .withColumnRenamed('_c5', 'nom_commune')\
            .withColumnRenamed('_c8', 'code_postal')\
            .withColumnRenamed('_c15', 'population_2019')\
            .show()

+---+------------------+--------------------+-----------+---------------+
|dép|population_médiane|         nom_commune|code_postal|population_2019|
+---+------------------+--------------------+-----------+---------------+
| 01|             509.0|Saint-Julien-sur-...|      01560|            509|
| 02|             194.0|          Couvrelles|      02220|            194|
| 02|             194.0|   Vesles-et-Caumont|      02350|            194|
| 03|             332.0|            Arronnes|      03250|            332|
| 04|             168.0|           Montfuron|      04110|            168|
| 05|             195.0|             Réallon|      05160|            195|
| 06|             456.0|          Gréolières|      06620|            456|
| 07|             279.0|        Saint-Basile|      07270|            279|
| 08|             140.0|              Nouart|      08240|            140|
| 08|             140.0|      Chatel-Chéhéry|      08250|            140|
| 08|             140.0|         Houdi