### calcul multiprocessing

#### énoncé
1. charger le fichier villes_france.zip dans pandas
2. transformer le code postal (zipcode) en département
3. créer un « worker » qui calcule les distances entre toutes les paires de villes d'un département
4. répartir les workers sur les processus d'une Pool (multiprocessing)
5. réduire les max locaux (un par département) en un max global

In [1]:
import os, sys
from re import sub
import pandas as pd
import numpy as np
from latloncalc.latlon import LatLon, Latitude, Longitude
from itertools import combinations
from multiprocessing import Pool, cpu_count, current_process

In [14]:
# Load the zipped CSV of French cities into a DataFrame.
villes_df = pd.read_csv(
    "villes_france.zip",
    encoding="utf8",
    # the file has no header row
    header=None,
    # so the column names are supplied explicitly
    names=["city", "zipcode", "lon", "lat"],
    # Keep zip codes as text: letting pandas infer a numeric dtype would drop
    # leading zeros and break the string-based department extraction.
    dtype={"zipcode": str},
)
villes_df

Unnamed: 0,city,zipcode,lon,lat
0,OZAN,1190,4.91667,46.38330
1,CORMORANCHE-SUR-SAONE,1290,4.83333,46.23330
2,PLAGNE,1130,5.73333,46.18330
3,TOSSIAT,1250,5.31667,46.13330
4,POUILLAT,1250,5.43333,46.33330
...,...,...,...,...
36695,SADA,97640,45.10470,-12.84860
36696,TSINGONI,97680,45.10700,-12.78970
36697,SAINT BARTHELEMY,97133,-62.83330,17.91670
36698,SAINT MARTIN,97150,18.09130,-63.08290


In [15]:
# Convert a zip code into its department code.
def get_dept(zc: str):
    """Return the two-character department prefix of a zip code.

    Args:
        zc: the zip code. A string is expected; an integer is also accepted
            and is converted to text first.

    Returns:
        The first two characters of the zip code after left-padding it with
        zeros to five characters, e.g. ``"1190"`` -> ``"01"`` and
        ``"23000"`` -> ``"23"``.

    NOTE(review): codes starting with 97 all map to ``"97"``; if distinct
    three-digit overseas departments are wanted, this needs extending.
    """
    # zfill restores a leading zero lost when the code was stored as a number,
    # so a single slice handles both the 4- and 5-character cases.
    return str(zc).zfill(5)[:2]

# Overwrite every zip code with its department code, then rename the column
# so that its name matches what it now holds.
villes_df["zipcode"] = villes_df["zipcode"].map(get_dept)
villes_df.rename(
    columns={"zipcode": "dept"},
    inplace=True,
)
villes_df

Unnamed: 0,city,dept,lon,lat
0,OZAN,01,4.91667,46.38330
1,CORMORANCHE-SUR-SAONE,01,4.83333,46.23330
2,PLAGNE,01,5.73333,46.18330
3,TOSSIAT,01,5.31667,46.13330
4,POUILLAT,01,5.43333,46.33330
...,...,...,...,...
36695,SADA,97,45.10470,-12.84860
36696,TSINGONI,97,45.10700,-12.78970
36697,SAINT BARTHELEMY,97,-62.83330,17.91670
36698,SAINT MARTIN,97,18.09130,-63.08290


In [17]:
# Remove rows that repeat an already-seen (city, dept) pair, keeping the
# first occurrence of each pair.
villes_df.drop_duplicates(
    inplace=True,
    keep="first",
    subset=["city", "dept"],
)

In [25]:
# Test case: the rows whose department code is "23", indexed by city name.
creuse_df = villes_df[villes_df["dept"].eq("23")].set_index("city")
# Compute the distance for every 2-combination of cities in a department
# and keep the largest one.
def max_geodesic(df: pd.DataFrame):
    """Find the pair of cities in ``df`` that are farthest apart.

    Args:
        df: frame indexed by city name, with ``lat`` and ``lon`` columns.

    Returns:
        A tuple ``(itinerary, max_d)``: ``itinerary`` is the string
        ``"<city1> <-> <city2>"`` for the farthest pair and ``max_d`` is the
        value ``LatLon.distance`` returned for it (presumably kilometres;
        confirm against the library). When no pair has a distance greater
        than zero, e.g. fewer than two rows, the result is ``("", 0)``.
    """
    # Build each city's point once up front: doing the row lookups and LatLon
    # construction inside the pair loop repeats them for every combination.
    points = [
        (city, LatLon(Latitude(lat), Longitude(lon)))
        for city, lat, lon in zip(df.index, df["lat"], df["lon"])
    ]

    max_d, itinerary = 0, ""
    for (v1, point1), (v2, point2) in combinations(points, r=2):
        d = point1.distance(point2)
        # Strict comparison: on ties the first pair encountered is kept.
        if d > max_d:
            max_d = d
            itinerary = f"{v1} <-> {v2}"
    return itinerary, max_d
        
# cas d'un département
# max_geodesic(creuse_df)
  

('SAINT-MERD-LA-BREUILLE <-> SAINT-SEBASTIEN', 100.33450422248873)

In [24]:
# cas unique
# point1 = LatLon(Latitude(creuse_df.loc["VIERSAT"]["lat"]), Longitude(creuse_df.loc["VIERSAT"]["lon"]))
# point2 = LatLon(Latitude(creuse_df.loc["LUSSAT"]["lat"]), Longitude(creuse_df.loc["LUSSAT"]["lon"]))
# d = point1.distance(point2)
# d

13.529874898086263

In [None]:
## WARNING: when using processes, threads or futures, the driving code must
## live inside an `if __name__ == "__main__":` block, otherwise it errors out.
if __name__ == "__main__":
    # One list item per department code. Every item needs its own comma:
    # adjacent string literals without one are silently joined into one string.
    depts = ["01", "22", "13", "33", "44", "15", "29", "78", "81", "50"]