# script_save_minio.py
import shutil
import requests
import zipfile
import os
import glob
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import fs
def import_by_decade(decennie = 1970):
    """Download one decade of the INSEE "deces" CSV archive and tidy it.

    The archive for ``decennie``..``decennie + 9`` is fetched from insee.fr,
    unpacked into the current working directory, and every ``*.csv`` file it
    contains is read and concatenated. The temporary archive and extraction
    directory are removed before returning, even if reading fails.

    Args:
        decennie: First year of the decade to download (e.g. ``1970``).

    Returns:
        A ``pandas.DataFrame`` with the columns of the source files plus:
        ``nom`` and ``prenom`` (split out of ``nomprenom`` on ``"*"``, with
        ``"/"`` removed from ``prenom``) and ``annee`` (an int parsed from the
        name of the CSV file each row came from).

    Raises:
        requests.HTTPError: If the download does not return a success status.
        ValueError: If the archive contains no CSV file, or if a file name
            cannot be reduced to an integer year.
    """
    url = f"https://www.insee.fr/fr/statistiques/fichier/4769950/deces-{decennie}-{decennie+9}-csv.zip"
    archive = f"deces_{decennie}.zip"
    extract_dir = f"deces_{decennie}"

    req = requests.get(url)
    # Fail here on an HTTP error instead of saving the error page as a ".zip"
    # and crashing later inside zipfile with an unrelated-looking message.
    req.raise_for_status()

    try:
        with open(archive, 'wb') as f:
            f.write(req.content)
        with zipfile.ZipFile(archive, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        csv_files = glob.glob(os.path.join(extract_dir, "*.csv"))
        if not csv_files:
            raise ValueError(f"no CSV file found in {url}")
        # Tag each row with the bare file name; the year is parsed from it
        # below. basename() is separator-agnostic, unlike splitting on "/".
        frames = [
            pd.read_csv(path, sep=";", encoding="utf-8").assign(
                annee=os.path.basename(path)
            )
            for path in csv_files
        ]
    finally:
        # Everything needed is in memory now; always clean up the temp files.
        shutil.rmtree(extract_dir, ignore_errors=True)
        if os.path.exists(archive):
            os.remove(archive)

    df = pd.concat(frames)
    # n=1 guarantees exactly two columns even if a name contains several "*".
    df[['nom', 'prenom']] = df['nomprenom'].str.split("*", n=1, expand=True)
    df['prenom'] = df['prenom'].str.replace("/", "", regex=False)
    # regex=True is required: since pandas 2.0 str.replace treats the pattern
    # as a literal string by default, which would leave the name untouched and
    # make astype(int) fail. The dot is escaped so only ".csv" is stripped.
    df['annee'] = (
        df['annee']
        .str.replace(r"(Deces_|\.csv|deces-)", "", regex=True)
        .astype(int)
    )
    return df
# Fetch every decade from the 1970s through the 2010s, then stack the
# per-decade tables into a single death-records table.
dfs = list(map(import_by_decade, range(1970, 2020, 10)))
deces = pd.concat(dfs)
# BIRTHS -----------------
# Download the INSEE first-names archive for `year`, unpack it next to the
# script and load the CSV it contains.
year = 2021
url_naissance = f"https://www.insee.fr/fr/statistiques/fichier/2540004/nat{year}_csv.zip"
req = requests.get(url_naissance)
# Stop on an HTTP error rather than writing the error body to disk as a
# ".zip" and failing later with an opaque BadZipFile.
req.raise_for_status()
with open(f"naissance_{year}.zip", 'wb') as f:
    f.write(req.content)
with zipfile.ZipFile(f"naissance_{year}.zip", 'r') as zip_ref:
    zip_ref.extractall(f"naissance_{year}")
naissance = pd.read_csv(f"naissance_{year}/nat{year}.csv", sep = ";")
# Rows without a first name cannot be matched on `preusuel` later; drop them.
naissance = naissance.dropna(subset = ['preusuel'])
# RESTRUCTURE --------------
# Build one long table of yearly counts for the first name "JEAN", with a
# `source` column distinguishing births ("naissance") from deaths ("deces").

# Births: sum the `nombre` column per year of birth.
jean_naiss = naissance.loc[naissance['preusuel'] == "JEAN", ['annais', 'nombre']]
jean_naiss = jean_naiss.rename(columns={"annais": "annee"})
jean_naiss = jean_naiss.groupby('annee').sum().reset_index()

# Deaths: count the matching rows per year.
jean_deces = deces.loc[deces["prenom"] == "JEAN"]
jean_deces = jean_deces.groupby('annee').size().reset_index(name="nombre")

df = pd.concat(
    [
        jean_deces.assign(source="deces"),
        jean_naiss.assign(source="naissance"),
    ]
)
# Rows whose year is the literal "XXXX" cannot be cast to int, so drop them
# first. The explicit copy makes the column assignment below write to an
# independent frame instead of a filtered slice (no SettingWithCopyWarning).
df = df.loc[df['annee'] != "XXXX"].copy()
df['annee'] = df['annee'].astype(int)
df = df.loc[df['annee'] > 1971]
# SAVE IN MINIO --------------
# Convert the final table to an Arrow table (without the pandas index) and
# write it as Parquet through pyarrow's S3 filesystem.
# NOTE(review): the endpoint uses plain "http://" - confirm this is intended.
bucket = "lgaliana"
endpoint = "http://" + "minio.lab.sspcloud.fr"
s3 = fs.S3FileSystem(endpoint_override=endpoint)
table = pa.Table.from_pandas(df, preserve_index=False)
destination = f'{bucket}/diffusion/prenoms.parquet'
pq.write_table(table, destination, filesystem=s3)