# Stockage compatible S3 avec MinIO

In [91]:
import os
import io
from IPython.display import Image
from IPython.core.display import HTML

import s3fs
import boto3
import pandas as pd

## Architecture

In [2]:
# Display the illustration hosted on min.io for this section (referenced by URL).
architecture_svg_url = "https://min.io/resources/img/products/encryption-worm.svg"
Image(url=architecture_svg_url)

## Avantages

- **Portabilité**
- **Autonomie**
- **Découplage stockage / moteur de calcul**

## Démonstration

In [3]:
# Shell escape: list the top-level entries reachable through the `s3` alias of the `mc` client.
!mc ls s3

[m[32m[2022-02-10 16:50:28 UTC][0m[33m     0B[0m[36;1m avouacr/[0m
[0m[m[32m[2022-02-10 16:50:39 UTC][0m[33m     0B[0m[36;1m donnees-insee/[0m
[0m[m[32m[2022-02-10 16:51:24 UTC][0m[33m     0B[0m[36;1m projet-arc-resil-poc/[0m
[0m[m[32m[2022-02-10 16:51:25 UTC][0m[33m     0B[0m[36;1m projet-esane-poc/[0m
[0m[m[32m[2022-02-24 10:30:47 UTC][0m[33m     0B[0m[36;1m projet-formation/[0m
[0m[m[32m[2022-02-10 16:51:26 UTC][0m[33m     0B[0m[36;1m projet-onyxia/[0m
[0m[m[32m[2022-04-25 09:48:57 UTC][0m[33m     0B[0m[36;1m projet-poc-aida/[0m
[0m[m[32m[2022-04-26 16:02:54 UTC][0m[33m     0B[0m[36;1m projet-pynsee/[0m
[0m[m[32m[2022-02-10 16:51:27 UTC][0m[33m     0B[0m[36;1m projet-relevanc/[0m
[0m

In [4]:
# Shell escape: list what is stored under demo/rp in the projet-onyxia bucket.
!mc ls s3/projet-onyxia/demo/rp

[m[32m[2022-04-27 08:25:08 UTC][0m[33m  14GiB[0m[1m individus.csv[0m
[0m[m[32m[2022-04-27 15:52:45 UTC][0m[33m  28MiB[0m[1m sample.csv[0m
[0m[m[32m[2022-04-27 16:11:26 UTC][0m[33m     0B[0m[36;1m individus-region-residence.parquet/[0m
[0m[m[32m[2022-04-27 16:11:26 UTC][0m[33m     0B[0m[36;1m individus_partition_region/[0m
[0m[m[32m[2022-04-27 16:11:26 UTC][0m[33m     0B[0m[36;1m individus_snappy_parquet/[0m
[0m

### Déclaration de l'endpoint

Les paramètres d'authentification auprès de MinIO sont injectés comme variables d'environnement dans les différents services du Datalab.

In [9]:
# Build the HTTPS endpoint from the host name held in the AWS_S3_ENDPOINT
# environment variable, then echo it for the reader.
s3_host = os.environ['AWS_S3_ENDPOINT']
endpoint_url = f"https://{s3_host}"
print(endpoint_url)

https://minio.lab.sspcloud.fr


In [None]:
def mask_url_credentials(url):
    """Return ``url`` with its credentials part replaced by ``***``.

    Everything between the ``://`` separator (or the start of the string when
    there is no scheme) and the last ``@`` is treated as credentials and hidden.
    A string without ``@`` is returned unchanged.
    """
    scheme, separator, remainder = url.partition('://')
    if not separator:
        # No scheme present: the whole string is the part to inspect.
        scheme, remainder = '', url
    _, at_sign, host = remainder.rpartition('@')
    if not at_sign:
        return url
    return f"{scheme}{separator}***@{host}"


# MC_HOST_s3 embeds the access key, secret and session token in the URL itself.
# Printing it raw would store those secrets in the notebook output, so only a
# masked form is displayed.
mc_host = os.environ.get('MC_HOST_s3')
print(mask_url_credentials(mc_host) if mc_host else 'MC_HOST_s3 is not set')

### Importer les données en Python

#### Via un service

In [10]:
# File-system style handle on the object store, pointed at the MinIO endpoint.
fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=endpoint_url))

# Open the sample object in text mode and let pandas parse the ';'-separated content.
sample_key = 'projet-onyxia/demo/rp/sample.csv'
with fs.open(sample_key, mode='r') as sample_file:
    data = pd.read_csv(sample_file, delimiter=";")

# Preview the first five rows.
data.head()

Unnamed: 0,region_residence,departement_residence,commune_residence,region_travail,departement_travail,commune_travail,commune_anterieure,commune_etude,pays_naissance,poids,...,variable40,variable41,variable42,variable43,variable44,variable45,variable46,variable47,variable48,variable49
0,53,22,22170,53,22,22170,22170,22170,99,1.197605,...,1G,Z,1,0,930660902,1,93066,999,999,6
1,75,17,17082,27,71,71388,17082,17082,41,1.237221,...,1G,Z,1,0,930660902,1,93066,999,999,6
2,84,38,38002,84,38,38002,38002,38002,12,0.594802,...,1G,Z,1,0,930660902,1,93066,999,999,6
3,75,17,17377,75,17,17377,17377,17377,51,1.179806,...,1G,Z,1,0,930660902,1,93066,999,999,6
4,24,36,36189,84,43,43125,36189,36189,99,0.889843,...,1G,Z,1,0,930660902,1,93066,999,999,6


#### Via une URL publique

In [11]:
# Same sample file, this time fetched over plain HTTPS by pandas itself:
# no S3 client object is involved in this cell.
URL = "https://minio.lab.sspcloud.fr/projet-onyxia/demo/rp/sample.csv"
data2 = pd.read_csv(URL, delimiter=";")

# Preview the first five rows.
data2.head()

Unnamed: 0,region_residence,departement_residence,commune_residence,region_travail,departement_travail,commune_travail,commune_anterieure,commune_etude,pays_naissance,poids,...,variable40,variable41,variable42,variable43,variable44,variable45,variable46,variable47,variable48,variable49
0,53,22,22170,53,22,22170,22170,22170,99,1.197605,...,1G,Z,1,0,930660902,1,93066,999,999,6
1,75,17,17082,27,71,71388,17082,17082,41,1.237221,...,1G,Z,1,0,930660902,1,93066,999,999,6
2,84,38,38002,84,38,38002,38002,38002,12,0.594802,...,1G,Z,1,0,930660902,1,93066,999,999,6
3,75,17,17377,75,17,17377,17377,17377,51,1.179806,...,1G,Z,1,0,930660902,1,93066,999,999,6
4,24,36,36189,84,43,43125,36189,36189,99,0.889843,...,1G,Z,1,0,930660902,1,93066,999,999,6


### Quand les données deviennent massives...

#### Le crash

In [None]:
# Kept commented out: this cell belongs to the "crash" section and is not meant
# to run as part of a normal top-to-bottom execution.
# NOTE(review): the path below is sample.csv (28MiB in the `mc ls` listing), which
# should load without trouble; presumably individus.csv (14GiB) was intended — confirm.
# with fs.open('projet-onyxia/demo/rp/sample.csv','r') as file:
#     data = pd.read_csv(file, sep=";")

#### L'API S3 Select

In [128]:
# S3 client bound to the endpoint declared earlier in the notebook.
s3 = boto3.client('s3', endpoint_url=endpoint_url)

# SQL evaluated by the object store: only five columns of the rows matching the
# commune filter are sent back.
QUERY = """
SELECT s.sexe, s.age, s.diplome, s.commune_residence, s.poids
FROM s3object s 
WHERE s.commune_residence='44109'
"""

# Alternative kept for reference: same columns, no filter, capped at 100 rows.
# QUERY = "SELECT s.sexe, s.age, s.diplome, s.commune_residence, s.poids FROM s3object s LIMIT 100"

# The source object is a ';'-separated CSV whose header line supplies the column
# names used in the query; results are requested as CSV with default settings.
csv_input = {'CSV': {'FileHeaderInfo': 'USE', 'FieldDelimiter': ';'}}
csv_output = {'CSV': {}}

resp = s3.select_object_content(
    Bucket='projet-onyxia',
    Key='demo/rp/individus.csv',
    Expression=QUERY,
    ExpressionType='SQL',
    InputSerialization=csv_input,
    OutputSerialization=csv_output,
)

In [129]:
# Column names in the order of the SELECT clause; the returned CSV is parsed
# with header=None, so the names are supplied here.
SELECT_COLUMNS = ["sexe", "age", "diplome", "commune_residence", "poids"]

# The response payload is a stream of events; only 'Records' events carry
# result bytes.
records = [
    event['Records']['Payload']
    for event in resp['Payload']
    if 'Records' in event
]

# Concatenate the raw bytes BEFORE decoding: a multi-byte UTF-8 character may be
# split across two chunks, and decoding each chunk separately would then fail.
file_str = b''.join(records).decode('utf-8')

# An empty result would make read_csv raise, so fall back to an empty frame
# carrying the expected columns.
if file_str:
    data3 = pd.read_csv(io.StringIO(file_str), header=None, names=SELECT_COLUMNS)
else:
    data3 = pd.DataFrame(columns=SELECT_COLUMNS)

In [131]:
# Preview the first five rows of the S3 Select extract.
data3.head(n=5)

Unnamed: 0,sexe,age,diplome,commune_residence,poids
0,2,9,0,44109,1.113611
1,2,7,1,44109,0.913017
2,2,2,7,44109,0.87511
3,1,2,5,44109,0.588311
4,2,1,1,44109,0.709778


In [132]:
# (row count, column count) of the S3 Select extract.
data3.shape

(1415, 5)

#### Les moteurs de calcul distribué

- Spark
- Trino