# Préambule

À des fins d'évaluation, les données générées (features et prédictions), hébergées sur les serveurs du service Google Cloud Storage, ont été extraites en local grâce à la [CLI Google Cloud](https://cloud.google.com/sdk/docs), un ensemble d'outils permettant de gérer en ligne de commande les ressources et les applications hébergées sur Google Cloud.

# Librairies

In [None]:
import pandas as pd
import os

# Config

In [2]:
# Local root of the extracted data; the 'features' and 'predictions'
# sub-folders are read from here.
STORAGE_PATH = "../data/fruits-classification"

# Folder that receives the exported CSV file.
OUTPUT_PATH = "../Déployez_un_modèle_dans le cloud_Guillaume_Léo"

# Import data

In [3]:
# Recursively gather every '.parquet' file stored below the 'features' folder,
# keeping the order in which os.walk yields them.
features_root = os.path.join(STORAGE_PATH, 'features')
file_list = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(features_root)
    for name in names
    if name.endswith('.parquet')
]

print(f'Number of files: {len(file_list)}')

Number of files: 555


In [4]:
# Read each feature file into its own frame, then stack them into a single
# table with a fresh 0..n-1 index.
feature_frames = []
for parquet_path in file_list:
    feature_frames.append(pd.read_parquet(parquet_path))
stack_df = pd.concat(feature_frames, ignore_index=True)

print(f'Dimension of data: {stack_df.shape}')

Dimension of data: (6676, 8)


In [5]:
# Tag every row with its dataset (train, test or validation), taken from the
# third-from-last '/'-separated component of the file path.
stack_df['set'] = [image_path.split('/')[-3] for image_path in stack_df['path']]

In [6]:
# List the '.parquet' files sitting directly in the 'predictions' folder
# (no recursion) and stack them into one table.
predictions_dir = os.path.join(STORAGE_PATH, 'predictions')
file_list = []
for entry in os.listdir(predictions_dir):
    if entry.endswith('.parquet'):
        file_list.append(os.path.join(predictions_dir, entry))

prediction_frames = []
for parquet_path in file_list:
    prediction_frames.append(pd.read_parquet(parquet_path))
stack_pred_df = pd.concat(prediction_frames, ignore_index=True)

print(f'Dimension of predictions data: {stack_pred_df.shape}')

Dimension of predictions data: (2140, 12)


In [7]:
# Attach the 'prediction' column to the feature table, matching rows on 'path'.
# The outer join keeps paths that appear in only one of the two tables.
prediction_by_path = stack_pred_df[['path', 'prediction']]
stack_df = pd.merge(stack_df, prediction_by_path, how='outer', on='path')

# Cleaning

In [8]:
# Replace each predicted class index by its text label.
# The lookup is built from the distinct (labelIndex, label) pairs in the table.
label_pairs = stack_df[['labelIndex', 'label']].drop_duplicates()
labelIndex = dict(label_pairs.values)

# Missing predictions are left untouched; any other value must be a known index.
stack_df['prediction'] = [
    value if pd.isnull(value) else labelIndex[value]
    for value in stack_df['prediction']
]

In [12]:
# Preview five random rows; the fixed seed makes the displayed sample
# identical on every Restart & Run All.
stack_df[['label', 'features']].sample(5, random_state=42)

Unnamed: 0,label,features
5977,apple,"[0.0, 0.12709525, 0.0, 2.9404461, 0.0, 0.0, 4...."
5083,cabbage,"[0.0, 0.0, 0.0, 0.1813078, 0.0, 0.0, 0.4224908..."
4982,eggplant,"[0.0, 0.0, 1.6811758, 0.0, 0.0, 0.98431325, 0...."
5675,apple,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.1742842, 0.0,..."
818,apple,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


# Saving

In [10]:
# Export the selected columns of the merged table to 'features_data.csv'.
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# NOTE(review): 'features' / 'pca_features' appear to hold array-like values;
# CSV stores them as their string representation — confirm this is sufficient
# for downstream consumers.
stack_df.to_csv(
    path_or_buf=os.path.join(OUTPUT_PATH, 'features_data.csv'),
    columns=['path', 'set', 'label', 'prediction', 'features', 'pca_features'],
    index=False,
)