In [1]:
import csv
import json
import numpy as np

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Lire la donnée

In [2]:
# Load every row of the raw iris CSV as a dict keyed by the header names.
# newline="" lets the csv module handle line endings itself, as its
# documentation requires for files passed to csv readers.
with open("../data/raw/iris.csv", "r", newline="") as f:
    iris = list(csv.DictReader(f))

# Transformer la donnée

In [3]:
# Name of the CSV column holding the label; every other column is a feature.
Y_col = "class"

# Fail with a clear message instead of an opaque IndexError on iris[0] below.
if not iris:
    raise ValueError("iris dataset is empty: no rows were read from the CSV file")

# Compare with != rather than `not in`: `col not in Y_col` is a substring test
# against the string "class" and would silently drop any column whose name
# happens to be a substring of it.
feature_names = [col for col in iris[0].keys() if col != Y_col]

# Sort the distinct labels so the label -> integer mapping is the same on
# every run (iteration order of a set of strings is not stable across runs).
names = sorted(set(row[Y_col] for row in iris))
class_index = {name: idx for idx, name in enumerate(names)}

# X: one row per sample, one float column per feature (in feature_names order).
# y: integer position of each sample's label inside `names`.
X = np.array([[float(row[feature]) for feature in feature_names] for row in iris])
y = np.array([class_index[row[Y_col]] for row in iris])

In [4]:
# One-hot encode the integer labels: one output column per distinct class.
enc = OneHotEncoder()
Y = enc.fit_transform(y.reshape(-1, 1)).toarray()

# Split the raw features first (same test_size / random_state as before) so
# that the scaler below can be fitted without ever seeing the test samples.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.5, random_state=2
)

# Scale data to have mean 0 and variance 1
# which is important for convergence of the neural network.
# The scaler is fitted on the training split only; fitting it on the whole
# dataset would leak test-set statistics into the training features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any later cell referring to X_scaled still works; it is the
# full dataset transformed with the statistics learned on the training split.
X_scaled = scaler.transform(X)

n_features = X.shape[1]
n_classes = Y.shape[1]

# Ecrire la donnée

In [5]:
# Gather the train/test splits (converted to plain nested lists so they are
# JSON-serialisable) together with the problem dimensions.
training_data = dict(
    X_train=X_train.tolist(),
    X_test=X_test.tolist(),
    Y_train=Y_train.tolist(),
    Y_test=Y_test.tolist(),
    n_features=n_features,
    n_classes=n_classes,
)

In [6]:
# Write the training payload as pretty-printed JSON.
# Plain "w" is enough (the file is never read back here), and the encoding is
# pinned to UTF-8 because ensure_ascii=False emits non-ASCII characters as-is,
# which would otherwise depend on the platform's default encoding.
with open("../data/processed/iris_training_data.json", "w", encoding="utf-8") as f:
    json.dump(training_data, f, ensure_ascii=False, indent=4)

In [9]:
# Payload for the visualisation step: the unscaled feature matrix and integer
# labels as plain lists, plus the class names and feature names.
visualisation_data = dict(
    X=X.tolist(),
    y=y.tolist(),
    names=names,
    feature_names=feature_names,
)

In [10]:
# Write the visualisation payload as pretty-printed JSON.
# Plain "w" is enough (the file is never read back here), and the encoding is
# pinned to UTF-8 because ensure_ascii=False emits non-ASCII characters as-is,
# which would otherwise depend on the platform's default encoding.
with open("../data/processed/iris_visualisation_data.json", "w", encoding="utf-8") as f:
    json.dump(visualisation_data, f, ensure_ascii=False, indent=4)