In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# --- Configuration ---------------------------------------------------------
# Everything a reader might want to tune is named once here instead of being
# repeated as bare literals further down.
DATA_PATH = 'iris.csv'
TARGET_COLUMN = 'species'
TEST_SIZE = 0.3                  # fraction of the rows held out for evaluation
RANDOM_STATE = 23                # shared by the split and the classifier
MODEL_PATH = '../app/iris.mdl'   # location of the persisted, fitted pipeline

# --- Load data -------------------------------------------------------------
data = pd.read_csv(DATA_PATH)

# The target is the species column; every remaining column is used as a feature.
y = data[TARGET_COLUMN]
X = data.drop(columns=[TARGET_COLUMN])

# --- Train/test split ------------------------------------------------------
# Hold out 30% of the rows as test data, with a random state of 23 so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# --- Preprocessing ---------------------------------------------------------
# Create a median imputer.
imputer = SimpleImputer(strategy='median')

# Create a ColumnTransformer that applies the imputer to all feature columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('imputer', imputer, X.columns)
    ]
)

# --- Model -----------------------------------------------------------------
# Create a Random Forest classifier (with random state 23).
clf = RandomForestClassifier(random_state=RANDOM_STATE)

# Chain the ColumnTransformer (as preprocessor) and the Random Forest
# (as classifier) into one pipeline. Because the imputer lives inside the
# pipeline, it is fitted on the training rows only when the pipeline is fitted.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', clf)
])

# --- Train and evaluate ----------------------------------------------------
# Train the pipeline on the training data.
pipe.fit(X_train, y_train)

# Predict the results on the test data.
y_pred = pipe.predict(X_test)

# Print a classification report.
print(metrics.classification_report(y_test, y_pred))

# --- Persist ---------------------------------------------------------------
# Save the pipeline in the app folder. The folder is created first if it is
# missing, because joblib.dump does not create parent directories and would
# otherwise fail on a fresh checkout.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# NOTE: joblib files are pickle-based; only load them from trusted sources.
# The dump call stays the last expression so the written path is displayed.
joblib.dump(pipe, MODEL_PATH)

                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        18
Iris-versicolor       0.93      1.00      0.97        14
 Iris-virginica       1.00      0.92      0.96        13

       accuracy                           0.98        45
      macro avg       0.98      0.97      0.98        45
   weighted avg       0.98      0.98      0.98        45



['../app/iris.mdl']