# Machine Learning

Develop a decision tree classifier to predict the species of a penguin given the penguin's:
- Bill Length
- Flipper Length
- Body Mass

## Import the necessary data (`pandas`) and machine learning (`sklearn`) libraries

In [49]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.metrics import confusion_matrix
### To deal with missing values
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeClassifier

## Load the cleaned penguins data and create a training/testing data split

In [44]:
# Load the cleaned penguins data
penguins = pd.read_csv("../data/cleaned/penguins_cleaned.csv")

# Measurements used as model inputs
feature_columns = ["bill_length_mm", "flipper_length_mm", "body_mass_g"]

# Split data into features (2-D frame) and labels (1-D series, the shape
# scikit-learn estimators expect for a single-output target)
X = penguins.loc[:, feature_columns]
y = penguins.loc[:, "species"]

# Create training/testing split: an integer test_size holds out that many
# rows (100) for testing; random_state fixes the shuffle so the split is
# reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=100, random_state=0)

## Impute missing values

In [45]:
# Iterative imputer used to fill in missing feature values; it is fitted
# later as the first step of the classification pipeline.
imp = IterativeImputer(
    random_state=0,
    max_iter=10,
)

## Train decision tree classifier

In [None]:
# Create a pipeline of estimators: the imputer runs first, then the decision
# tree classifier is trained on the imputed features.
# random_state is fixed (matching the seeded split and imputer) because the
# tree shuffles features when searching for splits; without a seed the fitted
# tree, and therefore the reported results, can differ from run to run.
clf = make_pipeline(imp, DecisionTreeClassifier(random_state=0))
clf.fit(X_train, y_train)  # fits in place; no re-assignment needed

# Test the trained model using the test set
y_pred = clf.predict(X_test)

## Print the classifier results

### Confusion matrix

In [51]:
# Species labels, listed once so the matrix rows and columns stay in sync.
species_labels = ['Adelie', 'Chinstrap', 'Gentoo']

# scikit-learn's confusion matrix puts the true classes on the rows and the
# predicted classes on the columns; name both axes so the table is unambiguous.
pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=species_labels),
    index=pd.Index(species_labels, name='Actual'),
    columns=pd.Index(species_labels, name='Predicted'))

Unnamed: 0,Adelie,Chinstrap,Gentoo
Adelie,47,1,0
Chinstrap,1,13,2
Gentoo,2,0,34


### Classifier score

In [52]:
# Score the fitted pipeline on the held-out test set and print the result
test_accuracy = clf.score(X_test, y_test)
print(test_accuracy)

0.94
