# Objective
Create a notebook using JAI to solve a classification problem. In this first attempt we will use the Iris dataset. This dataset contains 150 rows, each with 4 flower characteristics, which will be used to classify each flower as one of 3 iris species (Setosa, Versicolour, and Virginica). Have fun! And if you have any doubts, check our documentation or ask us on our Slack =].

# Imports 

In [1]:
import numpy as np
import pandas as pd
from sklearn import metrics
from tabulate import tabulate
from sklearn.datasets import load_iris
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Generating your JAI account (if you don't have it already)

In [None]:
# ATTENTION: If you haven't generated your key yet, just run the command below
#Jai.get_auth_key(email='email@mail.com', firstName='Jai', lastName='Z')

# Instantiating JAI

In [2]:
# Create the JAI client; `j` is the handle used by every JAI call in the rest of this notebook
from jai import Jai
j = Jai()

# Loading the dataset and checking basic information

In [3]:
# Fetch the Iris dataset a single time and reuse it for both features and labels.
# With as_frame=True the features already come back as a pandas DataFrame,
# so no extra pd.DataFrame(...) wrapping is needed.
iris = load_iris(as_frame=True)
df = iris.data        # the four flower measurements, one row per flower
target = iris.target  # class label of each row (the column we will predict)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
# Same preview of the first rows, rendered as a plain-text (reStructuredText) table
preview_table = tabulate(df.head(), headers='keys', tablefmt='rst')
print(preview_table)

In [None]:
# (number of rows, number of columns) of the feature table
df.shape

In [None]:
# Summary statistics for each feature column
df.describe()

In [None]:
# Column dtypes and non-null counts
df.info()

In [None]:
# Number of rows per class label
target.value_counts()

# Classification model 

In [4]:
# `df` and `target` were already loaded in the dataset cell at the top of the
# notebook, so they are reused here instead of fetching the dataset two more times.
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df, target, test_size=0.3, random_state=42
)

In [5]:
# Build the training frame passed to j.fit: the feature columns plus the
# label column, placed side by side (rows are matched by index).
train = pd.concat(objs=[X_train, y_train], axis="columns")

Now we are going to train, test and validate our model with **j.fit**. This will create a **collection** inside JAI containing one **vector for each row** of our training dataset. These vectors are numerical representations of each row that compress its information and extract its most important characteristics; rows (examples) that are similar will have vectors that are close to each other in the vector space =].

In [None]:
# Label settings: with task "classification" JAI uses Cross Entropy Loss
# (see the JAI documentation for details); "label_name" is the column to predict.
label_settings = {"task": "classification",
                  "label_name": "target"}

j.fit(
    # Name under which the collection is stored inside JAI
    name="iris_supervised",
    # Training data; must always be a DataFrame, even when it has a single column
    data=train,
    # Type of model. Other available options: SelfSupervised, Text, FastText, TextEdit, Image
    db_type='Supervised',
    # Uncomment to overwrite a collection that already exists under this name
    # overwrite=True,
    label=label_settings,
    # verbose=2 shows the loss graph as well as the metric results
    verbose=2,
)

# Checking your collection information

In [None]:
# List every collection in your subscription, together with some information about each one
j.info

In [None]:
# Download the vectors generated for the collection trained above.
# The name must match the `name` given to j.fit ('iris_supervised'); the
# previously used 'cc_fraud_supervised' is never created in this notebook.
# If the collection has many vectors, this can take a while.
vectors = j.download_vectors('iris_supervised')

In [None]:
# Number of vectors downloaded (the collection holds one vector per training row)
len(vectors)

In [None]:
# Peek at the first vector
vectors[0]

In [None]:
# Length of a single vector (the default size for Supervised collections is 64)
len(vectors[0])

**Hurray \0/!!!** Now your model is already deployed and ready to be consumed by your applications. Below we show two ways to apply your model to new data =].

# Making predictions and analysing the results

## Predictions without predict_proba

In [6]:
# Predict the class of every test row with the deployed collection.
# Without predict_proba only the predicted class is returned.
# (Reportedly a 0.5 threshold is used to pick the class - TODO confirm how
# that applies to this three-class problem.)
ans = j.predict(
    name='iris_supervised',  # collection to be queried
    data=X_test,             # rows to classify as Setosa, Versicolour or Virginica
    as_frame=True,           # return the answer as a DataFrame
)

Predict: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.59s/it]
Predict Processing: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 199096.71it/s]


In [7]:
# ATTENTION: JAI always returns the answers ordered by id!
# Attaching y_test as a Series (not as raw values) pairs each label with its
# row by index, which avoids mismatches.
ans = ans.assign(y_true=y_test)

In [8]:
# First predictions next to the true labels, as a plain-text table
predictions_table = tabulate(ans.head(), headers='keys', tablefmt='rst')
print(predictions_table)

  id    predict    y_true
   4          0         0
   9          0         0
  10          0         0
  11          0         0
  12          0         0


In [9]:
# Per-class precision, recall and F1-score on the held-out rows
report = metrics.classification_report(
    y_true=ans["y_true"],
    y_pred=ans["predict"],
    target_names=['0', '1', '2'],
)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        19
           1       1.00      1.00      1.00        13
           2       1.00      1.00      1.00        13

    accuracy                           1.00        45
   macro avg       1.00      1.00      1.00        45
weighted avg       1.00      1.00      1.00        45



## Predictions using predict_proba

In [10]:
# Same query as before, this time also asking for the predicted probabilities
ans = j.predict(
    name='iris_supervised',  # collection to be queried
    data=X_test,             # rows to classify as Setosa, Versicolour or Virginica
    predict_proba=True,      # also return the probability predicted for each class
    as_frame=True,           # return the answer as a DataFrame
)

Predict: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:02<00:00,  2.36s/it]
Predict Processing: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 45/45 [00:00<00:00, 25042.28it/s]


In [11]:
# ATTENTION: JAI always returns the answers ordered by id!
# Attaching y_test as a Series (not as raw values) pairs each label with its
# row by index, which avoids mismatches.
ans = ans.assign(y_true=y_test)

In [12]:
# First rows of the probability predictions next to the true labels, as a plain-text table
probabilities_table = tabulate(ans.head(), headers='keys', tablefmt='rst')
print(probabilities_table)

  id         0          1          2    predict    probability(%)    y_true
   4  0.967401  0.0158325  0.0167661          0             96.74         0
   9  0.975747  0.0116164  0.0126364          0             97.57         0
  10  0.962914  0.0186806  0.0184058          0             96.29         0
  11  0.969209  0.0147728  0.0160187          0             96.92         0
  12  0.977361  0.0108368  0.0118019          0             97.74         0


In [13]:
# One-vs-rest ROC AUC computed from the per-class probability columns
class_probabilities = ans[["0", "1", "2"]].to_numpy()
roc_auc_score(ans["y_true"], class_probabilities, multi_class='ovr')

1.0

Even though this result might scare you, the JAI backend is designed to provide robust performance and prevent overfitting. 

# Model inference via REST API

In [None]:
# Import the requests library
import requests

# Set Authentication header
# NOTE(review): AUTH_KEY is not defined in the cells shown here - assign your
# JAI auth key to it before running this cell.
header={'Auth': AUTH_KEY}

# Set collection name
db_name = 'iris_supervised' 

# Model inference endpoint
url_predict = f"https://mycelia.azure-api.net/predict/{db_name}"

# json body
# Note that we need to provide a column named 'id', so the DataFrame index is
# turned into that column. Only the first rows (head) are sent, as a list of records.
body = X_test.reset_index().rename(columns={'index':'id'}).head().to_dict(orient='records')

# Make the request
ans = requests.put(url_predict, json=body, headers=header)
ans.json()

# Plotting embeddings

In [None]:
# Display images
from IPython.display import Image
from IPython.core.display import HTML 

# Import libraries
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import requests

import plotly.express as px

In [None]:
# Download the vectors learned for the collection (one vector per training row)
vectors = j.download_vectors('iris_supervised')

# Reduce the vectors to 2 dimensions with t-SNE so they can be plotted.
# random_state is fixed so the projection (and the plot) is the same on every run.
stores_embedded = TSNE(n_components=2, learning_rate='auto',
                       init='random', random_state=42).fit_transform(vectors)

# Collection ids, used as the index so the 2-D points can be joined with `train`.
# Assumes download_vectors returns rows in the same order as j.ids - TODO confirm.
ids_list = j.ids(name='iris_supervised', mode='complete')

# Put the 2-D coordinates next to the training data (joined on the row id)
# so features and label are available for hovering and coloring.
embedding_coords = pd.DataFrame(stores_embedded, columns=['x', 'y'], index=ids_list)
df_stores_embedded = pd.concat([train, embedding_coords], axis=1)

# Interactive scatter plot of the learned representation, colored by class label
fig = px.scatter(df_stores_embedded,
                 title='Iris - Learned Representation',
                 x='x',
                 y='y',
                 hover_data=train.columns,
                 color='target')

fig.show()