# Iris Dataset Description

#### Dataset includes three iris species with 50 samples each as well as some properties about each flower. One flower species is linearly separable from the other two, but the other two are not linearly separable from each other.

#### The columns in this dataset are:

- Id
- SepalLengthCm
- SepalWidthCm
- PetalLengthCm
- PetalWidthCm
- Species

## Import the Libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib 
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import joblib
from pyspark.sql import SparkSession
from subprocess import run, Popen, PIPE
from pyspark.sql import DataFrame, SparkSession, Window
from pyspark.sql.functions import col, expr, monotonically_increasing_id, row_number,current_timestamp
import pandas as pd
import numpy as np
from typing import  List
from datetime import datetime
from minio import Minio
import pandas as pd
import uuid
import os
import shutil
import pickle

In [2]:
INCOME_MODEL_PATH="sklearn/iris/model"
EXPLAINER_MODEL_PATH="sklearn/iris/explainer"
OUTLIER_MODEL_PATH="sklearn/iris/outlier"

MINIO_HOST="minio-service.kubeflow:9000"
MINIO_ACCESS_KEY="minio"
MINIO_SECRET_KEY="minio123"
MINIO_MODEL_BUCKET="seldon"

DEPLOY_NAMESPACE="kubeflow"

EVENT_TIMESTAMP_ALIAS = "event_timestamp"
CREATED_TIMESTAMP_ALIAS = "created_timestamp"

STORAGEACCOUNTNAME= "katonicusecases"
STORAGEACCOUNTKEY= "kxGVhR3tKmJoNdEFbhauyHOvBaNMEJR8/uIH+4NKX9QLbHEsEhmo5YQmuiUmSaW2g/96Fq3RrV9f3FeMyizzgg=="    
CONTAINERNAME= "modelbuilding"

In [3]:
def get_minio():
    return Minio(MINIO_HOST,
                    access_key=MINIO_ACCESS_KEY,
                    secret_key=MINIO_SECRET_KEY,
                    secure=False)

def save_to_feature_store(spark,pandas_df,feaurestore_name,STORAGEACCOUNTNAME,CONTAINERNAME):

    pandas_df['Unique_id']=np.random.choice(len(pandas_df), size=len(pandas_df), replace=False)
    pandas_df['event_timestamp']=pd.to_datetime(datetime.now())
    pandas_df['created_timestamp']=pd.to_datetime(datetime.now())
    
    df_var = spark.createDataFrame(pandas_df)
    
    op_path = "stagging" + "/" + feaurestore_name

    op_path = "wasbs://" + CONTAINERNAME + "@" + STORAGEACCOUNTNAME + ".blob.core.windows.net/"+op_path
    
    df_var.write.mode("overwrite").parquet(op_path)
    


def create_entity_df(spark,unique_id,feaurestore_name,STORAGEACCOUNTNAME,CONTAINERNAME):
    
    
    op_path = "stagging" + "/" + feaurestore_name

    op_path = "wasbs://" + CONTAINERNAME + "@" + STORAGEACCOUNTNAME + ".blob.core.windows.net/"+op_path
    
    df_var = spark.read.parquet(op_path)
    
    
    entity_df= df_var.select(unique_id).withColumn('event_timestamp',current_timestamp())
    
    return entity_df

def fetch_df(spark,feaurestore_name,STORAGEACCOUNTNAME,CONTAINERNAME):
    
    #os.mkdir('fs_logs')

    op_path = "stagging" + "/" + feaurestore_name

    op_path = "wasbs://" + CONTAINERNAME + "@" + STORAGEACCOUNTNAME + ".blob.core.windows.net/"+op_path
    
    df_var = spark.read.parquet(op_path)
    
    #df_var = spark.createDataFrame(data)
    
    #shutil.rmtree('fs_logs')
    return df_var

def create_df_feature(spark,path_dict,feature_dict,STORAGEACCOUNTNAME,CONTAINERNAME):
    df_list = []
    f_list  = []

    for fs,obj,features in zip(path_dict.keys(),path_dict.values(),feature_dict.values()):
        #df = fetch_df(spark,fs,obj,STORAGEACCOUNTNAME,CONTAINERNAME)
        df = fetch_df(spark,fs,STORAGEACCOUNTNAME,CONTAINERNAME)
        df_list.append(df)
        f_list.append(features)
    return [df_list,f_list]

def as_of_join(
    entity_df: DataFrame,
    feature_table_entity_names : list,
    feature_table_df : DataFrame,
    feature_list : list,
    feature_table_name : str,
    max_age = [],
    entity_event_timestamp_column = 'event_timestamp'

    ) -> DataFrame:
    #print (feature_list)
    #print(type(feature_list))
    feature_table_df = feature_table_df.select(feature_list+[EVENT_TIMESTAMP_ALIAS,CREATED_TIMESTAMP_ALIAS,feature_table_entity_names[0]])
    entity_with_id = entity_df.withColumn("_row_nr", monotonically_increasing_id())
    feature_event_timestamp_column_with_prefix = (
        f"{feature_table_name}__{EVENT_TIMESTAMP_ALIAS}"
        )
    feature_created_timestamp_column_with_prefix = (
        f"{feature_table_name}__{CREATED_TIMESTAMP_ALIAS}"
        )

    projection = [
        col(col_name).alias(f"{feature_table_name}__{col_name}")
        for col_name in feature_table_df.columns
        ]

    aliased_feature_table_df = feature_table_df.select(projection)
    
    join_cond = (
    entity_with_id[entity_event_timestamp_column]
        >= aliased_feature_table_df[feature_event_timestamp_column_with_prefix]
    )
    if max_age:
        join_cond = join_cond & (
        aliased_feature_table_df[feature_event_timestamp_column_with_prefix]
        >= entity_with_id[entity_event_timestamp_column]
        - expr(f"INTERVAL {max_age[0]} seconds")
        )
    for key in feature_table_entity_names:
        join_cond = join_cond & (
        entity_with_id[key]
        == aliased_feature_table_df[f"{feature_table_name}__{key}"]
        )
    conditional_join = entity_with_id.join(
        aliased_feature_table_df, join_cond, "leftOuter"
        )
    for key in feature_table_entity_names:
        conditional_join = conditional_join.drop(
        aliased_feature_table_df[f"{feature_table_name}__{key}"]
        )
    window = Window.partitionBy("_row_nr", *feature_table_entity_names).orderBy(
        col(feature_event_timestamp_column_with_prefix).desc(),
        col(feature_created_timestamp_column_with_prefix).desc(),
        )
    filter_most_recent_feature_timestamp = conditional_join.withColumn(
        "_rank", row_number().over(window)
        ).filter(col("_rank") == 1)
    return filter_most_recent_feature_timestamp.select(
        entity_df.columns
        + [
            f"{feature_table_name}__{feature}"
            for feature in feature_list
        ]
    )
    

def retrieve_feature(
    entity_df: DataFrame,
    feature_table_dfs:List[DataFrame],
    feature_lists :List[list],
    feature_table_names:list,
    feature_table_entity_names : List[str],
    max_age=[],
    entity_event_timestamp_column='event_timestamp',
    ) -> DataFrame :
    
    joined_df = entity_df

    for (feature_table_df, feature_list,feature_table_name) in zip(feature_table_dfs, feature_lists,feature_table_names ):
            joined_df = as_of_join(
                joined_df, feature_table_entity_names,feature_table_df, feature_list,feature_table_name,
            max_age = max_age,
        entity_event_timestamp_column = entity_event_timestamp_column)
    
    return joined_df

In [4]:
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.hadoop:hadoop-azure:2.7.3,com.microsoft.azure:azure-storage:2.2.0,org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell'
spark =  SparkSession.builder.master("local[*]").getOrCreate()
spark._jsc.hadoopConfiguration().set("fs.azure", "org.apache.hadoop.fs.azure.NativeAzureFileSystem")
spark.conf.set("fs.wasbs.impl","org.apache.hadoop.fs.azure.NativeAzureFileSystem")
spark.conf.set('fs.azure.account.key.' + STORAGEACCOUNTNAME + '.blob.core.windows.net', STORAGEACCOUNTKEY)

In [5]:
iris= fetch_df(spark,'iris',STORAGEACCOUNTNAME,CONTAINERNAME).toPandas().drop(['Unique_id', 'event_timestamp','created_timestamp'],axis = 1)

In [6]:
iris.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Model Training

- train = 70
- test = 30

##### Logistic Regression

In [7]:
X = iris.drop(columns = ['Species'])
Y = iris['Species']
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X, Y, test_size=0.30)

In [8]:
logreg = LogisticRegression(solver='lbfgs', max_iter=110)
lr = logreg.fit(X_train_lr, y_train_lr)
acc_log = round(logreg.score(X_train_lr, y_train_lr) * 100, 2)
acc_log

98.1

##### Random Forest

In [9]:
X = iris.drop(columns = ['Species'])
Y = iris['Species']
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, Y, test_size=0.30)

In [10]:
random_forest = RandomForestClassifier(n_estimators=100)
rf = random_forest.fit(X_train_rf, y_train_rf)
acc_random_forest = round(random_forest.score(X_train_rf, y_train_rf) * 100, 2)
acc_random_forest

100.0

##### Gaussian Naive Bayes

In [11]:
X = iris.drop(columns = ['Species'])
Y = iris['Species']
X_train_gnb, X_test_gnb, y_train_gnb, y_test_gnb = train_test_split(X, Y, test_size=0.30)

In [12]:
gaussian = GaussianNB()
gnb = gaussian.fit(X_train_gnb, y_train_gnb)
acc_gaussian = round(gaussian.score(X_train_gnb, y_train_gnb) * 100, 2)
acc_gaussian

94.29

##### Support Vector Machine

In [13]:
X = iris.drop(columns = ['Species'])
Y = iris['Species']
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(X, Y, test_size=0.30)

In [14]:
linear_svc = SVC(gamma='auto')
svm = linear_svc.fit(X_train_svm, y_train_svm)
acc_linear_svc = round(linear_svc.score(X_train_svm, y_train_svm) * 100, 2)
acc_linear_svc

99.05

##### Decision Tree

In [15]:
X = iris.drop(columns = ['Species'])
Y = iris['Species']
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(X, Y, test_size=0.30)

In [16]:
decision_tree = DecisionTreeClassifier()
dt = decision_tree.fit(X_train_dt, y_train_dt)
acc_decision_tree = round(decision_tree.score(X_train_dt, y_train_dt) * 100, 2)
acc_decision_tree

100.0

##### Preceptron

In [17]:
X = iris.drop(columns = ['Species'])
Y = iris['Species']
X_train_per, X_test_per, y_train_per, y_test_per = train_test_split(X, Y, test_size=0.30)

In [18]:
perceptron = Perceptron()
per = perceptron.fit(X_train_per, y_train_per)
acc_perceptron = round(perceptron.score(X_train_per, y_train_per) * 100, 2)
acc_perceptron

80.95

##### K Nearest Neighbour

In [19]:
X = iris.drop(columns = ['Species'])
Y = iris['Species']
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X, Y, test_size=0.30)

In [20]:
knearestneighbour = KNeighborsClassifier()
knn = knearestneighbour.fit(X_train_knn, y_train_knn)
acc_knearest = round(knearestneighbour.score(X_train_knn, y_train_knn) * 100, 2)
acc_knearest

97.14

##### Stochastic Gradient Descent

In [21]:
X = iris.drop(columns = ['Species'])
Y = iris['Species']
X_train_sgd, X_test_sgd, y_train_sgd, y_test_sgd = train_test_split(X, Y, test_size=0.30)

In [22]:
stochasticgrad = SGDClassifier()
sgd = stochasticgrad.fit(X_train_sgd, y_train_sgd)
acc_sgd = round(stochasticgrad.score(X_train_sgd, y_train_sgd) * 100, 2)
acc_sgd

65.71

##### Gradient Boosting Classifier

In [23]:
X = iris.drop(columns = ['Species'])
Y = iris['Species']
X_train_gbc, X_test_gbc, y_train_gbc, y_test_gbc = train_test_split(X, Y, test_size=0.30)

In [24]:
gradientboostingclassifier = GradientBoostingClassifier()
gbc = gradientboostingclassifier.fit(X_train_gbc, y_train_gbc)
acc_gbc = round(gradientboostingclassifier.score(X_train_gbc, y_train_gbc) * 100, 2)
acc_gbc

100.0

## Evaluating the Models

In [25]:
results = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'Gaussian Naive Bayes', 'Support Vector Machine', 'Decision Tree', 'Preceptron', 'KNearest Neighbour', 'Stochastic Gradient Descent', 'Gradient Boosting Classifier'],
    'Score': [acc_log, acc_random_forest, acc_gaussian, acc_linear_svc, acc_decision_tree, acc_perceptron, acc_knearest, acc_sgd, acc_gbc],
    'Model_abb': [lr, rf, gnb, svm, dt, per, knn, sgd, gbc]})
result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
print(result_df)

                               Model  \
Score                                  
100.00                 Random Forest   
100.00                 Decision Tree   
100.00  Gradient Boosting Classifier   
99.05         Support Vector Machine   
98.10            Logistic Regression   
97.14             KNearest Neighbour   
94.29           Gaussian Naive Bayes   
80.95                     Preceptron   
65.71    Stochastic Gradient Descent   

                                                Model_abb  
Score                                                      
100.00  (DecisionTreeClassifier(max_features='auto', r...  
100.00                           DecisionTreeClassifier()  
100.00  ([DecisionTreeRegressor(criterion='friedman_ms...  
99.05                                   SVC(gamma='auto')  
98.10                    LogisticRegression(max_iter=110)  
97.14                              KNeighborsClassifier()  
94.29                                        GaussianNB()  
80.95              

In [26]:
best_model = result_df['Model_abb'].iloc[0]
best_model

RandomForestClassifier()

In [27]:
joblib.dump(best_model, 'model.joblib')
# print(get_minio().fput_object(MINIO_MODEL_BUCKET, f"{INCOME_MODEL_PATH}/model.joblib", 'model.joblib'))

['model.joblib']

In [28]:
Model_job = joblib.load("model.joblib")
Model_job

RandomForestClassifier()

In [29]:
type(Model_job)

sklearn.ensemble._forest.RandomForestClassifier

In [30]:
type(X_test_rf)

pandas.core.frame.DataFrame

In [31]:
pred = X_test_rf.values[44].reshape(1, -1)
pred

array([[5. , 3.6, 1.4, 0.2]])

In [32]:
Model_job.predict(pred)

array([0])

In [33]:
print(get_minio().fput_object(MINIO_MODEL_BUCKET, f"{INCOME_MODEL_PATH}/model.joblib", 'model.joblib'))

<minio.helpers.ObjectWriteResult object at 0x7fa732cf7550>
