# Automobile Dataset Description

#### This dataset consists of data from the 1985 Ward's Automotive Yearbook.

#### This data set consists of three types of entities: 
 - (a) The specification of an auto in terms of various characteristics
 - (b) Its assigned insurance risk rating
 - (c) Its normalized losses in use as compared to other cars. 

#### The second rating corresponds to the degree to which the auto is more risky than its price indicates. Cars are initially assigned a risk factor symbol associated with their price. Then, if a car is more risky (or less risky), this symbol is adjusted by moving it up (or down) the scale. Actuaries call this process "symboling". A value of +3 indicates that the auto is risky, -3 that it is probably pretty safe.

#### The third factor is the relative average loss payment per insured vehicle year. This value is normalized for all autos within a particular size classification (two-door small, station wagons, sports/specialty, etc.), and represents the average loss per car per year.

## Importing the Libraries

In [1]:
# conda install cudatoolkit
#!pip install alibi

In [2]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
#import seaborn as sns
# from matplotlib import pyplot as plt
# from matplotlib import style

from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
#from alibi.explainers import AnchorTabular
import joblib

from pyspark.sql import SparkSession


#from numba import jit, cuda



from subprocess import run, Popen, PIPE

from pyspark.sql import DataFrame, SparkSession, Window
from pyspark.sql.functions import col, expr, monotonically_increasing_id, row_number,current_timestamp
# from data_io import *
import pandas as pd
import numpy as np
from typing import  List
from datetime import datetime
from minio import Minio
# from minio.error import ResponseError

#from azure.storage.blob import BlockBlobService
import pandas as pd

import uuid
import os
import shutil
import pickle

In [3]:
# Object-store prefixes under which the notebook's artefacts are kept.
INCOME_MODEL_PATH="sklearn/automobile/model"
EXPLAINER_MODEL_PATH="sklearn/automobile/explainer"
OUTLIER_MODEL_PATH="sklearn/automobile/outlier"

# MinIO connection settings.
# SECURITY: credentials should come from the environment, not from source
# control. The literals below are kept only as fallbacks so the notebook still
# runs unchanged when the variables are not set.
MINIO_HOST="minio-service.kubeflow:9000"
MINIO_ACCESS_KEY=os.environ.get("MINIO_ACCESS_KEY", "minio")
MINIO_SECRET_KEY=os.environ.get("MINIO_SECRET_KEY", "minio123")
MINIO_MODEL_BUCKET="seldon"

DEPLOY_NAMESPACE="kubeflow"

# Column names used for the timestamps stored with each feature row.
EVENT_TIMESTAMP_ALIAS = "event_timestamp"
CREATED_TIMESTAMP_ALIAS = "created_timestamp"

# Azure Blob Storage location of the feature data.
# SECURITY: the fallback account key below has been committed in plain text;
# it should be rotated and then supplied via the STORAGEACCOUNTKEY environment
# variable, after which the literal can be removed.
STORAGEACCOUNTNAME= "katonicusecases"
STORAGEACCOUNTKEY= os.environ.get(
    "STORAGEACCOUNTKEY",
    "kxGVhR3tKmJoNdEFbhauyHOvBaNMEJR8/uIH+4NKX9QLbHEsEhmo5YQmuiUmSaW2g/96Fq3RrV9f3FeMyizzgg==",
)
CONTAINERNAME= "modelbuilding"
BLOBNAME= "automobile_data.csv"

In [4]:
def get_minio():
    """Return a MinIO client built from the module-level MINIO_* settings.

    The client is created with ``secure=False``.
    """
    client = Minio(
        MINIO_HOST,
        access_key=MINIO_ACCESS_KEY,
        secret_key=MINIO_SECRET_KEY,
        secure=False,
    )
    return client

def save_to_feature_store(spark,pandas_df,feaurestore_name,STORAGEACCOUNTNAME,CONTAINERNAME):
    """Stamp ``pandas_df`` with id/timestamp columns and write it out as Parquet.

    Three columns are added to ``pandas_df`` in place: ``Unique_id`` (a random
    ordering of ``0 .. len(pandas_df) - 1`` without repeats) plus
    ``event_timestamp`` and ``created_timestamp`` (the current time). The frame
    is then converted with ``spark.createDataFrame`` and written, overwriting
    any existing data, to
    ``wasbs://<CONTAINERNAME>@<STORAGEACCOUNTNAME>.blob.core.windows.net/stagging/<feaurestore_name>``.

    Returns nothing.
    """
    row_count = len(pandas_df)
    pandas_df['Unique_id'] = np.random.choice(row_count, size=row_count, replace=False)
    pandas_df['event_timestamp'] = pd.to_datetime(datetime.now())
    pandas_df['created_timestamp'] = pd.to_datetime(datetime.now())

    spark_df = spark.createDataFrame(pandas_df)

    relative_path = "/".join(("stagging", feaurestore_name))
    target_uri = "".join((
        "wasbs://",
        CONTAINERNAME,
        "@",
        STORAGEACCOUNTNAME,
        ".blob.core.windows.net/",
        relative_path,
    ))

    writer = spark_df.write.mode("overwrite")
    writer.parquet(target_uri)
    


def create_entity_df(spark,unique_id,feaurestore_name,STORAGEACCOUNTNAME,CONTAINERNAME):
    """Build the entity frame: the id column plus a current ``event_timestamp``.

    Reads the Parquet data stored at
    ``wasbs://<CONTAINERNAME>@<STORAGEACCOUNTNAME>.blob.core.windows.net/stagging/<feaurestore_name>``,
    keeps only the ``unique_id`` column and adds an ``event_timestamp`` column
    populated by Spark's ``current_timestamp()``.

    Returns the resulting Spark DataFrame.
    """
    relative_path = "/".join(("stagging", feaurestore_name))
    source_uri = "".join((
        "wasbs://",
        CONTAINERNAME,
        "@",
        STORAGEACCOUNTNAME,
        ".blob.core.windows.net/",
        relative_path,
    ))

    stored_df = spark.read.parquet(source_uri)
    ids_only = stored_df.select(unique_id)
    return ids_only.withColumn('event_timestamp', current_timestamp())

def fetch_df(spark,feaurestore_name,STORAGEACCOUNTNAME,CONTAINERNAME):
    """Load a stored feature table as a Spark DataFrame.

    The Parquet data is read from
    ``wasbs://<CONTAINERNAME>@<STORAGEACCOUNTNAME>.blob.core.windows.net/stagging/<feaurestore_name>``
    and returned as-is.
    """
    relative_path = "/".join(("stagging", feaurestore_name))
    source_uri = "".join((
        "wasbs://",
        CONTAINERNAME,
        "@",
        STORAGEACCOUNTNAME,
        ".blob.core.windows.net/",
        relative_path,
    ))
    return spark.read.parquet(source_uri)

def create_df_feature(spark,path_dict,feature_dict,STORAGEACCOUNTNAME,CONTAINERNAME):
    """Load one feature table per ``path_dict`` key and pair it with its features.

    The keys of ``path_dict`` are used as feature-store names for ``fetch_df``;
    its values are not used. The values of ``feature_dict`` are taken, in
    order, as the feature lists that go with each loaded table.

    Returns a two-element list ``[dataframes, feature_lists]``.
    """
    triples = list(zip(path_dict.keys(), path_dict.values(), feature_dict.values()))

    loaded_tables = [
        fetch_df(spark, store_name, STORAGEACCOUNTNAME, CONTAINERNAME)
        for store_name, _, _ in triples
    ]
    wanted_features = [features for _, _, features in triples]

    return [loaded_tables, wanted_features]

def as_of_join(
    entity_df: DataFrame,
    feature_table_entity_names : list,
    feature_table_df : DataFrame,
    feature_list : list,
    feature_table_name : str,
    max_age = [],
    entity_event_timestamp_column = 'event_timestamp'

    ) -> DataFrame:
    """Attach to each entity row the latest feature row at or before its timestamp.

    Args:
        entity_df: Rows to enrich; must contain every column named in
            ``feature_table_entity_names`` and ``entity_event_timestamp_column``.
        feature_table_entity_names: Key column names shared by both frames.
        feature_table_df: Feature table; must contain the key columns, the
            requested features and the ``EVENT_TIMESTAMP_ALIAS`` /
            ``CREATED_TIMESTAMP_ALIAS`` columns.
        feature_list: Feature column names to bring across.
        feature_table_name: Prefix for output columns, which are named
            ``<feature_table_name>__<feature>``.
        max_age: Optional sequence; when non-empty, its first element is the
            maximum age in seconds a feature row may have relative to the
            entity timestamp. The default list is only read, never mutated.
        entity_event_timestamp_column: Name of the entity timestamp column.

    Returns:
        A DataFrame with the columns of ``entity_df`` followed by the prefixed
        feature columns. The join is a left outer join, so entity rows without
        a qualifying feature row are kept.
    """
    # Every entity key is referenced by the join condition below, so all of
    # them (not only the first) must survive this projection; otherwise a
    # composite key fails on a missing column.
    entity_keys = list(feature_table_entity_names)
    feature_table_df = feature_table_df.select(
        list(feature_list)
        + [EVENT_TIMESTAMP_ALIAS, CREATED_TIMESTAMP_ALIAS]
        + entity_keys
    )

    # Tag each entity row so the ranking below is done per original row.
    entity_with_id = entity_df.withColumn("_row_nr", monotonically_increasing_id())

    feature_event_timestamp_column_with_prefix = (
        f"{feature_table_name}__{EVENT_TIMESTAMP_ALIAS}"
    )
    feature_created_timestamp_column_with_prefix = (
        f"{feature_table_name}__{CREATED_TIMESTAMP_ALIAS}"
    )

    # Prefix every feature-table column so names cannot clash with entity_df.
    projection = [
        col(col_name).alias(f"{feature_table_name}__{col_name}")
        for col_name in feature_table_df.columns
    ]
    aliased_feature_table_df = feature_table_df.select(projection)

    # Only feature rows observed at or before the entity timestamp qualify.
    join_cond = (
        entity_with_id[entity_event_timestamp_column]
        >= aliased_feature_table_df[feature_event_timestamp_column_with_prefix]
    )
    if max_age:
        # ...and, when requested, no older than max_age[0] seconds.
        join_cond = join_cond & (
            aliased_feature_table_df[feature_event_timestamp_column_with_prefix]
            >= entity_with_id[entity_event_timestamp_column]
            - expr(f"INTERVAL {max_age[0]} seconds")
        )
    for key in entity_keys:
        join_cond = join_cond & (
            entity_with_id[key]
            == aliased_feature_table_df[f"{feature_table_name}__{key}"]
        )

    conditional_join = entity_with_id.join(
        aliased_feature_table_df, join_cond, "leftOuter"
    )
    # The prefixed key columns duplicate the entity keys; drop them.
    for key in entity_keys:
        conditional_join = conditional_join.drop(
            aliased_feature_table_df[f"{feature_table_name}__{key}"]
        )

    # Rank candidates per entity row: newest event first, ties broken by the
    # newest creation time, then keep only the top-ranked row.
    window = Window.partitionBy("_row_nr", *entity_keys).orderBy(
        col(feature_event_timestamp_column_with_prefix).desc(),
        col(feature_created_timestamp_column_with_prefix).desc(),
    )
    filter_most_recent_feature_timestamp = conditional_join.withColumn(
        "_rank", row_number().over(window)
    ).filter(col("_rank") == 1)

    return filter_most_recent_feature_timestamp.select(
        entity_df.columns
        + [
            f"{feature_table_name}__{feature}"
            for feature in feature_list
        ]
    )
    

def retrieve_feature(
    entity_df: DataFrame,
    feature_table_dfs:List[DataFrame],
    feature_lists :List[list],
    feature_table_names:list,
    feature_table_entity_names : List[str],
    max_age=[],
    entity_event_timestamp_column='event_timestamp',
    ) -> DataFrame :
    """Join every feature table onto ``entity_df``, one ``as_of_join`` at a time.

    ``feature_table_dfs``, ``feature_lists`` and ``feature_table_names`` are
    walked in lockstep; the output of each join is the input of the next.
    ``max_age`` and ``entity_event_timestamp_column`` are forwarded unchanged
    to every ``as_of_join`` call.

    Returns the frame after the last join, or ``entity_df`` itself when there
    are no feature tables.
    """
    result_df = entity_df
    table_specs = zip(feature_table_dfs, feature_lists, feature_table_names)

    for table_df, wanted_features, table_name in table_specs:
        result_df = as_of_join(
            result_df,
            feature_table_entity_names,
            table_df,
            wanted_features,
            table_name,
            max_age=max_age,
            entity_event_timestamp_column=entity_event_timestamp_column,
        )

    return result_df



### Retrieve from the Feature Store

In [5]:
#from pyspark.sql.functions import current_timestamp

#### User-defined feature locations and feature lists

In [6]:
# Feature-store name -> object name.
path_dict = {
    'featurestore': 'automobile',
}

# Feature-set name -> columns to retrieve for it.
feature_dict = {
    'featureSET': [
        'symboling',
        'normalized-losses',
        'make',
        'fuel-type',
        'aspiration',
        'num-of-doors',
        'body-style',
        'drive-wheels',
        'engine-location',
        'wheel-base',
        'length',
        'width',
        'height',
        'curb-weight',
        'engine-type',
        'num-of-cylinders',
        'engine-size',
        'fuel-system',
        'bore',
        'stroke',
        'compression-ratio',
        'horsepower',
        'peak-rpm',
        'city-mpg',
        'highway-mpg',
        'price',
    ],
}

In [7]:
# Connector packages requested for the Spark JVM (set before the session is built).
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages '
    'org.apache.hadoop:hadoop-azure:2.7.3,'
    'com.microsoft.azure:azure-storage:2.2.0,'
    'org.apache.hadoop:hadoop-aws:2.7.3 '
    'pyspark-shell'
)

# Local Spark session using all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Register the Azure Blob filesystem implementation and the account key used for wasbs:// paths.
azure_fs_class = "org.apache.hadoop.fs.azure.NativeAzureFileSystem"
spark._jsc.hadoopConfiguration().set("fs.azure", azure_fs_class)
spark.conf.set("fs.wasbs.impl", azure_fs_class)
spark.conf.set(
    'fs.azure.account.key.' + STORAGEACCOUNTNAME + '.blob.core.windows.net',
    STORAGEACCOUNTKEY,
)

In [8]:
# Entity rows: one 'Unique_id' per stored record, stamped with the current time.
entity_df = create_entity_df(
    spark,
    'Unique_id',
    "featurestore",
    STORAGEACCOUNTNAME,
    CONTAINERNAME,
)
# [feature DataFrames, feature-name lists] for every configured feature store.
df_fs_list = create_df_feature(
    spark,
    path_dict,
    feature_dict,
    STORAGEACCOUNTNAME,
    CONTAINERNAME,
)

In [9]:
# Unpack the loaded tables and their feature lists for retrieve_feature().
feature_table_dfs, feature_lists = df_fs_list[0], df_fs_list[1]
# Materialize the names as a list so they no longer track later changes to feature_dict.
feature_table_names = list(feature_dict)
# Key column shared by the entity frame and every feature table.
feature_table_entity_names = ['Unique_id']


In [10]:
# Point-in-time join of every feature table onto the entity rows.
sdf = retrieve_feature(
    entity_df=entity_df,
    feature_table_dfs=feature_table_dfs,
    feature_lists=feature_lists,
    feature_table_names=feature_table_names,
    feature_table_entity_names=feature_table_entity_names,
)
# Collect the joined result into pandas for model training.
pdf = sdf.toPandas()


In [11]:
# The data now lives in the pandas frame `pdf`; the Spark session is no longer needed.
spark.stop()

In [12]:
# Preview the first two retrieved rows.
pdf.head(2)

Unnamed: 0,Unique_id,event_timestamp,featureSET__symboling,featureSET__normalized-losses,featureSET__make,featureSET__fuel-type,featureSET__aspiration,featureSET__num-of-doors,featureSET__body-style,featureSET__drive-wheels,...,featureSET__engine-size,featureSET__fuel-system,featureSET__bore,featureSET__stroke,featureSET__compression-ratio,featureSET__horsepower,featureSET__peak-rpm,featureSET__city-mpg,featureSET__highway-mpg,featureSET__price
0,130,2021-04-21 16:25:11.025,1,104,8,1,0,2,2,1,...,91,1,3,3,9,68,5000,31,38,6795
1,149,2021-04-21 16:25:11.025,0,161,13,0,1,1,3,2,...,152,3,3,3,21,95,4150,28,33,17950


## Model Training

##### Logistic Regression

In [13]:
# Inputs: every retrieved column except the join key, the timestamp and the target.
X = pdf.drop(columns = ['Unique_id','event_timestamp','featureSET__price'])
# Target: the price column.
Y = pdf['featureSET__price']
# Fixed seed so the 70/30 split, and the score derived from it, is reproducible.
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(X, Y, test_size=0.30, random_state=42)

In [14]:
# Fit logistic regression on the training split.
logreg = LogisticRegression(solver='lbfgs', max_iter=110)
logreg.fit(X_train_lr, y_train_lr)
lr = logreg  # fit() returns the estimator itself
# NOTE: accuracy is measured on the training split, expressed in percent.
acc_log = round(100 * logreg.score(X_train_lr, y_train_lr), 2)
acc_log

63.64

##### Random Forest

In [15]:
# Inputs: every retrieved column except the join key, the timestamp and the target.
X = pdf.drop(columns = ['Unique_id','event_timestamp','featureSET__price'])
# Target: the price column.
Y = pdf['featureSET__price']
# Fixed seed so the 70/30 split, and the score derived from it, is reproducible.
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(X, Y, test_size=0.30, random_state=42)

In [16]:
# Fit a 100-tree random forest on the training split.
random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train_rf, y_train_rf)
rf = random_forest  # fit() returns the estimator itself
# NOTE: accuracy is measured on the training split, expressed in percent.
acc_random_forest = round(100 * random_forest.score(X_train_rf, y_train_rf), 2)
acc_random_forest

100.0

##### Gaussian Naive Bayes

In [17]:
# Inputs: every retrieved column except the join key, the timestamp and the target.
X = pdf.drop(columns = ['Unique_id','event_timestamp','featureSET__price'])
# Target: the price column.
Y = pdf['featureSET__price']
# Fixed seed so the 70/30 split, and the score derived from it, is reproducible.
X_train_gnb, X_test_gnb, y_train_gnb, y_test_gnb = train_test_split(X, Y, test_size=0.30, random_state=42)

In [18]:
# Fit Gaussian naive Bayes on the training split.
gaussian = GaussianNB()
gaussian.fit(X_train_gnb, y_train_gnb)
gnb = gaussian  # fit() returns the estimator itself
# NOTE: accuracy is measured on the training split, expressed in percent.
acc_gaussian = round(100 * gaussian.score(X_train_gnb, y_train_gnb), 2)
acc_gaussian

96.5

##### Support Vector Machine


In [19]:
# Inputs: every retrieved column except the join key, the timestamp and the target.
X = pdf.drop(columns = ['Unique_id','event_timestamp','featureSET__price'])
# Target: the price column.
Y = pdf['featureSET__price']
# Fixed seed so the 70/30 split, and the score derived from it, is reproducible.
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(X, Y, test_size=0.30, random_state=42)

In [20]:
# Fit a support vector classifier on the training split.
# No kernel argument is passed, so SVC uses its default kernel despite the variable name.
linear_svc = SVC(gamma='auto')
linear_svc.fit(X_train_svm, y_train_svm)
svm = linear_svc  # fit() returns the estimator itself
# NOTE: accuracy is measured on the training split, expressed in percent.
acc_linear_svc = round(100 * linear_svc.score(X_train_svm, y_train_svm), 2)
acc_linear_svc

99.3

##### Decision Tree


In [21]:
# Inputs: every retrieved column except the join key, the timestamp and the target.
X = pdf.drop(columns = ['Unique_id','event_timestamp','featureSET__price'])
# Target: the price column.
Y = pdf['featureSET__price']
# Fixed seed so the 70/30 split, and the score derived from it, is reproducible.
X_train_dt, X_test_dt, y_train_dt, y_test_dt = train_test_split(X, Y, test_size=0.30, random_state=42)

In [22]:
# Fit a decision tree on the training split.
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train_dt, y_train_dt)
dt = decision_tree  # fit() returns the estimator itself
# NOTE: accuracy is measured on the training split, expressed in percent.
acc_decision_tree = round(100 * decision_tree.score(X_train_dt, y_train_dt), 2)
acc_decision_tree

98.6

##### Perceptron


In [23]:
# Inputs: every retrieved column except the join key, the timestamp and the target.
X = pdf.drop(columns = ['Unique_id','event_timestamp','featureSET__price'])
# Target: the price column.
Y = pdf['featureSET__price']
# Fixed seed so the 70/30 split, and the score derived from it, is reproducible.
X_train_per, X_test_per, y_train_per, y_test_per = train_test_split(X, Y, test_size=0.30, random_state=42)

In [24]:
# Fit a perceptron on the training split.
perceptron = Perceptron()
perceptron.fit(X_train_per, y_train_per)
per = perceptron  # fit() returns the estimator itself
# NOTE: accuracy is measured on the training split, expressed in percent.
acc_perceptron = round(100 * perceptron.score(X_train_per, y_train_per), 2)
acc_perceptron

2.1

##### K Nearest Neighbour


In [25]:
# Inputs: every retrieved column except the join key, the timestamp and the target.
X = pdf.drop(columns = ['Unique_id','event_timestamp','featureSET__price'])
# Target: the price column.
Y = pdf['featureSET__price']
# Fixed seed so the 70/30 split, and the score derived from it, is reproducible.
X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(X, Y, test_size=0.30, random_state=42)

In [26]:
# Fit k-nearest neighbours on the training split.
knearestneighbour = KNeighborsClassifier()
knearestneighbour.fit(X_train_knn, y_train_knn)
knn = knearestneighbour  # fit() returns the estimator itself
# NOTE: accuracy is measured on the training split, expressed in percent.
acc_knearest = round(100 * knearestneighbour.score(X_train_knn, y_train_knn), 2)
acc_knearest

23.78

##### Stochastic Gradient Descent


In [27]:
# Inputs: every retrieved column except the join key, the timestamp and the target.
X = pdf.drop(columns = ['Unique_id','event_timestamp','featureSET__price'])
# Target: the price column.
Y = pdf['featureSET__price']
# Fixed seed so the 70/30 split, and the score derived from it, is reproducible.
X_train_sgd, X_test_sgd, y_train_sgd, y_test_sgd = train_test_split(X, Y, test_size=0.30, random_state=42)

In [28]:
# Fit an SGD classifier on the training split.
stochasticgrad = SGDClassifier()
stochasticgrad.fit(X_train_sgd, y_train_sgd)
sgd = stochasticgrad  # fit() returns the estimator itself
# NOTE: accuracy is measured on the training split, expressed in percent.
acc_sgd = round(100 * stochasticgrad.score(X_train_sgd, y_train_sgd), 2)
acc_sgd

1.4

##### Gradient Boosting Classifier


In [29]:
# Inputs: every retrieved column except the join key, the timestamp and the target.
X = pdf.drop(columns = ['Unique_id','event_timestamp','featureSET__price'])
# Target: the price column.
Y = pdf['featureSET__price']
# Fixed seed so the 70/30 split, and the score derived from it, is reproducible.
X_train_gbc, X_test_gbc, y_train_gbc, y_test_gbc = train_test_split(X, Y, test_size=0.30, random_state=42)

In [30]:
# Fit gradient boosting on the training split.
gradientboostingclassifier = GradientBoostingClassifier()
gradientboostingclassifier.fit(X_train_gbc, y_train_gbc)
gbc = gradientboostingclassifier  # fit() returns the estimator itself
# NOTE: accuracy is measured on the training split, expressed in percent.
acc_gbc = round(100 * gradientboostingclassifier.score(X_train_gbc, y_train_gbc), 2)
acc_gbc

97.9

## Evaluating the Models

In [31]:
# One row per model: display name, its score (training accuracy, %) and the fitted estimator.
results = pd.DataFrame(
    {
        'Model': [
            'Logistic Regression',
            'Random Forest',
            'Gaussian Naive Bayes',
            'Support Vector Machine',
            'Decision Tree',
            'Preceptron',
            'KNearest Neighbour',
            'Stochastic Gradient Descent',
            'Gradient Boosting Classifier',
        ],
        'Score': [
            acc_log,
            acc_random_forest,
            acc_gaussian,
            acc_linear_svc,
            acc_decision_tree,
            acc_perceptron,
            acc_knearest,
            acc_sgd,
            acc_gbc,
        ],
        'Model_abb': [lr, rf, gnb, svm, dt, per, knn, sgd, gbc],
    }
)
# Highest score first, with the score itself used as the index.
result_df = results.sort_values(by='Score', ascending=False).set_index('Score')
result_df

Unnamed: 0_level_0,Model,Model_abb
Score,Unnamed: 1_level_1,Unnamed: 2_level_1
100.0,Random Forest,"(DecisionTreeClassifier(max_features='auto', r..."
99.3,Support Vector Machine,SVC(gamma='auto')
98.6,Decision Tree,DecisionTreeClassifier()
97.9,Gradient Boosting Classifier,([DecisionTreeRegressor(criterion='friedman_ms...
96.5,Gaussian Naive Bayes,GaussianNB()
63.64,Logistic Regression,LogisticRegression(max_iter=110)
23.78,KNearest Neighbour,KNeighborsClassifier()
2.1,Preceptron,Perceptron()
1.4,Stochastic Gradient Descent,SGDClassifier()


In [32]:
# result_df is sorted by descending score, so row 0 holds the top-scoring estimator.
# NOTE(review): the scores are training-set accuracies, so this favours whichever
# model fits the training data most closely — confirm this is the intended criterion.
best_model = result_df['Model_abb'].iloc[0]
best_model

RandomForestClassifier()

In [33]:
# Serialize the selected estimator to a local file named 'model.joblib'.
joblib.dump(best_model, 'model.joblib')
# print(get_minio().fput_object(MINIO_MODEL_BUCKET, f"{INCOME_MODEL_PATH}/model.joblib", 'model.joblib'))

['model.joblib']

In [34]:
# Reload the saved file to check that the serialized model can be read back.
Model_job = joblib.load("model.joblib")
Model_job

RandomForestClassifier()

In [35]:
# Show the class of the reloaded object.
type(Model_job)

sklearn.ensemble._forest.RandomForestClassifier

In [41]:
# Show the container type of the random-forest test features.
type(X_test_rf)

pandas.core.frame.DataFrame

In [42]:
# Display the held-out feature rows from the random-forest split.
X_test_rf

Unnamed: 0,featureSET__symboling,featureSET__normalized-losses,featureSET__make,featureSET__fuel-type,featureSET__aspiration,featureSET__num-of-doors,featureSET__body-style,featureSET__drive-wheels,featureSET__engine-location,featureSET__wheel-base,...,featureSET__num-of-cylinders,featureSET__engine-size,featureSET__fuel-system,featureSET__bore,featureSET__stroke,featureSET__compression-ratio,featureSET__horsepower,featureSET__peak-rpm,featureSET__city-mpg,featureSET__highway-mpg
48,1,168,19,1,0,2,3,2,0,94,...,2,98,1,3,3,9,70,4800,29,34
7,3,122,11,1,1,2,2,1,0,95,...,2,156,6,3,3,7,145,5000,19,24
14,0,122,20,1,0,1,3,1,0,100,...,1,136,5,3,3,8,110,5500,19,24
75,1,104,8,1,0,2,2,1,0,93,...,2,91,1,3,3,9,68,5000,31,38
163,0,110,5,1,0,1,3,1,0,96,...,2,92,0,2,3,9,76,6000,30,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,1,129,8,1,0,2,2,1,0,98,...,2,122,1,3,3,8,84,4800,26,32
38,0,161,13,1,1,1,3,2,0,108,...,2,134,5,3,3,7,142,5600,18,24
128,3,122,11,1,1,2,2,1,0,95,...,2,156,6,3,3,7,145,5000,19,24
157,2,134,19,1,0,2,2,2,0,98,...,2,146,5,3,3,9,116,4800,24,30


In [43]:
# Take the test row at position 55 and reshape it to (1, n_features),
# i.e. a batch containing a single sample.
pred = X_test_rf.values[55].reshape(1, -1)
pred

array([[   0,  188,    2,    1,    0,    2,    3,    2,    0,  101,  176,
          64,   54, 2710,    3,    3,  164,    5,    3,    3,    9,  121,
        4250,   21,   28]])

In [44]:
# Run the reloaded model on the single-sample batch.
Model_job.predict(pred)

array([21105])

In [45]:
# Upload the local 'model.joblib' to the MinIO bucket under INCOME_MODEL_PATH and print the result object.
print(get_minio().fput_object(MINIO_MODEL_BUCKET, f"{INCOME_MODEL_PATH}/model.joblib", 'model.joblib'))

<minio.helpers.ObjectWriteResult object at 0x7f0df7af4150>
