sklearnserver: ignore converting instances into np.array (#1972)

* sklearnserver: ignore converting instances into np.array Signed-off-by: Suresh Nakkeran <suresh.n@ideas2it.com> * sklearnserver: added a test for multi datatype Signed-off-by: Suresh Nakkeran <suresh.n@ideas2it.com> * pylint and pybuild issue fix Signed-off-by: Suresh Nakkeran <suresh.n@ideas2it.com> * move sklearn multi datatype model build scripts to docs Signed-off-by: Suresh Nakkeran <suresh.n@ideas2it.com> * 1. added isvc example for sklearn mixedtype model 2. multi-datatype renamed to mixedtype Signed-off-by: Suresh Nakkeran <suresh.n@ideas2it.com>
kserve · Jan 13, 2022 · c666680 · c666680
1 parent 343bae6
commit c666680
Show file tree

Hide file tree

Showing 12 changed files with 1,839 additions and 9 deletions.
diff --git a/docs/samples/v1beta1/sklearn/v1/sklearn-mixedtype-model/__init__.py b/docs/samples/v1beta1/sklearn/v1/sklearn-mixedtype-model/__init__.py
diff --git a/docs/samples/v1beta1/sklearn/v1/sklearn-mixedtype-model/custom_transformer.py b/docs/samples/v1beta1/sklearn/v1/sklearn-mixedtype-model/custom_transformer.py
@@ -0,0 +1,11 @@
+import pandas as pd
+from sklearn.base import TransformerMixin
+
+
+class DictToDFTransformer(TransformerMixin):
+
+    def transform(self, X, y=None):
+        return pd.DataFrame(X)
+
+    def fit(self, X, y=None):
+        return self
diff --git a/docs/samples/v1beta1/sklearn/v1/sklearn-mixedtype-model/mixedtype-input.json b/docs/samples/v1beta1/sklearn/v1/sklearn-mixedtype-model/mixedtype-input.json
@@ -0,0 +1,5 @@
+{
+    "instances": [
+        {"MSZoning": "RL", "LotArea": 8450, "LotShape": "Reg", "Utilities": "AllPub", "YrSold": 2008, "Neighborhood": "CollgCr", "OverallQual": 7, "YearBuilt": 2003, "SaleType": "WD", "GarageArea": 548}
+    ]
+}
diff --git a/docs/samples/v1beta1/sklearn/v1/sklearn-mixedtype-model/sklearn-mixedtype-model.ipynb b/docs/samples/v1beta1/sklearn/v1/sklearn-mixedtype-model/sklearn-mixedtype-model.ipynb
@@ -0,0 +1,299 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "58b1517b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sklearn.impute import SimpleImputer\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.pipeline import Pipeline, make_pipeline\n",
+    "from sklearn.compose import ColumnTransformer\n",
+    "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
+    "from sklearn.linear_model import SGDRegressor\n",
+    "from custom_transformer import DictToDFTransformer\n",
+    "import joblib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "137b49fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_data = pd.read_csv(\"train.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "7f091668",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "features = [\"MSZoning\",\n",
+    " \"LotArea\",\n",
+    " \"LotShape\",\n",
+    " \"Utilities\",\n",
+    " \"YrSold\",\n",
+    " \"Neighborhood\",\n",
+    " \"OverallQual\",\n",
+    " \"YearBuilt\",\n",
+    " \"SaleType\",\n",
+    " \"GarageArea\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "66be31ab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = train_data[features]\n",
+    "y = np.log1p(train_data[\"SalePrice\"])\n",
+    "categorical_features = df.select_dtypes(object)\n",
+    "numerical_features = df.select_dtypes(exclude=object)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "23eae04b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p = Pipeline(\n",
+    "        [\n",
+    "            (\"dicttodf\", DictToDFTransformer()),\n",
+    "            (\n",
+    "                \"preprocess\",\n",
+    "                ColumnTransformer(\n",
+    "                    [\n",
+    "                        (\n",
+    "                            \"numerical\",\n",
+    "                            make_pipeline(\n",
+    "                                SimpleImputer(strategy=\"mean\"),\n",
+    "                                StandardScaler(),\n",
+    "                            ),\n",
+    "                            sorted(numerical_features.columns),\n",
+    "                        ),\n",
+    "                        (\n",
+    "                            \"categorical\",\n",
+    "                            make_pipeline(\n",
+    "                                SimpleImputer(strategy=\"most_frequent\"),\n",
+    "                                OneHotEncoder(handle_unknown=\"ignore\", sparse=False),\n",
+    "                            ),\n",
+    "                            sorted(categorical_features.columns),\n",
+    "                        ),\n",
+    "                    ]\n",
+    "                ),\n",
+    "            ),\n",
+    "            (\"regressor\", SGDRegressor(random_state=666, learning_rate=\"adaptive\")),\n",
+    "        ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "3f6a55b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = p.fit(df, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "c198c543",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Pipeline(steps=[('dicttodf',\n",
+       "                 <custom_transformer.DictToDFTransformer object at 0x7fa5120c8e20>),\n",
+       "                ('preprocess',\n",
+       "                 ColumnTransformer(transformers=[('numerical',\n",
+       "                                                  Pipeline(steps=[('simpleimputer',\n",
+       "                                                                   SimpleImputer()),\n",
+       "                                                                  ('standardscaler',\n",
+       "                                                                   StandardScaler())]),\n",
+       "                                                  ['GarageArea', 'LotArea',\n",
+       "                                                   'OverallQual', 'YearBuilt',\n",
+       "                                                   'YrSold']),\n",
+       "                                                 ('categorical',\n",
+       "                                                  Pipeline(steps=[('simpleimputer',\n",
+       "                                                                   SimpleImputer(strategy='most_frequent')),\n",
+       "                                                                  ('onehotencoder',\n",
+       "                                                                   OneHotEncoder(handle_unknown='ignore',\n",
+       "                                                                                 sparse=False))]),\n",
+       "                                                  ['LotShape', 'MSZoning',\n",
+       "                                                   'Neighborhood', 'SaleType',\n",
+       "                                                   'Utilities'])])),\n",
+       "                ('regressor',\n",
+       "                 SGDRegressor(learning_rate='adaptive', random_state=666))])"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "5a96ee5e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['model.joblib']"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "joblib.dump(model, \"model.joblib\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "f3cdeb3f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['GarageArea',\n",
+       " 'LotArea',\n",
+       " 'OverallQual',\n",
+       " 'YearBuilt',\n",
+       " 'YrSold',\n",
+       " 'LotShape',\n",
+       " 'MSZoning',\n",
+       " 'Neighborhood',\n",
+       " 'SaleType',\n",
+       " 'Utilities']"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "cols = sorted(numerical_features.columns) + sorted(categorical_features.columns)\n",
+    "cols"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "c2093efa",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[{'GarageArea': 548,\n",
+       "  'LotArea': 8450,\n",
+       "  'OverallQual': 7,\n",
+       "  'YearBuilt': 2003,\n",
+       "  'YrSold': 2008,\n",
+       "  'LotShape': 'Reg',\n",
+       "  'MSZoning': 'RL',\n",
+       "  'Neighborhood': 'CollgCr',\n",
+       "  'SaleType': 'WD',\n",
+       "  'Utilities': 'AllPub'}]"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sample_request = df[cols].head(1).to_dict('records')\n",
+    "sample_request"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "b3d18862",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "request = [{'MSZoning': 'RL', 'LotArea': 8450, 'LotShape': 'Reg', 'Utilities': 'AllPub', 'YrSold': 2008, 'Neighborhood': 'CollgCr', 'OverallQual': 7, 'YearBuilt': 2003, 'SaleType': 'WD', 'GarageArea': 548}]\n",
+    "response = model.predict(sample_request)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "3462c4e7",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([12.20283282])"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d0d82a16",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/samples/v1beta1/sklearn/v1/sklearn-mixedtype-model/sklearn-mixedtype.yaml b/docs/samples/v1beta1/sklearn/v1/sklearn-mixedtype-model/sklearn-mixedtype.yaml
@@ -0,0 +1,8 @@
+apiVersion: "serving.kserve.io/v1beta1"
+kind: "InferenceService"
+metadata:
+  name: "sklearn-mixedtype"
+spec:
+  predictor:
+    sklearn:
+      storageUri: "gs://kfserving-examples/models/sklearn/1.0/mixedtype"