In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import SGDRegressor
from custom_transformer import DictToDFTransformer
import joblib

In [2]:
# Load the raw training table; the path is relative to the working directory.
train_data = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# Columns of train.csv used as model inputs (order is preserved as written).
features = [
    "MSZoning",
    "LotArea",
    "LotShape",
    "Utilities",
    "YrSold",
    "Neighborhood",
    "OverallQual",
    "YearBuilt",
    "SaleType",
    "GarageArea",
]

In [4]:
# Model inputs: only the selected feature columns.
df = train_data.loc[:, features]

# Target: SalePrice on the log1p scale.
y = train_data["SalePrice"].pipe(np.log1p)

# Split the inputs by dtype; these two frames are only used for their column
# names when wiring up the preprocessing steps.
categorical_features = df.select_dtypes(include=object)
numerical_features = df.select_dtypes(exclude=object)

In [5]:
# scikit-learn renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# (deprecated in 1.2, removed in 1.4). Try the current spelling first and fall
# back to the legacy one, so this cell runs on both old and new releases.
try:
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# End-to-end pipeline:
#   1. "dicttodf"   - DictToDFTransformer from custom_transformer (not shown here;
#                     presumably turns list-of-dict requests into a DataFrame --
#                     TODO confirm against its source).
#   2. "preprocess" - numerical columns: mean-impute, then standardise;
#                     categorical columns: most-frequent-impute, then one-hot
#                     encode (categories unseen at fit time are ignored).
#   3. "regressor"  - SGDRegressor with a fixed seed and adaptive learning rate.
# Column lists are sorted so the transformer's column order is deterministic.
p = Pipeline(
    [
        ("dicttodf", DictToDFTransformer()),
        (
            "preprocess",
            ColumnTransformer(
                [
                    (
                        "numerical",
                        make_pipeline(
                            SimpleImputer(strategy="mean"),
                            StandardScaler(),
                        ),
                        sorted(numerical_features.columns),
                    ),
                    (
                        "categorical",
                        make_pipeline(
                            SimpleImputer(strategy="most_frequent"),
                            one_hot_encoder,
                        ),
                        sorted(categorical_features.columns),
                    ),
                ]
            ),
        ),
        ("regressor", SGDRegressor(random_state=666, learning_rate="adaptive")),
    ]
)

In [6]:
# Train the whole pipeline on the selected features and the log-scale target.
model = p.fit(X=df, y=y)

In [7]:
# Display the fitted pipeline so its steps and column assignments can be checked.
model

Pipeline(steps=[('dicttodf',
                 <custom_transformer.DictToDFTransformer object at 0x7fa5120c8e20>),
                ('preprocess',
                 ColumnTransformer(transformers=[('numerical',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer()),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  ['GarageArea', 'LotArea',
                                                   'OverallQual', 'YearBuilt',
                                                   'YrSold']),
                                                 ('categorical',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(st

In [8]:
# Persist the fitted pipeline next to the notebook.
# NOTE(review): the pipeline embeds DictToDFTransformer, so `custom_transformer`
# presumably has to be importable wherever this file is loaded -- confirm.
joblib.dump(value=model, filename="model.joblib")

['model.joblib']

In [9]:
# Column order used for the sample payload: sorted numerical names first,
# then sorted categorical names.
cols = [
    *sorted(numerical_features.columns),
    *sorted(categorical_features.columns),
]
cols

['GarageArea',
 'LotArea',
 'OverallQual',
 'YearBuilt',
 'YrSold',
 'LotShape',
 'MSZoning',
 'Neighborhood',
 'SaleType',
 'Utilities']

In [10]:
# Build a one-record request payload (a list holding a single dict) from the
# first training row, with keys in `cols` order.
first_row = df.loc[:, cols].iloc[:1]
sample_request = first_row.to_dict(orient="records")
sample_request

[{'GarageArea': 548,
  'LotArea': 8450,
  'OverallQual': 7,
  'YearBuilt': 2003,
  'YrSold': 2008,
  'LotShape': 'Reg',
  'MSZoning': 'RL',
  'Neighborhood': 'CollgCr',
  'SaleType': 'WD',
  'Utilities': 'AllPub'}]

In [11]:
# Hand-written payload holding the same values as `sample_request`, but with the
# keys in the original `features` order instead of the sorted `cols` order.
request = [
    {
        'MSZoning': 'RL',
        'LotArea': 8450,
        'LotShape': 'Reg',
        'Utilities': 'AllPub',
        'YrSold': 2008,
        'Neighborhood': 'CollgCr',
        'OverallQual': 7,
        'YearBuilt': 2003,
        'SaleType': 'WD',
        'GarageArea': 548,
    }
]
response = model.predict(sample_request)

# `request` was previously built but never used. Use it to check that the
# prediction does not depend on the key order of the incoming dict.
np.testing.assert_allclose(model.predict(request), response)

In [12]:
# The target was fitted as log1p(SalePrice), so this prediction is on the log
# scale; np.expm1(response) would map it back to the price scale.
response

array([12.20283282])