In [1]:
import os
import tarfile
import urllib
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import mlflow

In [2]:
# Local directory the archive is downloaded to and unpacked in.
HOUSING_PATH = os.path.join("datasets", "housing")
# Remote base URL, and the full URL of the housing archive underneath it.
DOWNLOAD_ROOT = "https://raw.githubusercontent.com/ageron/handson-ml2/master/"
HOUSING_URL = "".join((DOWNLOAD_ROOT, "datasets/housing/housing.tgz"))

In [3]:
def fetch_housing_data(housing_url=HOUSING_URL, housing_path=HOUSING_PATH):
    """Download the housing archive and unpack it into ``housing_path``.

    Parameters
    ----------
    housing_url : str
        URL of the ``.tgz`` archive to download.
    housing_path : str
        Directory that receives both ``housing.tgz`` and its extracted
        contents. It is created if it does not exist.

    Raises
    ------
    urllib.error.URLError
        If the download fails.
    tarfile.TarError
        If the downloaded file cannot be read or safely extracted.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded; import it explicitly so the call below cannot fail with
    # AttributeError on a fresh kernel.
    import urllib.request

    os.makedirs(housing_path, exist_ok=True)
    tgz_path = os.path.join(housing_path, "housing.tgz")
    urllib.request.urlretrieve(housing_url, tgz_path)
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(tgz_path) as housing_tgz:
        if hasattr(tarfile, "data_filter"):
            # The archive comes from the network: refuse members that would
            # escape `housing_path` when the runtime supports extraction filters.
            housing_tgz.extractall(path=housing_path, filter="data")
        else:
            housing_tgz.extractall(path=housing_path)

In [4]:
# Download only when the extracted CSV is missing, so re-running the notebook
# does not hit the network (and overwrite local files) every time.
if not os.path.isfile(os.path.join(HOUSING_PATH, "housing.csv")):
    fetch_housing_data()

In [5]:
def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` located in ``housing_path`` into a DataFrame.

    Parameters
    ----------
    housing_path : str
        Directory containing ``housing.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    return pd.read_csv(os.path.join(housing_path, "housing.csv"))

In [6]:
# Raw housing table as read from housing.csv; later cells pass it to prepare_data().
housing = load_housing_data()

In [7]:
# Column positions read by CombinedAttributesAdder.transform from its input array.
rooms_ix = 3
bedrooms_ix = 4
population_ix = 5
households_ix = 6

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Transformer that appends ratio columns computed from existing columns.

    The input is indexed as a 2-D array using the module-level positions
    ``rooms_ix``, ``bedrooms_ix``, ``population_ix`` and ``households_ix``.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        When true, a third column (bedrooms divided by rooms) is appended
        after the two per-household ratios.
    """

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Nothing is learned from ``X``; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return ``X`` with the derived ratio columns appended on the right.

        Appended columns, in order: rooms per household, population per
        household and, if ``add_bedrooms_per_room`` is set, bedrooms per room.
        """
        households = X[:, households_ix]
        rooms = X[:, rooms_ix]
        derived = [rooms / households, X[:, population_ix] / households]
        if self.add_bedrooms_per_room:
            derived.append(X[:, bedrooms_ix] / rooms)
        # Passing a tuple to np.c_ is the same as listing the items in the subscript.
        return np.c_[(X, *derived)]

In [8]:
# Numeric preprocessing chain: median imputation, then the derived ratio
# columns, then standardisation.
num_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

In [9]:
def prepare_data(data):
    """Build the stratified training set and run it through preprocessing.

    The rows are split 80/20 with a stratified shuffle on income buckets
    derived from ``median_income`` (``random_state=42``). Only the training
    portion is used: its ``median_house_value`` column becomes the labels and
    the remaining columns are transformed by a ``ColumnTransformer`` that
    applies ``num_pipeline`` to every column except ``ocean_proximity`` and a
    ``OneHotEncoder`` to ``ocean_proximity``.

    Unlike the previous implementation, ``data`` is not modified (no
    ``income_cat`` column is added to it) and the module-level
    ``num_pipeline`` is no longer fitted as a side effect; the
    ``ColumnTransformer`` does its own fitting.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``median_income``, ``median_house_value`` and
        ``ocean_proximity``; all other columns are treated as numeric.

    Returns
    -------
    data_prepared
        Output of ``ColumnTransformer.fit_transform`` on the training features.
    data_labels : pandas.Series
        ``median_house_value`` of the training rows.
    """
    # The income buckets exist only to drive the stratification, so keep them
    # in a separate Series instead of writing a column into the caller's frame.
    income_cat = pd.cut(data["median_income"],
                        bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                        labels=[1, 2, 3, 4, 5])
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    # split() yields positional indices, so rows must be selected with .iloc;
    # .loc only happened to work when the index was the default 0..n-1 range.
    train_index, _ = next(split.split(data, income_cat))
    strat_train_set = data.iloc[train_index]

    data_labels = strat_train_set["median_house_value"].copy()
    features = strat_train_set.drop(columns="median_house_value")
    # Tolerate frames that still carry an `income_cat` column (e.g. one left
    # behind by an earlier version of this function).
    features = features.drop(columns="income_cat", errors="ignore")

    cat_attribs = ["ocean_proximity"]
    num_attribs = [col for col in features.columns if col not in cat_attribs]
    full_pipeline = ColumnTransformer([
            ("num", num_pipeline, num_attribs),
            ("cat", OneHotEncoder(), cat_attribs),
        ])
    data_prepared = full_pipeline.fit_transform(features)
    return data_prepared, data_labels

In [10]:
def train_model(data_prepared, data_labels):
    """Fit a linear regression and score it with 10-fold cross-validation.

    Parameters
    ----------
    data_prepared
        Feature matrix to train on.
    data_labels
        Target values aligned with ``data_prepared``.

    Returns
    -------
    lin_rmse_scores : numpy.ndarray
        One RMSE per fold, i.e. the square root of the negated
        ``neg_mean_squared_error`` scores.
    lin_reg : LinearRegression
        The estimator fitted on all of ``data_prepared``.
    """
    lin_reg = LinearRegression().fit(data_prepared, data_labels)
    neg_mse_per_fold = cross_val_score(
        lin_reg, data_prepared, data_labels,
        scoring="neg_mean_squared_error", cv=10,
    )
    # The scorer reports negated MSE, so flip the sign before taking the root.
    return np.sqrt(-neg_mse_per_fold), lin_reg

In [11]:
# Point MLflow at the tracking server; the final expression echoes the active
# URI as the cell output so the setting can be checked at a glance.
remote_server_uri = "http://localhost:5000"
mlflow.set_tracking_uri(remote_server_uri)
mlflow.get_tracking_uri()

'http://localhost:5000'

In [12]:
# Select the experiment under which the runs started below are recorded.
exp_name = "House_price_predictions"
mlflow.set_experiment(exp_name)

<Experiment: artifact_location='./mlruns/2', experiment_id='2', lifecycle_stage='active', name='House_price_predictions', tags={}>

In [None]:
# One parent run wraps two nested child runs: data preparation, then training.
with mlflow.start_run(run_name='PARENT_RUN') as parent_run:
    mlflow.log_param("parent", "yes")
    # Child run 1: build the training matrix and record the preprocessing choices.
    with mlflow.start_run(run_name='DATA_PREPARATION', nested=True) as data_preparation:
        housing_prepared, housing_labels = prepare_data(housing)
        mlflow.log_params({"Scaler":"Standard Scaler","Imputer":"Simple Imputer","Categorical Encoder":"One Hot Encoder"})
        # HOUSING_PATH is the local dataset directory used by the download/load cells.
        mlflow.log_artifact(HOUSING_PATH)
    # Child run 2: train, log the mean and spread of the per-fold RMSE scores,
    # and store the fitted model under the artifact path "model".
    with mlflow.start_run(run_name='MODEL_TRAINING', nested=True) as model_training:
        model_scores, model = train_model(housing_prepared, housing_labels)
        mlflow.log_metrics({"Mean":model_scores.mean(),"Standard_deviation":model_scores.std()})
        mlflow.sklearn.log_model(model, "model")
        # Printed while MODEL_TRAINING is still the active run.
        print("Save to {}".format(mlflow.get_artifact_uri()))

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh()

All git commands will error until this is rectified.

$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - error|e|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

