<a href="https://colab.research.google.com/github/vectice/vectice-examples/blob/master/Notebooks/MLflow/KC_housing_prices/KC_housing_prices.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook walks you through the steps to predict King County housing prices using linear regression with the Vectice MLflow integration.

## Install and import the required packages


In [3]:
!pip install -q vectice[github]
!pip install -q fsspec
!pip install -q gcsfs 
!pip install -q MLflow


In [16]:
import logging
from math import sqrt
from vectice import Vectice
from vectice.models import JobType
import os
import mlflow
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split


## Upload the GCS service account file to get the data from GCS

In [4]:
# Upload the GCS service-account JSON key used to read the data from GCS.
# It can be downloaded from the Tutorial page (readerKey.json).
# NOTE: google.colab is provided by the Colab runtime, so this cell is Colab-only.
from google.colab import files
uploaded = files.upload()

Saving readerKey.json to readerKey.json


## Connect to your Vectice project

In [8]:
# Project token copied from the Vectice UI (replace the placeholder value)
PROJECT_TOKEN = "Project Token"

# Environment settings read by the Vectice client and by the GCS file system.
# Replace the placeholder API token with your own and never commit a real one.
os.environ.update({
    # Vectice API endpoint and API token
    'VECTICE_API_ENDPOINT': 'https://beta.vectice.com',
    'VECTICE_API_TOKEN': 'API_Token',
    # Path to the uploaded Google Cloud Storage service-account JSON
    'GOOGLE_APPLICATION_CREDENTIALS': 'readerKey.json',
})

## Get the data from GCS

In [9]:
# Load the housing CSV straight from the tutorial's GCS bucket,
# then show the first five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="gs://vectice_tutorial/kc_house_data.csv")
df.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


## Creating the inputs of our run

In [17]:
def create_inputs():
    """Build the dataset versions used as the inputs of the run.

    Returns:
        A list with one Vectice dataset version per parent dataset, in the
        order "kc_house_data", "cleaned_kc_house_data".
    """
    parent_names = ("kc_house_data", "cleaned_kc_house_data")
    return [
        Vectice.create_dataset_version().with_parent_name(parent_name)
        for parent_name in parent_names
    ]

## Data preparation

In [18]:
def prepare_data(
    data_path: str = "gs://vectice_tutorial/kc_house_data.csv",
    test_size: float = 0.2,
    random_state: int = 42,
):
    """Read the housing data and split it into train and test sets.

    Args:
        data_path: Location of the CSV file, in any form ``pd.read_csv``
            accepts. Defaults to the tutorial file in the GCS bucket.
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.
            Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)`` where the ``X_*``
        frames hold every column except "id", "date" and "price", and the
        ``y_*`` series hold the "price" column.

    Raises:
        KeyError: If the "id", "date" or "price" column is missing.
    """
    housing = pd.read_csv(data_path)

    # "id" and "date" are excluded from the features; "price" is the target.
    features = housing.drop(columns=["id", "date", "price"])
    target = housing["price"]

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

## Modeling

In [None]:
X_train, X_test, y_train, y_test = prepare_data()

# Vectice MLflow adapter sync-up usage.
# Point MLflow at the tracking server.
mlflow.set_tracking_uri("https://mlflow-beta.vectice.com")
# NOTE(review): this name is never passed to mlflow.set_experiment(), so it has
# no effect on where the run is logged — confirm whether that is intended.
MLFLOW_EXPERIMENT_NAME = "LinearRegression"

# Use MLflow as usual
with mlflow.start_run() as run:
    reg = LinearRegression().fit(X_train, y_train)

    y_pred = reg.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred), in that order.
    MAE = mean_absolute_error(y_test, y_pred)
    print("Mean Absolute Error:", MAE)
    RMSE = sqrt(mean_squared_error(y_test, y_pred))
    print("Root Mean Square Error:", RMSE)

    mlflow.log_param("algorithm", "linear regression")
    mlflow.log_metric("MAE", MAE)
    mlflow.log_metric("RMSE", RMSE)

# Fetch the run only after the `with` block has ended it, so the snapshot
# handed to Vectice describes the finished run rather than one in progress.
run_data = mlflow.get_run(run_id=run.info.run_id)

# Use the PROJECT_TOKEN defined with the other connection settings instead of
# embedding a token literal in the notebook.
Vectice.save_after_run(project_token=PROJECT_TOKEN, run=run_data, lib="MLflow")

In [None]:
# If an MLflow run is still active and you need to end it, run this cell.
mlflow.end_run()