In [1]:
import pandas as pd
import numpy as np

# Two hand-written sample rows: the "id"/"host_id" identifiers plus 31 numeric
# columns. Some entries are deliberately np.nan (availability_90,
# review_scores_rating, review_scores_checkin, minimum_minimum_nights).
data_dict = {
    "id": [2, 3],
    "host_id": [2992450.0, 5651579.0],
    "accommodates": [4.0, 2.0],
    "bathrooms": [1.0, 1.0],
    "bedrooms": [2.0, 1.0],
    "beds": [2.2361068367009524, 0.0],
    "availability_30": [0.0, 11.0],
    "availability_60": [0.0, 15.0],
    "availability_90": [0.0, np.nan],
    "availability_365": [36.0, 15.0],
    "host_response_rate": [100.0, 100.0],
    "host_acceptance_rate": [100.0, 99.0],
    "host_listings_count": [1.0, 2.0],
    "host_total_listings_count": [5.0, 4.0],
    "number_of_reviews": [9.0, 64.12143150995708],
    "number_of_reviews_ltm": [0.0, 28.0],
    "number_of_reviews_l30d": [0.0, 2.0],
    "review_scores_rating": [np.nan, 4.51],
    "review_scores_accuracy": [4.879706019274778, 4.61],
    "review_scores_cleanliness": [4.812461133714737, 4.45],
    "review_scores_checkin": [np.nan, 4.82],
    "review_scores_communication": [4.944250346120556, 4.87],
    "review_scores_location": [4.867251870622564, 4.79],
    "review_scores_value": [4.790738114906661, 4.64],
    "reviews_per_month": [0.08, 3.13],
    "minimum_nights": [28.0, 1.0],
    "maximum_nights": [1125.0, 45.0],
    "minimum_minimum_nights": [28.0, np.nan],
    "maximum_minimum_nights": [28.0, 2.0],
    "minimum_maximum_nights": [1125.0, 1125.0],
    "maximum_maximum_nights": [1125.0, 1125.0],
    "minimum_nights_avg_ntm": [28.0, 2.0],
    "maximum_nights_avg_ntm": [1125.0, 1125.0]
}

# One DataFrame row per list position in data_dict.
data = pd.DataFrame(data_dict)

# Ensure all columns are of type float ("id" is the only column given as ints).
data = data.astype(float)

In [1]:
import pandas as pd
import numpy as np
import os
from sqlalchemy import create_engine
from concurrent.futures import ThreadPoolExecutor

from feast import FeatureStore
import mlflow
import mlflow.sklearn
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema
from mlflow.types.schema import ParamSchema
from mlflow.types.schema import ParamSpec
from mlflow.types.schema import ColSpec

from sklearn.preprocessing import OrdinalEncoder, RobustScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

from utils.model.feature_vars import listing_features, host_features, review_features, fact_features
from utils.model.feast import query_data, get_historical_features
from utils.model.checking import df_description
from utils.model.transforming import yeo_johnson_transforming
from utils.model.imputation import missing_data_handling
from utils.mlflow.creating import create_mlflow_experiment

# Connection settings for the Postgres database queried by data_extraction().
# Every value can be overridden through an environment variable so that real
# credentials do not have to be committed with the notebook; the fallbacks are
# the original local development values.
db_config = {
  'user': os.environ.get('FEAST_DB_USER', 'admin'),
  'password': os.environ.get('FEAST_DB_PASSWORD', 'admin123'),
  'host': os.environ.get('FEAST_DB_HOST', 'feast_postgres'),
  'port': os.environ.get('FEAST_DB_PORT', '5432'),
  'database': os.environ.get('FEAST_DB_NAME', 'feast_postgres')
}

def data_extraction():
  """Pull entity keys from Postgres, fetch their features from Feast and merge them.

  Steps:
    1. Run the four entity queries (listing, host, review, fact) concurrently.
    2. Pass each result to ``get_historical_features`` with the matching feature list.
    3. Drop ``event_timestamp`` from every frame and left-join them onto the
       listing frame (``host_id`` for hosts, ``id`` for reviews and facts),
       keeping one row per ``id`` after each join.
    4. Print the frame lengths and write the merged frame to ``./data.csv``.

  Returns:
    The merged pandas DataFrame.
  """
  # Local import: only needed here to build a safe connection URL.
  from urllib.parse import quote

  fs = FeatureStore(repo_path="./feature_repo")

  # Percent-encode the credentials so characters such as '@', ':' or '/' in the
  # user name or password cannot corrupt the connection URL.
  user = quote(str(db_config['user']), safe="")
  password = quote(str(db_config['password']), safe="")
  connection_string = f"postgresql://{user}:{password}@{db_config['host']}:{db_config['port']}/{db_config['database']}"
  engine = create_engine(connection_string)

  queries = {
    'listing': "SELECT id, event_timestamp FROM listing_table",
    'host': "SELECT host_id, event_timestamp FROM host_table",
    'review': "SELECT id, event_timestamp FROM review_table",
    'fact': "SELECT id, event_timestamp FROM fact_table"
  }

  try:
    # dict order is insertion order, so the results unpack in the same order
    # as the keys above.
    with ThreadPoolExecutor() as executor:
      listing_data, host_data, review_data, fact_data = executor.map(lambda q: query_data(engine, q), queries.values())

    listing_df = get_historical_features(fs, listing_data, listing_features)
    host_df = get_historical_features(fs, host_data, host_features)
    review_df = get_historical_features(fs, review_data, review_features)
    fact_df = get_historical_features(fs, fact_data, fact_features)
  finally:
    # Release the pooled database connections even if a query or lookup fails.
    engine.dispose()

  # Drop event_timestamp columns (on new frames rather than in place)
  listing_df, host_df, review_df, fact_df = (
    frame.drop(columns=['event_timestamp'])
    for frame in (listing_df, host_df, review_df, fact_df)
  )

  # Merge dataframes; every join is followed by a de-duplication on 'id' so a
  # key that matches several rows on the right side cannot multiply listings.
  df = pd.merge(listing_df, host_df, on="host_id", how="left").drop_duplicates(subset=['id'])
  df = pd.merge(df, review_df, on="id", how="left").drop_duplicates(subset=['id'])
  df = pd.merge(df, fact_df, on="id", how="left").drop_duplicates(subset=['id'])

  print(f"Length listing_df {len(listing_df)}")
  print(f"Length host_df {len(host_df)}")
  print(f"Length review_df {len(review_df)}")
  print(f"Length fact_df {len(fact_df)}")
  print(f"Length df {len(df)}")

  df.to_csv("./data.csv", index=False)

  return df

def data_validation(df):
  """Pass-through validation hook: hands back ``df`` exactly as received.

  No checks are performed at the moment. A per-column report (missing-value
  count and dtype, preceded by the frame length) was sketched here earlier
  but is currently disabled.
  """
  validated = df
  return validated

def data_preparation(df):
  """Turn the merged frame into train/test splits ready for model fitting.

  The caller's DataFrame is not modified: the identifier columns are dropped
  on a new frame, so calling this twice on the same input works.

  Args:
    df: DataFrame containing ``id``, ``host_id``, the feature columns and the
      ``price`` target.

  Returns:
    ``(X_train, X_test, X_train_scaled, X_test_scaled, y_train, y_test)``.
    ``X_train``/``X_test`` are the unscaled feature splits; the ``*_scaled``
    values are the RobustScaler outputs.
  """
  # Data dropping (non-mutating; identifiers carry no predictive signal here)
  prepared = df.drop(columns=["id", "host_id"])

  # Data transforming
  # NOTE(review): applied to the whole frame, so presumably 'price' is
  # transformed too -- confirm against yeo_johnson_transforming.
  prepared = yeo_johnson_transforming(prepared)

  # Data splitting (70/30, seeded for reproducibility)
  features = prepared.drop(columns="price")
  target = prepared["price"]
  X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

  # Data standardizing: fit on the training split only, then reuse for test
  scaler = RobustScaler()
  X_train_scaled = scaler.fit_transform(X_train)
  X_test_scaled = scaler.transform(X_test)
  print(X_train_scaled.shape)

  return X_train, X_test, X_train_scaled, X_test_scaled, y_train, y_test

def model_training(X_train_scaled, y_train):
  """Grid-search every candidate regressor and keep the best estimator of each.

  Each candidate is tuned with 5-fold cross-validation, scored by negative
  mean squared error. ``LinearRegression`` has an empty grid, so it is
  cross-validated with its default settings only.

  Args:
    X_train_scaled: scaled training features.
    y_train: training target values.

  Returns:
    Dict mapping model name to the refitted best estimator of its grid search.
  """
  models_and_params = {
    "Linear_Regression": (LinearRegression(), {}),
    "Ridge_Regression": (Ridge(), {"alpha": [0.001, 0.01, 0.1, 1, 10]}),
    "Lasso_Regression": (Lasso(), {"alpha": [0.0001, 0.001, 0.01, 0.1, 1]}),
    "Bayesian_Ridge_Regression": (BayesianRidge(), {"alpha_1": [1e-6, 1e-5, 1e-4], "lambda_1": [1e-6, 1e-5, 1e-4]}),
    "ElasticNet_Regression": (ElasticNet(), {"alpha": [0.001, 0.01, 0.1], "l1_ratio": [0.1, 0.2, 0.5]}),
    # Seeded: the tree permutes features at every split, so equally good
    # splits could otherwise be resolved differently from run to run.
    "Decision_Tree_Regression": (DecisionTreeRegressor(random_state=42), {"max_depth": [5, 10, 20, None]})
  }

  trained_models = {}
  for name, (model, param_grid) in models_and_params.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)
    trained_models[name] = grid_search.best_estimator_

  return trained_models

def model_scoring(trained_models, X_test_scaled, y_test):
  """Score every trained model on the test split and pick the one with the highest R^2.

  Args:
    trained_models: dict mapping model name to a fitted estimator exposing
      ``predict`` and ``get_params``.
    X_test_scaled: scaled test features.
    y_test: test target values.

  Returns:
    ``(res, best_model_name)`` where ``res`` is a list of
    ``(name, model, params, rmse, r2)`` tuples in iteration order and
    ``best_model_name`` is the name with the highest R^2. It is ``None`` when
    ``trained_models`` is empty or no R^2 is comparable (e.g. all NaN).
  """
  res = []
  # Initialised up front so an empty input returns None instead of raising
  # UnboundLocalError at the return statement.
  best_model_name = None
  best_r2 = -float('inf')

  for name, model in trained_models.items():
    predictions = model.predict(X_test_scaled)
    # RMSE as sqrt(MSE): the `squared=False` keyword is deprecated in recent
    # scikit-learn releases, and this form works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, predictions)))
    r2 = r2_score(y_test, predictions)
    res.append((name, model, model.get_params(), rmse, r2))

    if r2 > best_r2:
      best_r2 = r2
      best_model_name = name

  return res, best_model_name

def model_exporting(res, best_model_name, model_signature):
  """Log every scored model to MLflow and register the best one.

  A parent run named ``experiment`` is opened in the ``dev_model5``
  experiment; each model gets a nested run with its params, RMSE and R^2.
  Only the model whose name equals ``best_model_name`` is logged as an
  artifact and registered (under its own name) with ``model_signature``.

  Args:
    res: list of ``(name, model, params, rmse, r2)`` tuples.
    best_model_name: name of the entry in ``res`` to log and register.
    model_signature: MLflow ``ModelSignature`` attached to the logged model.

  Returns:
    The run id of the nested run holding the best model, or ``None`` when no
    entry in ``res`` is named ``best_model_name``.
  """
  mlflow.set_tracking_uri("http://mlflow:5000")
  experiment_id = create_mlflow_experiment(
    experiment_name="dev_model5",
    artifact_location="s3://artifacts"
    )
  # Initialised so a missing best model returns None instead of raising
  # UnboundLocalError at the return statement.
  best_model_run_id = None

  with mlflow.start_run(run_name="experiment", experiment_id=experiment_id) as run:
    for name, model, params, rmse, r2 in res:
      with mlflow.start_run(run_name=name, nested=True) as nested_run:
        mlflow.log_params(params)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)

        if name == best_model_name:
          best_model_run_id = nested_run.info.run_id
          mlflow.sklearn.log_model(model, artifact_path=name, signature=model_signature, registered_model_name=name)
          print(f"The best model is: {name}")
          print(f"run_id of best_model: {best_model_run_id}")

    print(f"run_id of models: {run.info.run_id}")

  return best_model_run_id

def model_signature(X_train):
  """Build the MLflow ``ModelSignature`` describing the model's inputs and output.

  Inputs are one ``ColSpec`` per column of ``X_train``, typed from the column's
  pandas dtype. The output is a single ``price`` column of type double, and
  one string parameter ``model_name`` (default ``"model1"``) is declared.

  Args:
    X_train: DataFrame whose columns and dtypes define the input schema.

  Returns:
    The assembled ``ModelSignature``.

  Raises:
    KeyError: if a column has a dtype with no MLflow type mapping.
  """
  # pandas dtype name -> MLflow column type. MLflow's "integer" is 32-bit and
  # "long" is 64-bit, so int64 columns must be declared as "long" to match
  # the data that will actually be sent to the model.
  data_map = {
    "int32": "integer",
    "int64": "long",
    "float32": "float",
    "float64": "double",
    "bool": "boolean",
    "str": "string",
    "object": "string",
    "date": "datetime",
    "datetime64[ns]": "datetime",
  }

  cols_spec = []
  for name, dtype in X_train.dtypes.to_dict().items():
    dtype_name = str(dtype)
    if dtype_name not in data_map:
      # Same exception type as the bare lookup, but it names the column.
      raise KeyError(f"Column '{name}' has dtype '{dtype_name}', which has no MLflow type mapping")
    cols_spec.append(ColSpec(name=name, type=data_map[dtype_name]))

  input_schema = Schema(inputs=cols_spec)
  output_schema = Schema([ColSpec(name="price", type="double")])
  param = ParamSpec(name="model_name", dtype="string", default="model1")
  param_schema = ParamSchema(params=[param])

  # Named `signature` so the local does not shadow this function's own name.
  signature = ModelSignature(inputs=input_schema, outputs=output_schema, params=param_schema)

  return signature

  import pkg_resources  # noqa: TID251
/opt/conda/lib/python3.11/site-packages/mlflow/gateway/config.py:61: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.7/migration/
  @validator("togetherai_api_key", pre=True)
/opt/conda/lib/python3.11/site-packages/mlflow/gateway/config.py:390: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.7/migration/
  @root_validator(skip_on_failure=True)
/opt/conda/lib/python3.11/site-packages/pydantic/_internal/_config.py:284: Pyd

In [2]:
# Training driver: runs the pipeline end to end and exports the best model.
if __name__ == "__main__": 
  # Feature-store extraction is skipped; the CSV it writes (./data.csv) is
  # loaded instead.
  # df = data_extraction()
  df = pd.read_csv("./data.csv")
  df = data_validation(df)
  X_train, X_test, X_train_scaled, X_test_scaled, y_train, y_test = data_preparation(df)
  # The signature is built from the unscaled training frame because it keeps
  # the column names and dtypes.
  ms = model_signature(X_train)
  trained_models = model_training(X_train_scaled, y_train)
  res, best_model_name = model_scoring(trained_models, X_test_scaled, y_test)
  best_model_run_id = model_exporting(res, best_model_name, ms)
  # Disabled shortcut: reuse a previously exported run instead of retraining.
  # best_model_run_id, best_model_name = "adc12572abe04f1eb83ab14a1fb0f111", "Decision_Tree_Regression"

  # Disabled serving smoke test (load_best_model / ModelServing are not
  # defined in this cell).
  # sample_data = data.copy()
  # model = load_best_model(best_model_run_id, best_model_name)
  # model_serving = ModelServing(model)
  # prediction = model_serving.data_prediction(sample_data)
  # print(f"Prediction: {prediction}")

(180888, 31)




Experiment dev_model5 already exists.


The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet

  return _bootstrap._gcd_import(name[level:], package, level)
Registered model 'Decision_Tree_Regression' already exists. Creating a new version of this model...
2024/06/05 04:55:50 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Decision_Tree_Regression, version 3
Created version '3' of model 'Decision_Tree_Regression'.


The best model is: Decision_Tree_Regression
run_id of best_model: 7352439eb158417890265c912441d768
run_id of models: d37b6ac4ed2a4659b63486c5fd185b56


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from utils.model.transforming import yeo_johnson_transforming
from utils.model.imputation import missing_data_handling

# Two hand-written sample rows used as serving input: the "id"/"host_id"
# identifiers plus 31 numeric columns. Some entries are deliberately np.nan
# (availability_90, review_scores_rating, review_scores_checkin,
# minimum_minimum_nights).
data_dict = {
    "id": [2, 3],
    "host_id": [2992450.0, 5651579.0],
    "accommodates": [4.0, 2.0],
    "bathrooms": [1.0, 1.0],
    "bedrooms": [2.0, 1.0],
    "beds": [2.2361068367009524, 0.0],
    "availability_30": [0.0, 11.0],
    "availability_60": [0.0, 15.0],
    "availability_90": [0.0, np.nan],
    "availability_365": [36.0, 15.0],
    "host_response_rate": [100.0, 100.0],
    "host_acceptance_rate": [100.0, 99.0],
    "host_listings_count": [1.0, 2.0],
    "host_total_listings_count": [5.0, 4.0],
    "number_of_reviews": [9.0, 64.12143150995708],
    "number_of_reviews_ltm": [0.0, 28.0],
    "number_of_reviews_l30d": [0.0, 2.0],
    "review_scores_rating": [np.nan, 4.51],
    "review_scores_accuracy": [4.879706019274778, 4.61],
    "review_scores_cleanliness": [4.812461133714737, 4.45],
    "review_scores_checkin": [np.nan, 4.82],
    "review_scores_communication": [4.944250346120556, 4.87],
    "review_scores_location": [4.867251870622564, 4.79],
    "review_scores_value": [4.790738114906661, 4.64],
    "reviews_per_month": [0.08, 3.13],
    "minimum_nights": [28.0, 1.0],
    "maximum_nights": [1125.0, 45.0],
    "minimum_minimum_nights": [28.0, np.nan],
    "maximum_minimum_nights": [28.0, 2.0],
    "minimum_maximum_nights": [1125.0, 1125.0],
    "maximum_maximum_nights": [1125.0, 1125.0],
    "minimum_nights_avg_ntm": [28.0, 2.0],
    "maximum_nights_avg_ntm": [1125.0, 1125.0]
}

# One DataFrame row per list position in data_dict.
data = pd.DataFrame(data_dict)

# Ensure all columns are of type float ("id" is the only column given as ints).
data = data.astype(float)

from feast import FeatureStore
import mlflow
import mlflow.sklearn

def load_best_model(run_id, model_name):
  """Load the scikit-learn model stored under ``model_name`` in MLflow run ``run_id``.

  The artifact is addressed with a ``runs:/<run_id>/<model_name>`` URI.
  """
  return mlflow.sklearn.load_model(f"runs:/{run_id}/{model_name}")

class ModelServing:
  """Applies the training-time preprocessing to raw rows and runs the model on them."""

  def __init__(self, model, scaler=None):
    """Store the model and, optionally, the scaler fitted during training.

    Args:
      model: fitted estimator exposing ``predict``.
      scaler: already-fitted scaler exposing ``transform``. When omitted, a
        new ``RobustScaler`` is fitted on each incoming batch (the original
        behaviour). That makes the scaled values depend on the batch itself
        rather than on the training data, so pass the training scaler
        whenever it is available.
    """
    self.model = model
    self.scaler = scaler

  def data_transforming(self, df):
    """Return the model-ready feature matrix for ``df`` without modifying ``df``.

    Args:
      df: DataFrame containing ``id``, ``host_id`` and the feature columns.
    """
    # Data dropping (on a new frame so the caller's DataFrame stays intact)
    features = df.drop(columns=["id", "host_id"])

    # Missing data handling (imputation)
    features = missing_data_handling(features)

    # Data transforming
    features = yeo_johnson_transforming(features)

    # Data standardizing
    if self.scaler is not None:
      return self.scaler.transform(features)
    # Fallback kept for backward compatibility: scale relative to this batch.
    return RobustScaler().fit_transform(features)

  def data_prediction(self, sample_data):
    """Preprocess ``sample_data`` and return the model's predictions for it."""
    data = self.data_transforming(sample_data)
    pred = self.model.predict(data)

    return pred

# Serving smoke test: load a previously exported model and predict on the
# two sample rows defined above.
if __name__ == "__main__": 
  # Hard-coded MLflow run id and artifact name of an earlier training run.
  best_model_run_id, best_model_name = "6f5ea1d794804da681cc85c0ff02c9dc", "Decision_Tree_Regression"
  model = load_best_model(best_model_run_id, best_model_name)
  print(model)
  # Work on a copy so the module-level sample frame is left as defined.
  sample_data = data.copy()
  model_serving = ModelServing(model)
  prediction = model_serving.data_prediction(sample_data)
  print(f"Prediction: {prediction}")

  import pkg_resources  # noqa: TID251
/opt/conda/lib/python3.11/site-packages/mlflow/gateway/config.py:61: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.7/migration/
  @validator("togetherai_api_key", pre=True)
/opt/conda/lib/python3.11/site-packages/mlflow/gateway/config.py:390: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.7/migration/
  @root_validator(skip_on_failure=True)
/opt/conda/lib/python3.11/site-packages/pydantic/_internal/_config.py:284: Pyd

DecisionTreeRegressor()
[IterativeImputer] Completing matrix with shape (2, 31)
[IterativeImputer] Ending imputation round 1/5, elapsed time 0.03
[IterativeImputer] Change: 0.0, scaled tolerance: 1.125 
[IterativeImputer] Early stopping criterion reached.
Prediction: [289. 103.]
