# In [3]:
import pandas as pd
import os
from sqlalchemy import create_engine
from concurrent.futures import ThreadPoolExecutor

from feast import FeatureStore
import mlflow
import mlflow.sklearn

from sklearn.preprocessing import OrdinalEncoder, RobustScaler, PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, Lasso, BayesianRidge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

from utils.model.feature_vars import listing_features, host_features, review_features, fact_features
from utils.model.feast import query_data, get_historical_features
from utils.model.checking import df_description
from utils.model.outliers import outliers_handling
from utils.model.encoding import data_encoding
from utils.mlflow.creating import create_mlflow_experiment

# Connection settings for the Postgres database queried in data_extraction().
# Each field can be overridden through an environment variable so credentials
# do not have to live in source control; the defaults keep the original
# development values, so behaviour is unchanged when nothing is set.
db_config = {
  'user': os.environ.get('FEAST_POSTGRES_USER', 'admin'),
  'password': os.environ.get('FEAST_POSTGRES_PASSWORD', 'admin123'),
  'host': os.environ.get('FEAST_POSTGRES_HOST', 'feast_postgres'),
  'port': os.environ.get('FEAST_POSTGRES_PORT', '5432'),
  'database': os.environ.get('FEAST_POSTGRES_DB', 'feast_postgres'),
}

# Point the MLflow client at the tracking server as soon as the module loads.
mlflow.set_tracking_uri("http://mlflow:5000")

def data_extraction():
  """Assemble the modelling dataframe from Postgres and the Feast store.

  Runs one query per source table (listing, host, review, fact) concurrently
  through ``query_data``, hands each result to ``get_historical_features``
  together with the matching feature list, drops the ``event_timestamp``
  column from every frame and left-joins everything onto the listing frame.

  Returns:
    The merged dataframe, de-duplicated on ``id`` after every join (the first
    row per ``id`` is kept).
  """
  # Local import: escape the credentials so characters such as '@', ':' or
  # '/' in the user name or password cannot corrupt the connection URL.
  from urllib.parse import quote_plus

  fs = FeatureStore(repo_path="./feature_repo")

  user = quote_plus(str(db_config['user']))
  password = quote_plus(str(db_config['password']))
  connection_string = (
    f"postgresql://{user}:{password}"
    f"@{db_config['host']}:{db_config['port']}/{db_config['database']}"
  )
  engine = create_engine(connection_string)

  queries = {
    'listing': "SELECT id, event_timestamp FROM listing_table",
    'host': "SELECT host_id, event_timestamp FROM host_table",
    'review': "SELECT id, event_timestamp FROM review_table",
    'fact': "SELECT id, event_timestamp FROM fact_table"
  }

  try:
    # executor.map preserves the insertion order of `queries`, so the four
    # results unpack in listing/host/review/fact order.
    with ThreadPoolExecutor() as executor:
      listing_data, host_data, review_data, fact_data = executor.map(
        lambda q: query_data(engine, q), queries.values()
      )

    listing_df = get_historical_features(fs, listing_data, listing_features)
    host_df = get_historical_features(fs, host_data, host_features)
    review_df = get_historical_features(fs, review_data, review_features)
    fact_df = get_historical_features(fs, fact_data, fact_features)
  finally:
    # Release the pooled connections even when a query or feature lookup
    # fails. Disposal is deferred until after feature retrieval in case the
    # query results still depend on the engine.
    engine.dispose()

  # Drop event_timestamp columns
  for frame in (listing_df, host_df, review_df, fact_df):
    frame.drop(columns=['event_timestamp'], inplace=True)

  # Merge dataframes; de-duplicate on `id` after each join so a one-to-many
  # match cannot multiply listing rows.
  df = pd.merge(listing_df, host_df, on="host_id", how="left").drop_duplicates(subset=['id'])
  df = pd.merge(df, review_df, on="id", how="left").drop_duplicates(subset=['id'])
  df = pd.merge(df, fact_df, on="id", how="left").drop_duplicates(subset=['id'])

  print(f"Length listing_df {len(listing_df)}")
  print(f"Length host_df {len(host_df)}")
  print(f"Length review_df {len(review_df)}")
  print(f"Length fact_df {len(fact_df)}")
  print(f"Length df {len(df)}")

  return df

def data_validation(df):
  """Print a per-column summary of missing values and dtypes.

  Args:
    df: The dataframe to inspect. It is not modified.

  Returns:
    The same dataframe object, so the call can sit inside a pipeline.
  """
  print(f"Length of dataframe: {len(df)}\n")

  # Walk columns positionally rather than by label: looking up a duplicated
  # column label returns a Series/DataFrame instead of a scalar and would
  # break the report.
  missing_values_count = df.isnull().sum()
  lines = [
    f"Column: {col}, Missing Values: {missing_count}, Type: {col_type}"
    for col, missing_count, col_type in zip(df.columns, missing_values_count, df.dtypes)
  ]
  print("\n".join(lines))

  return df


def data_preparation(df):
  """Clean, encode, split and scale the data for model fitting.

  Args:
    df: Dataframe containing a ``price`` column, which is used as the target.

  Returns:
    A tuple ``(X_train_scaled, X_test_scaled, y_train, y_test)``. The feature
    matrices are the outputs of a ``RobustScaler`` fitted on the training
    split only; the targets are left unscaled.
  """
  # Outlier handling first, then encoding, both delegated to project helpers.
  prepared = data_encoding(outliers_handling(df))

  # Separate the target from the predictors and hold out 30% for testing.
  y = prepared["price"]
  X = prepared.drop(columns="price")
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

  # Fit the scaler on the training split only so no test-set statistics leak
  # into the transformation.
  scaler = RobustScaler().fit(X_train)
  return scaler.transform(X_train), scaler.transform(X_test), y_train, y_test

def model_training(X_train_scaled, y_train):
  """Fit the fixed set of candidate regressors on the training data.

  Args:
    X_train_scaled: Scaled training feature matrix.
    y_train: Training target values.

  Returns:
    A dict mapping each model's display name to its fitted estimator, in the
    order the candidates are declared below.
  """
  candidates = {
    "Linear_Regression": LinearRegression(),
    "Ridge_Regression": Ridge(alpha=0.001),
    "Lasso_Regression": Lasso(alpha=0.0001),
    "Bayesian_Ridge Regression": BayesianRidge(alpha_1=1e-6, lambda_1=1e-6),
    "ElasticNet_Regression": ElasticNet(alpha=0.01, l1_ratio=0.2),
    "Decision_Tree_Regression": DecisionTreeRegressor(max_depth=3),
  }

  # scikit-learn estimators return themselves from fit(), so the fitted
  # instance can be stored straight from the call.
  return {
    label: estimator.fit(X_train_scaled, y_train)
    for label, estimator in candidates.items()
  }

def model_validation(trained_models, X_test_scaled, y_test):
  """Score every fitted model on the held-out split.

  Args:
    trained_models: Mapping of model name to fitted estimator.
    X_test_scaled: Scaled test feature matrix.
    y_test: True target values for the test split.

  Returns:
    A list of ``(name, model, params, rmse, r2)`` tuples, one per model, where
    ``params`` is the estimator's ``get_params()`` dict. The list is also
    printed.
  """
  res = []
  for name, model in trained_models.items():
    predictions = model.predict(X_test_scaled)
    # The `squared=False` keyword was deprecated in scikit-learn 1.4 and
    # removed in 1.6. Taking the square root of the MSE gives the same RMSE
    # for a single target (data_preparation passes the `price` column) and
    # works on every scikit-learn version.
    rmse = mean_squared_error(y_test, predictions) ** 0.5
    r2 = r2_score(y_test, predictions)
    res.append((name, model, model.get_params(), rmse, r2))

  print(res)

  return res

def model_exporting(res):
    """Log every evaluated model to MLflow under one parent run.

    A parent run named ``experiment`` is opened in the ``dev_model``
    experiment, and each entry of ``res`` is logged as a nested run carrying
    its parameters, RMSE/R2 metrics and the serialized scikit-learn model.

    Args:
        res: Iterable of ``(name, model, params, rmse, r2)`` tuples, as
            returned by ``model_validation``.
    """
    mlflow.set_tracking_uri("http://mlflow:5000")
    experiment_id = create_mlflow_experiment(
        experiment_name="dev_model",
        artifact_location="/mlflow/artifacts",
    )

    with mlflow.start_run(run_name="experiment", experiment_id=experiment_id):
        for name, model, params, rmse, r2 in res:
            # Model names may contain spaces (e.g. "Bayesian_Ridge Regression");
            # keep the readable name for the run but use a space-free artifact
            # path.
            sanitized_name = name.replace(" ", "_")
            # Pass experiment_id explicitly: a nested run started without it
            # uses the active/default experiment, which is not necessarily
            # the parent's.
            with mlflow.start_run(run_name=name, experiment_id=experiment_id, nested=True):
                mlflow.log_params(params)
                mlflow.log_metric("rmse", rmse)
                mlflow.log_metric("r2", r2)
                mlflow.sklearn.log_model(model, artifact_path=sanitized_name)

      
import joblib

# Script entry point: run the pipeline end to end. Each stage consumes the
# previous stage's output, so the order of the calls matters.
if __name__ == "__main__":
  # Pull the features and merge them into one dataframe.
  df = data_extraction()
  # Print the missing-value / dtype report (the dataframe is returned unchanged).
  df = data_validation(df)
  # Outlier handling, encoding, train/test split and scaling.
  X_train_scaled, X_test_scaled, y_train, y_test = data_preparation(df)
  # Fit every candidate regressor on the training split.
  trained_models = model_training(X_train_scaled, y_train)
  # Compute RMSE and R2 for each model on the test split.
  res = model_validation(trained_models, X_test_scaled, y_test)
  # Log parameters, metrics and model artifacts to MLflow.
  model_exporting(res)