# Production Infrastructure Part 2: Feature Store

### Loading Libraries

In [None]:
%pip install catboost

In [None]:
# Requests
import requests

# Numerical Computing
import numpy as np

# Data Structures & Typing

from dataclasses import dataclass
from typing import List

# Apache Spark
from pyspark.sql.types import *
from pyspark.sql import functions as F
from pyspark.sql.functions import when

from databricks import feature_store
from databricks.feature_store import FeatureLookup
import mlflow
from mlflow.tracking.client import MlflowClient

# Categorical Boosting
from catboost import CatBoostClassifier, metrics as cb_metrics

# Scikit-Learn
from sklearn import metrics
from sklearn.model_selection import train_test_split

#### Dataset Acquisition & Table Registration

In [None]:
class DataIngest:
  """Download a CSV dataset over HTTP and register it as a Delta table.

  The pipeline run by `register_data` is: download the file, move it to
  `source`, read it as CSV with the supplied schema, write it to `sink` in
  Delta format, and create `database.table_name` on top of `sink`.

  The methods rely on the `spark` and `dbutils` globals, which are not
  defined in this file (presumably provided by the Databricks notebook
  runtime -- confirm before running elsewhere).

  Args:
    url: HTTP(S) address of the raw CSV file.
    local: Path of the downloaded file as seen by `dbutils.fs`; it must
      refer to the file `_acquire_raw` writes into the working directory.
    source: Path the downloaded file is moved to and read from.
    sink: Path the Delta data is written to.
    schema: Schema passed to `spark.read.csv` (no inference is done).
    database: Database that will hold the table.
    table_name: Name of the table created over `sink`.
  """

  # Seconds to wait on each connect/read before abandoning the download.
  REQUEST_TIMEOUT = 60
  # Bytes written to disk per chunk while streaming the download.
  CHUNK_SIZE = 1 << 20

  def __init__(self, url: str, local: str, source: str, sink: str, schema, database: str, table_name: str):
    self.url = url
    self.local = local
    self.source = source
    self.sink = sink
    self.schema = schema
    self.database = database
    self.table_name = table_name

  def _acquire_raw(self) -> None:
    """Stream the file at `url` into the current working directory.

    The file is named after the last path segment of the URL.

    Raises:
      requests.HTTPError: If the server answers with a 4xx/5xx status, so
        an error page is never stored as if it were the dataset.
    """
    file_name = self.url.split("/")[-1]
    # The context manager releases the connection even if writing fails.
    with requests.get(self.url, stream=True, timeout=self.REQUEST_TIMEOUT) as response:
      # Check the status before creating the file so that a failed download
      # leaves nothing behind for the later steps to pick up.
      response.raise_for_status()
      with open(file_name, "wb") as data:
        # Write chunk by chunk instead of buffering the whole body in memory.
        for chunk in response.iter_content(chunk_size=self.CHUNK_SIZE):
          data.write(chunk)

  def _transfer_local(self) -> None:
    """Move the downloaded file from `local` to `source`."""
    dbutils.fs.mv(self.local, self.source)

  def _read_source(self):
    """Return the CSV at `source` as a DataFrame using the explicit schema."""
    return spark.read.csv(self.source, header=True, inferSchema=False, schema=self.schema)

  def _write_source(self, data) -> None:
    """Overwrite `sink` with `data` in Delta format, replacing the schema too."""
    data.write.format("delta").mode("overwrite").option("mergeSchema", "true").option("overwriteSchema", "true").save(self.sink)

  def _create_table(self) -> None:
    """Create the database and table if missing, then delete the raw file."""
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {self.database};")
    spark.sql(f"""CREATE TABLE IF NOT EXISTS {self.database}.{self.table_name} USING DELTA LOCATION '{self.sink}';""")
    # The raw CSV is no longer needed once the Delta data has been written.
    dbutils.fs.rm(self.source)

  def register_data(self) -> None:
    """Run the full pipeline: download, move, read, write Delta, create table."""
    self._acquire_raw()
    self._transfer_local()
    data = self._read_source()
    self._write_source(data)
    self._create_table()

  def get_data(self):
    """Return the registered table `database.table_name` as a DataFrame."""
    return spark.table(f"{self.database}.{self.table_name}")

#### Configuration for Data Ingestion & Registration, Followed by Execution of the Ingest.

In [None]:
# Where the raw UCI "adult" file is fetched from, and the path it is
# expected at after the download.
DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
LOCAL_FILE = "file:/databricks/driver/adult.data"

# Workspace-specific placeholders: fill these in before running the cell.
SOURCE = ""  # path the raw CSV is moved to and read from
SINK = ""  # path the Delta data is written to
DATABASE = ""  # database that will hold the table
TABLE_NAME = "adult_classification"

# Explicit column schema for the CSV, in file order; every field is added
# with the default nullability.
ADULT_SCHEMA = (
  StructType()
  .add('age', IntegerType())
  .add('working_class', StringType())
  .add('final_weight', DoubleType())
  .add('education_level', StringType())
  .add('education_years', DoubleType())
  .add('marital_status', StringType())
  .add('occupation', StringType())
  .add('relationship_type', StringType())
  .add('race', StringType())
  .add('gender', StringType())
  .add('capital_gain', DoubleType())
  .add('capital_loss', DoubleType())
  .add('hours_worked_per_week', DoubleType())
  .add('native_country', StringType())
  .add('income', StringType())
)

# Build the ingest handler and run the whole download-and-register pipeline.
data_handler = DataIngest(
  url=DATA_URL,
  local=LOCAL_FILE,
  source=SOURCE,
  sink=SINK,
  schema=ADULT_SCHEMA,
  database=DATABASE,
  table_name=TABLE_NAME,
)
data_handler.register_data()

In [None]:
# Fetch the table through the ingest handler built in the configuration cell.
raw_data = data_handler.get_data()

# Show the data with the notebook's `display` helper (not defined in this file).
display(raw_data)