# **Goal: learn system design concepts through an ML workflow**

## Concepts tackled:
1. Separation of concerns - design choices such as enforcing method precedence, and using lazy initialization vs. function bundling.

In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

house = fetch_california_housing(as_frame= True)


In [2]:
print(house.data.shape, house.target.shape, house.feature_names)

(20640, 8) (20640,) ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [3]:
# Feature frame built from the dataset, using the dataset's own feature names
# as column labels; the trailing expression previews the first rows.
x = pd.DataFrame(
    data=house.data,
    columns=house.feature_names,
)
x.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [4]:
# Target kept as a one-column frame (rather than a Series) under the
# 'MedHouseVal' label; the trailing expression previews the first rows.
y = pd.DataFrame(
    data=house.target,
    columns=['MedHouseVal'],
)
y.head()

Unnamed: 0,MedHouseVal
0,4.526
1,3.585
2,3.521
3,3.413
4,3.422


In [5]:
class Datapipline:
    """Stateful preprocessing pipeline for a tabular regression dataset.

    The instance holds a feature frame ``x`` and a target ``y`` and exposes
    each preprocessing step as a method that updates the instance in place.
    The order used by this notebook is::

        check_vif -> drop_vif_columns -> check_outliers -> split_data
        -> fit_scaling -> transform_scaling

    Steps that need the result of an earlier step raise ``ValueError`` when
    that earlier step has not been run.

    Args:
        x: Feature ``pandas.DataFrame``.
        y: Target values, split together with ``x`` in ``split_data``.
        random_state: Seed forwarded to ``train_test_split``.
    """

    def __init__(self, x, y, random_state=47):
        self.x = x
        self.y = y
        self.random_state = random_state
        self.scaler = None
        # Column names flagged by check_vif; None means "check_vif not run
        # yet", which is distinct from an empty list ("nothing flagged").
        self.vif_df = None
        # Per-column clipping bounds, set by check_outliers.
        self.lower_bound = None
        self.upper_bound = None
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None

    # NOTE: x, y and random_state are plain public attributes; nothing stops
    # a caller from reassigning them between steps.

    def check_colinearity(self):
        """Return the pairwise correlation matrix of the feature columns."""
        return self.x.corr()

    def check_vif(self, model, threshold=5):
        """Flag feature columns whose variance inflation factor is too high.

        Every column is regressed on all remaining columns with ``model``;
        its VIF is ``1 / (1 - R^2)`` of that fit, or ``inf`` when the score
        is not below 1.

        Args:
            model: Estimator whose ``fit(X, y)`` returns an object with a
                ``score(X, y)`` method (e.g. ``LinearRegression()``).
            threshold: Columns with a VIF strictly above this are flagged.

        Returns:
            List of flagged column names, also stored on ``self.vif_df``.
        """
        vif_values = []
        for column in self.x.columns:
            # Build the "all other columns" frame once per column.
            others = self.x.drop(column, axis=1)
            target = self.x[column]
            r2 = model.fit(others, target).score(others, target)
            vif_values.append(1 / (1 - r2) if r2 < 1 else float("inf"))

        self.vif_df = [
            column
            for column, vif in zip(self.x.columns, vif_values)
            if vif > threshold
        ]
        return self.vif_df

    def drop_vif_columns(self):
        """Drop the columns flagged by ``check_vif`` from ``self.x``.

        NOTE(review): every flagged column is dropped at once, so both
        members of a collinear pair are removed rather than just one.

        Returns:
            The feature frame after dropping (unchanged columns when
            ``check_vif`` flagged nothing).

        Raises:
            ValueError: If ``check_vif`` has not been run.
        """
        if self.vif_df is None:
            raise ValueError(" Run Check_VIF before drop_VIF_Columns")

        self.x = self.x.drop(self.vif_df, axis=1)
        return self.x

    def check_outliers(self):
        """Clip every feature column to its 1.5 * IQR fences.

        The fences are stored on ``self.lower_bound`` / ``self.upper_bound``
        and ``self.x`` is replaced by the clipped frame.

        NOTE(review): the quantiles are taken over all rows of ``self.x``;
        when this runs before ``split_data`` the future test rows influence
        the bounds.

        Returns:
            The clipped feature frame.
        """
        first_quartile = self.x.quantile(0.25)
        third_quartile = self.x.quantile(0.75)
        iqr = third_quartile - first_quartile
        self.lower_bound = first_quartile - 1.5 * iqr
        self.upper_bound = third_quartile + 1.5 * iqr
        self.x = self.x.clip(self.lower_bound, self.upper_bound, axis=1)
        return self.x

    def split_data(self, test_size=0.3):
        """Split ``x`` and ``y`` into train and test partitions.

        Args:
            test_size: Forwarded to ``train_test_split``.

        Returns:
            Tuple ``(x_train, x_test, y_train, y_test)``; the same objects
            are stored on the instance.
        """
        from sklearn.model_selection import train_test_split

        self.x_train, self.x_test, self.y_train, self.y_test = train_test_split(
            self.x,
            self.y,
            test_size=test_size,
            random_state=self.random_state,
        )
        return self.x_train, self.x_test, self.y_train, self.y_test

    def fit_scaling(self, scaler):
        """Fit ``scaler`` on the training features only.

        Args:
            scaler: Object whose ``fit(X)`` returns the fitted scaler
                (e.g. ``StandardScaler()``).

        Returns:
            The fitted scaler, also stored on ``self.scaler``.

        Raises:
            ValueError: If ``split_data`` has not been run.
        """
        if self.x_train is None:
            raise ValueError("Run split_data before fit_scaling")

        self.scaler = scaler.fit(self.x_train)
        return self.scaler

    def transform_scaling(self):
        """Replace ``x_train`` / ``x_test`` with their scaled versions.

        NOTE: the stored partitions are overwritten, so calling this twice
        scales already-scaled data.

        Returns:
            Tuple ``(x_train, x_test)`` after transformation.

        Raises:
            ValueError: If ``split_data`` or ``fit_scaling`` has not been run.
        """
        if self.x_train is None or self.x_test is None:
            raise ValueError("Run split_data before transform_scaling")
        if self.scaler is None:
            raise ValueError("Run fit_scaling before transform_scaling")

        self.x_train = self.scaler.transform(self.x_train)
        self.x_test = self.scaler.transform(self.x_test)
        return self.x_train, self.x_test



    # define method to enforce method precedence.



In [6]:
training_data = Datapipline(x, y)

In [7]:
training_data.check_colinearity()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
MedInc,1.0,-0.119034,0.326895,-0.06204,0.004834,0.018766,-0.079809,-0.015176
HouseAge,-0.119034,1.0,-0.153277,-0.077747,-0.296244,0.013191,0.011173,-0.108197
AveRooms,0.326895,-0.153277,1.0,0.847621,-0.072213,-0.004852,0.106389,-0.02754
AveBedrms,-0.06204,-0.077747,0.847621,1.0,-0.066197,-0.006181,0.069721,0.013344
Population,0.004834,-0.296244,-0.072213,-0.066197,1.0,0.069863,-0.108785,0.099773
AveOccup,0.018766,0.013191,-0.004852,-0.006181,0.069863,1.0,0.002366,0.002476
Latitude,-0.079809,0.011173,0.106389,0.069721,-0.108785,0.002366,1.0,-0.924664
Longitude,-0.015176,-0.108197,-0.02754,0.013344,0.099773,0.002476,-0.924664,1.0


In [8]:
from sklearn.linear_model import LinearRegression
training_data.check_vif(LinearRegression())

['AveRooms', 'AveBedrms', 'Latitude', 'Longitude']

In [9]:
training_data.drop_vif_columns()

Unnamed: 0,MedInc,HouseAge,Population,AveOccup
0,8.3252,41.0,322.0,2.555556
1,8.3014,21.0,2401.0,2.109842
2,7.2574,52.0,496.0,2.802260
3,5.6431,52.0,558.0,2.547945
4,3.8462,52.0,565.0,2.181467
...,...,...,...,...
20635,1.5603,25.0,845.0,2.560606
20636,2.5568,18.0,356.0,3.122807
20637,1.7000,17.0,1007.0,2.325635
20638,1.8672,18.0,741.0,2.123209


In [10]:
training_data.check_outliers()

Unnamed: 0,MedInc,HouseAge,Population,AveOccup
0,8.013025,41.0,322.0,2.555556
1,8.013025,21.0,2401.0,2.109842
2,7.257400,52.0,496.0,2.802260
3,5.643100,52.0,558.0,2.547945
4,3.846200,52.0,565.0,2.181467
...,...,...,...,...
20635,1.560300,25.0,845.0,2.560606
20636,2.556800,18.0,356.0,3.122807
20637,1.700000,17.0,1007.0,2.325635
20638,1.867200,18.0,741.0,2.123209


In [11]:
training_data.split_data()

(         MedInc  HouseAge  Population  AveOccup
 4757   2.595500      35.0      2042.0  3.614159
 15755  3.350000      52.0      1275.0  2.305606
 8060   2.484400      46.0      1360.0  3.170163
 5100   3.725000      52.0       819.0  2.843750
 2100   1.654800      35.0      1676.0  3.484407
 ...         ...       ...         ...       ...
 19280  3.096900      26.0      2605.0  3.700284
 11528  2.270800      26.0       664.0  1.301961
 14663  3.871900      29.0      1392.0  2.666667
 18310  5.280300      35.0      1187.0  2.287091
 5255   8.013025      40.0      1564.0  2.990440
 
 [14448 rows x 4 columns],
        MedInc  HouseAge  Population  AveOccup
 10486  5.0527      15.0       727.0  2.481229
 16251  2.5000      43.0       367.0  2.414474
 8883   7.8705      52.0       547.0  2.940860
 15209  6.1949       5.0      2565.0  2.843681
 11965  4.2321       9.0      3015.0  3.513986
 ...       ...       ...         ...       ...
 7902   7.2628      21.0      1671.0  3.555319
 9431  

In [12]:
training_data.x_train.shape, training_data.x_test.shape, training_data.y_train.shape, training_data.y_test.shape

((14448, 4), (6192, 4), (14448, 1), (6192, 1))

In [13]:
from sklearn.preprocessing import StandardScaler
training_data.fit_scaling(StandardScaler())

In [14]:
training_data.transform_scaling()

(array([[-0.73043791,  0.50400247,  0.91722281,  1.02824578],
        [-0.27286806,  1.85437197, -0.08244663, -0.85574386],
        [-0.797815  ,  1.37777097,  0.02833812,  0.38900246],
        ...,
        [ 0.04364049,  0.02740148,  0.07004532, -0.33590653],
        [ 0.89777088,  0.50400247, -0.19714143, -0.88240116],
        [ 2.555044  ,  0.90116997,  0.29422152,  0.13024573]]),
 array([[ 0.75974186, -1.08466751, -0.79668242, -0.60289085],
        [-0.78835431,  1.13947047, -1.26588841, -0.69900133],
        [ 2.46860909,  1.85437197, -1.03128542,  0.05886358],
        ...,
        [-1.52956288,  1.06003697, -0.58423637,  1.0125231 ],
        [-0.03750311,  0.02740148,  0.24078417, -1.51091497],
        [-0.19451415, -1.64070201, -1.36494301, -0.53204636]]))

# Model Architecture in Jax/Flax

In [15]:
import jax
import jax.numpy as jnp
import flax.linen as nn


class RegressionModel(nn.Module):
    """Feed-forward regressor: Dense(64) -> ReLU -> Dense(32) -> ReLU -> Dense(1)."""

    @nn.compact
    def __call__(self, x):
        """Map a batch of feature rows to one output value per row."""
        hidden = x
        # Hidden layers are created in the same order as before (64, then 32).
        for width in (64, 32):
            hidden = nn.relu(nn.Dense(width)(hidden))
        # Final linear layer, no activation.
        return nn.Dense(1)(hidden)

Initializing the model

In [17]:
model = RegressionModel()

# Convert the four pipeline partitions to JAX arrays in one pass.
x_train, y_train, x_test, y_test = (
    jnp.asarray(partition)
    for partition in (
        training_data.x_train,
        training_data.y_train,
        training_data.x_test,
        training_data.y_test,
    )
)

# Initialise the parameters from a fixed PRNG key, using the training
# features as the example input.
params = model.init(jax.random.PRNGKey(42), x_train)

In [18]:
def regression_loss(params, x, y):
    """Return the mean squared error of the model's predictions on ``x``.

    Args:
        params: Parameter pytree passed to ``model.apply``.
        x: Input features for the model.
        y: Targets. May have the same shape as the predictions or lack
            their trailing axis (e.g. ``(N,)`` against ``(N, 1)``).

    Returns:
        Scalar mean of the squared residuals.
    """
    y_pred = model.apply(params, x)
    y = jnp.asarray(y)
    # A target missing the trailing axis would broadcast against the
    # predictions into an (N, N) residual matrix and silently give a wrong
    # loss, so add the axis explicitly.
    if y.ndim == y_pred.ndim - 1:
        y = y[..., None]
    return jnp.mean((y - y_pred) ** 2)

# Optimizer initialization and state

In [23]:
import optax
optimizer = optax.adam(learning_rate=0.001)
# Initial optimizer state derived from the parameter pytree. It is not
# mutated in place: train_step rebinds opt_state to the new state returned
# by optimizer.update on every step.
opt_state = optimizer.init(params)

In [25]:
@jax.jit
def train_step(params, opt_state, x, y):
    """Run one gradient step and return the updated training state.

    Args:
        params: Current parameter pytree.
        opt_state: Current optimizer state.
        x: Input features for this step.
        y: Targets for this step.

    Returns:
        Tuple ``(new_params, opt_state, loss)`` where ``loss`` is the value
        computed with the parameters passed in (before the update).
    """
    loss, grads = jax.value_and_grad(regression_loss)(params, x, y)
    # params is passed as well so the step also works with transforms that
    # need the current parameters (e.g. weight decay).
    updates, opt_state = optimizer.update(grads, opt_state, params)

    # Nothing is modified in place: a new parameter pytree is returned.
    new_params = optax.apply_updates(params, updates)
    return new_params, opt_state, loss

In [33]:
for epoch in range(1000):
    # One update on the full training set per iteration; params and
    # opt_state are rebound to the values returned by the step.
    params, opt_state, loss = train_step(
        params=params,
        opt_state=opt_state,
        x=x_train,
        y=y_train,
    )
    print(f"Epoch {epoch+1}: Loss = {loss}")

Epoch 1: Loss = 0.48213157057762146
Epoch 2: Loss = 0.48173239827156067
Epoch 3: Loss = 0.4813402593135834
Epoch 4: Loss = 0.4809546172618866
Epoch 5: Loss = 0.48057591915130615
Epoch 6: Loss = 0.48020392656326294
Epoch 7: Loss = 0.4798392355442047
Epoch 8: Loss = 0.4794809818267822
Epoch 9: Loss = 0.47912904620170593
Epoch 10: Loss = 0.47878333926200867
Epoch 11: Loss = 0.47844353318214417
Epoch 12: Loss = 0.47810932993888855
Epoch 13: Loss = 0.4777815639972687
Epoch 14: Loss = 0.4774593412876129
Epoch 15: Loss = 0.4771433472633362
Epoch 16: Loss = 0.47683286666870117
Epoch 17: Loss = 0.4765280783176422
Epoch 18: Loss = 0.47622933983802795
Epoch 19: Loss = 0.4759366810321808
Epoch 20: Loss = 0.4756496846675873
Epoch 21: Loss = 0.4753682017326355
Epoch 22: Loss = 0.4750921428203583
Epoch 23: Loss = 0.4748217463493347
Epoch 24: Loss = 0.4745561480522156
Epoch 25: Loss = 0.47429531812667847
Epoch 26: Loss = 0.47403988242149353
Epoch 27: Loss = 0.4737894535064697
Epoch 28: Loss = 0.473544

In [34]:
from sklearn.metrics import r2_score

# R^2 of the trained model on the test partition.
pred = model.apply(params, x_test)
r2_score(y_true=y_test, y_pred=pred)

0.6462542414665222

In [35]:
from sklearn.metrics import r2_score

# R^2 of the trained model on the training partition, for comparison
# with the test-set score.
pred = model.apply(params, x_train)
r2_score(y_true=y_train, y_pred=pred)

0.6674398183822632

In [40]:
def predict(params, input_data):
    """Predict the target for one or more raw (unscaled) feature rows.

    The input goes through the same preprocessing as the training features:
    it is clipped to the outlier bounds stored on ``training_data`` (when
    available and matching the feature count) and then scaled with the
    fitted scaler.

    Args:
        params: Parameter pytree passed to ``model.apply``.
        input_data: A single example (1-D sequence) or a batch (2-D), with
            one value per feature the model was trained on.

    Returns:
        The output of ``model.apply`` for the preprocessed rows.

    Raises:
        ValueError: If the input is not 1-D or 2-D, has the wrong number of
            features, or the scaler has not been fitted.
    """
    input_array = jnp.asarray(input_data)

    # Handle single example: [1, 2, 3] -> shape (3,) -> reshape to (1, 3)
    if input_array.ndim == 1:
        input_array = input_array.reshape(1, -1)
    elif input_array.ndim != 2:
        raise ValueError(
            f"Expected a 1-D or 2-D input, got {input_array.ndim} dimensions"
        )

    # Now safe to check feature dimension
    n_features = training_data.x_train.shape[1]
    if input_array.shape[1] != n_features:
        raise ValueError(f"Expected {n_features} features, got {input_array.shape[1]}")

    if training_data.scaler is None:
        raise ValueError("Run fit_scaling before predict")

    # Apply the same clipping as the training features so inference inputs
    # are preprocessed consistently. Skipped when no bounds were recorded or
    # when they do not line up with the current feature count.
    lower = getattr(training_data, "lower_bound", None)
    upper = getattr(training_data, "upper_bound", None)
    if (
        lower is not None
        and upper is not None
        and len(lower) == n_features
        and len(upper) == n_features
    ):
        input_array = jnp.clip(input_array, jnp.asarray(lower), jnp.asarray(upper))

    input_scaled = training_data.scaler.transform(input_array)
    return model.apply(params, input_scaled)

In [41]:
predict(params, [5, 3,3000,2.5])



Array([[2.2701364]], dtype=float32)