Install Packages

In [1]:
!pip install supabase python-dotenv pandas tensorflow scikit-learn joblib

Collecting supabase
  Downloading supabase-2.15.0-py3-none-any.whl.metadata (11 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting gotrue<3.0.0,>=2.11.0 (from supabase)
  Downloading gotrue-2.12.0-py3-none-any.whl.metadata (6.1 kB)
Collecting postgrest<1.1,>0.19 (from supabase)
  Downloading postgrest-1.0.1-py3-none-any.whl.metadata (3.5 kB)
Collecting realtime<2.5.0,>=2.4.0 (from supabase)
  Downloading realtime-2.4.2-py3-none-any.whl.metadata (6.6 kB)
Collecting storage3<0.12,>=0.10 (from supabase)
  Downloading storage3-0.11.3-py3-none-any.whl.metadata (1.8 kB)
Collecting supafunc<0.10,>=0.9 (from supabase)
  Downloading supafunc-0.9.4-py3-none-any.whl.metadata (1.2 kB)
Collecting pytest-mock<4.0.0,>=3.14.0 (from gotrue<3.0.0,>=2.11.0->supabase)
  Downloading pytest_mock-3.14.0-py3-none-any.whl.metadata (3.8 kB)
Collecting deprecation<3.0.0,>=2.1.0 (from postgrest<1.1,>0.19->supabase)
  Downloading deprecation-2.1.0-py2.py3-no

Bring in data

In [2]:
import os
os.environ['SUPABASE_URL'] = 'https://lgcrogvgnqphznuwdopu.supabase.co'
os.environ['SUPABASE_KEY'] = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImxnY3JvZ3ZnbnFwaHpudXdkb3B1Iiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDQ0MTQwMzcsImV4cCI6MjA1OTk5MDAzN30.2lozGgOq70UbrCm1_7Y1p38WbCqOMTjQ8Cs_ZSvNhSs'

Import PsiKit Learn

In [3]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

Database Setup

In [4]:
%%writefile db.py
import os
from dotenv import load_dotenv
from supabase import create_client, Client
import pandas as pd

# Load environment variables
load_dotenv()

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")

def get_supabase_client() -> Client:
    return create_client(SUPABASE_URL, SUPABASE_KEY)

def fetch_housing_data() -> pd.DataFrame:
    client = get_supabase_client()
    # Replace "House" with your actual table name if different
    response = client.table("House").select("*").execute()
    data = response.data  # a list of dictionaries
    return pd.DataFrame(data)



Writing db.py


Database link

In [5]:
import os
import pandas as pd
from dotenv import load_dotenv
from supabase import create_client, Client

def fetch_housing_data() -> pd.DataFrame:
    load_dotenv()  # Make sure your .env file is uploaded or environment variables set
    SUPABASE_URL = os.getenv("SUPABASE_URL")
    SUPABASE_KEY = os.getenv("SUPABASE_KEY")

    # Create the Supabase client
    supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

    # Fetch the data from your table (replace 'housing_data' if your table name differs)
    response = supabase.table("House").select("*").execute()
    data = response.data  # This should be a list of dictionaries
    df = pd.DataFrame(data)
    return df


Model Script

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from db import fetch_housing_data  # Make sure db.py is in your PYTHONPATH or same folder structure
import joblib

def train_model():
    # 1. Fetch data from your Supabase database
    df = fetch_housing_data()
    print("Data loaded from Supabase:")
    print(df.head())

    # 2. Define input and target features based on your updated columns
    # You can adjust the columns if you wish to use additional ones.
    input_features = ['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']
    target_features = ['QualityOfLifeTotalScore', 'Cost of Living', '2016 Crime Rate']

    # Impute missing values with the median for both inputs and targets.
    X = df[input_features].fillna(df[input_features].median())
    y = df[target_features].fillna(df[target_features].median())

    # 3. Scale the input features.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 4. Split the data into training and testing sets.
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # 5. Build the multioutput regression model with an explicit input layer.
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(len(target_features))
    ])
    model.compile(optimizer='adam', loss='mean_squared_error')

    # 6. Train the model. Using 10% of the training data for validation.
    model.fit(X_train, y_train, epochs=50, validation_split=0.1)

    # 7. Evaluate the model on the test set.
    loss = model.evaluate(X_test, y_test)
    print("Test loss (MSE):", loss)

    # 8. Save the model and scaler for future prediction use.
    model.save("trained_model.h5")
    joblib.dump(scaler, "scaler.pkl")

if __name__ == "__main__":
    train_model()



Data loaded from Supabase:
  State        City  Bedroom  Bathroom    Area  ListedPrice Temperature  \
0    az  phoenix,az      3.0       2.0  1776.0       575000         Hot   
1    az  phoenix,az      4.0       2.0  1505.0       375000         Hot   
2    az  phoenix,az      3.0       2.0  1670.0       370000         Hot   
3    az  phoenix,az      3.0       1.0  1855.0       360000         Hot   
4    az  phoenix,az      4.0       3.0  1426.0       342000         Hot   

   2022 Population  2016 Crime Rate  Unemployment  ...  Cost of Living  \
0          4551524            0.032          3.46  ...        82847.38   
1          4551524            0.032          3.46  ...        82847.38   
2          4551524            0.032          3.46  ...        82847.38   
3          4551524            0.032          3.46  ...        82847.38   
4          4551524            0.032          3.46  ...        82847.38   

   AVG C2I  MeanIncome  QualityOfLifeTotalScore  QualityOfLifeQualityOfLife  



Test loss (MSE): 2287843584.0


Add-on

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Predictions on Model

In [8]:
# services/prediction.py
import numpy as np
import tensorflow as tf
import joblib

# Load your saved model
model = tf.keras.models.load_model("trained_model.h5")
scaler = joblib.load("scaler.pkl")

def predict_house_metrics(user_input):
    """
    user_input: dict or list that includes
    [
      Desired House Price,
      Income,
      # Beds,
      # Baths,
      Sq. ft,
      Desired Population
    ]
    """
    # Convert to array (assuming user_input is a dictionary or list)
    data = np.array([[
      user_input["house_price"],
      user_input["income"],
      user_input["beds"],
      user_input["baths"],
      user_input["sq_ft"],
      user_input["population"]
    ]])
    # Scale
    scaled_data = scaler.transform(data)
    # Predict
    prediction = model.predict(scaled_data)
    return prediction.tolist()




Find closest match

In [9]:
import numpy as np

def find_closest_match(user_input, df, scaler):
    """
    Given a user_input dictionary, a DataFrame df with the housing records,
    and a scaler used for the training data, this function finds the record
    in df that is closest to the user's desired features.

    Parameters:
      - user_input: dict with keys "house_price", "income", "beds",
                    "baths", "sq_ft", "population"
      - df: DataFrame of housing records containing at least the following columns:
            ['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']
            plus location columns like "State", "City".
      - scaler: A StandardScaler fitted on the training data.

    Returns:
      - closest_match: A Pandas Series that represents the record closest to the input.
    """
    # Define the features used for matching (same as used in training)
    input_features = ['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']

    # Construct the user vector from the dictionary
    user_vector = np.array([[user_input["house_price"],
                              user_input["income"],
                              user_input["beds"],
                              user_input["baths"],
                              user_input["sq_ft"],
                              user_input["population"]]])

    # Scale the user input using the same scaler
    user_vector_scaled = scaler.transform(user_vector)

    # Extract the features from df and handle missing values if needed
    data_features = df[input_features].fillna(df[input_features].median())

    # Scale all these feature rows
    data_features_scaled = scaler.transform(data_features)

    # Compute Euclidean distances between the user vector and each row in data_features_scaled
    distances = np.linalg.norm(data_features_scaled - user_vector_scaled, axis=1)

    # Find the index of the closest match
    idx = np.argmin(distances)
    closest_match = df.iloc[idx]

    return closest_match


User Input and Output

In [10]:
import numpy as np
import pandas as pd

# Load the full dataset from Supabase (or your source)
df = fetch_housing_data()  # Make sure this cell runs so df is defined
print("Data loaded:")
print(df.head())

# 1. Collect user input
house_price = float(input("Enter your desired house price: "))
income = float(input("Enter your income: "))
beds = int(input("Enter number of beds: "))
baths = int(input("Enter number of baths: "))
sq_ft = float(input("Enter square footage: "))
population = float(input("Enter desired population: "))

user_input = {
    "house_price": house_price,
    "income": income,
    "beds": beds,
    "baths": baths,
    "sq_ft": sq_ft,
    "population": population
}

# 2. Get model prediction (if needed)
prediction = predict_house_metrics(user_input)
print("Prediction outputs (model's numerical predictions):", prediction)

# 3. Load the full dataset so df is defined
df = fetch_housing_data()  # Or load it from a CSV if needed
print("Full dataset loaded (first 5 rows):")
print(df.head())

# 4. Find the closest matching record
closest_match = find_closest_match(user_input, df, scaler)

# 5. Print out the details from the best match
print("\nClosest Matching House Record:")
print("State:", closest_match.get("State", "N/A"))
print("City:", closest_match.get("City", "N/A"))
print("Listed Price:", closest_match.get("ListedPrice", "N/A"))
print("Mean Income:", closest_match.get("MeanIncome", "N/A"))
print("Bedrooms:", closest_match.get("Bedroom", "N/A"))
print("Bathrooms:", closest_match.get("Bathroom", "N/A"))
print("Area:", closest_match.get("Area", "N/A"))
print("2022 Population:", closest_match.get("2022 Population", "N/A"))



Data loaded:
  State        City  Bedroom  Bathroom    Area  ListedPrice Temperature  \
0    az  phoenix,az      3.0       2.0  1534.0       407150         Hot   
1    az  phoenix,az      2.0       2.0   864.0       317000         Hot   
2    az  phoenix,az      4.0       2.0  1092.0       395000         Hot   
3    az  phoenix,az      3.0       2.0  1123.0       364900         Hot   
4    az  phoenix,az      2.0       1.0   922.0       249900         Hot   

   2022 Population  2016 Crime Rate  Unemployment  ...  Cost of Living  \
0          4551524            0.032          3.46  ...        82847.38   
1          4551524            0.032          3.46  ...        82847.38   
2          4551524            0.032          3.46  ...        82847.38   
3          4551524            0.032          3.46  ...        82847.38   
4          4551524            0.032          3.46  ...        82847.38   

   AVG C2I  MeanIncome  QualityOfLifeTotalScore  QualityOfLifeQualityOfLife  \
0    105.1  



Prediction outputs (model's numerical predictions): [[2687318.0, -925570.75, -1607212.5]]
Full dataset loaded (first 5 rows):
  State        City  Bedroom  Bathroom    Area  ListedPrice Temperature  \
0    az  phoenix,az      3.0       2.0  1166.0       449000         Hot   
1    az  phoenix,az      3.0       2.0  1482.0       429900         Hot   
2    az  phoenix,az      3.0       2.0  1357.0       379000         Hot   
3    az  phoenix,az      5.0       3.0  2931.0       659900         Hot   
4    az  phoenix,az      4.0       4.0  3139.0       677000         Hot   

   2022 Population  2016 Crime Rate  Unemployment  ...  Cost of Living  \
0          4551524            0.032          3.46  ...        82847.38   
1          4551524            0.032          3.46  ...        82847.38   
2          4551524            0.032          3.46  ...        82847.38   
3          4551524            0.032          3.46  ...        82847.38   
4          4551524            0.032          3.46  ..



Evaluate Model

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import os
import pandas as pd
from dotenv import load_dotenv
from supabase import create_client, Client

def fetch_housing_data() -> pd.DataFrame:
    load_dotenv()  # Make sure your environment variables are set
    SUPABASE_URL = os.getenv("SUPABASE_URL")
    SUPABASE_KEY = os.getenv("SUPABASE_KEY")
    client: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
    response = client.table("House").select("*").execute()
    data = response.data
    return pd.DataFrame(data)


# Import additional metrics from scikit-learn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# scripts/train_model.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

def train_model():
    # 1. Fetch data
    df = fetch_housing_data()

    # 2. Define features/targets
    input_features = ['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']
    target_features = ['QualityOfLifeTotalScore', 'Cost of Living', '2016 Crime Rate']

    X = df[input_features].fillna(df[input_features].median())
    y = df[target_features].fillna(df[target_features].median())

    # 3. Scale
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 4. Split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # 5. Build model
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(len(target_features), input_shape=(X_train.shape[1],))
    ])

    # Change: Since there is only one Dense layer (output),
    # use a single loss function and remove loss_weights
    model.compile(optimizer='adam', loss='mean_squared_error')

    # 6. Train
    model.fit(X_train, y_train, epochs=50, validation_split=0.1)

    # 7. Evaluate
    loss = model.evaluate(X_test, y_test)
    print("Test loss:", loss)

    # 8. Save model & scaler if needed
    model.save("trained_model.h5")
    # Optionally pickle the scaler for predictions
    import joblib
    joblib.dump(scaler, "scaler.pkl")

if __name__ == "__main__":
    train_model()

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 2283911936.0000 - val_loss: 2272470016.0000
Epoch 2/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 2284296448.0000 - val_loss: 2272468736.0000
Epoch 3/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2281052928.0000 - val_loss: 2272467456.0000
Epoch 4/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 2282250240.0000 - val_loss: 2272466432.0000
Epoch 5/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2283545344.0000 - val_loss: 2272464896.0000
Epoch 6/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 2285220096.0000 - val_loss: 2272463872.0000
Epoch 7/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2282976512.0000 - val_loss: 2272462592.0000
Epoch 8/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37



Test loss: 2287834624.0


Weight Calc

In [12]:
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")


import numpy as np
import pandas as pd

def find_closest_match(user_input, df, scaler, weight_vector=None):
    """
    Finds the closest matching record for a given user_input.

    Parameters:
      - user_input: dict with keys "house_price", "income", "beds",
                    "baths", "sq_ft", "population"
      - df: DataFrame of housing records that contains at least the following columns:
            ['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']
            plus location columns like "State" or "City".
      - scaler: A fitted StandardScaler used to transform the numeric input features.
      - weight_vector (optional): Array-like weights for each feature. If provided,
                                  it multiplies the squared difference for that feature.

    Returns:
      - closest_match: A Pandas Series corresponding to the record with the smallest weighted distance.
      - distance: The computed weighted distance.
    """
    # Input features (same order as used in scaling/training)
    input_features = ['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']

    # Construct the user feature vector as a 2D array (one row)
    user_vector = np.array([[user_input["house_price"],
                             user_input["income"],
                             user_input["beds"],
                             user_input["baths"],
                             user_input["sq_ft"],
                             user_input["population"]]])

    # Scale the user input using the same scaler
    user_vector_scaled = scaler.transform(user_vector)

    # Extract corresponding features from the dataset and fill missing values
    data_features = df[input_features].fillna(df[input_features].median())
    data_features_scaled = scaler.transform(data_features)

    if weight_vector is not None:
        weight_vector = np.array(weight_vector)  # Ensure it's a NumPy array
        # Compute the weighted squared differences
        differences = data_features_scaled - user_vector_scaled  # shape: (n_samples, n_features)
        weighted_squared_diff = (differences ** 2) * weight_vector
        distances = np.sqrt(np.sum(weighted_squared_diff, axis=1))
    else:
        # Standard Euclidean distance
        distances = np.linalg.norm(data_features_scaled - user_vector_scaled, axis=1)

    # Find the index of the closest match
    idx = np.argmin(distances)
    closest_match = df.iloc[idx]
    return closest_match, distances[idx]

def evaluate_weight_vector(weight_vector, df, scaler):
    """
    Evaluates a given weight_vector by treating every row in df as a query.
    For each query, find the closest matching record (excluding itself)
    and compute the MSE between the target values of the query and the match.

    Parameters:
      - weight_vector: Array-like list of weights for the input features.
      - df: Full DataFrame containing both input features and target features.
      - scaler: The fitted StandardScaler used on the input features.

    Returns:
      - overall_mse: The average Mean Squared Error computed over all queries.
    """
    input_features = ['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']
    target_features = ['QualityOfLifeTotalScore', 'Cost of Living', '2016 Crime Rate']

    errors = []
    for idx, row in df.iterrows():
        # Use the current row as user input
        user_input = {
            "house_price": row["ListedPrice"],
            "income": row["MeanIncome"],
            "beds": row["Bedroom"],
            "baths": row["Bathroom"],
            "sq_ft": row["Area"],
            "population": row["2022 Population"]
        }

        # Find the closest match using the provided weight_vector
        match, distance = find_closest_match(user_input, df, scaler, weight_vector=weight_vector)
        # Skip if the best match is the same record
        if match.name == idx:
            continue

        # Compute the MSE between the target features of the query and its match
        true_target = row[target_features].values.astype(float)
        matched_target = match[target_features].values.astype(float)
        error = np.mean((true_target - matched_target) ** 2)
        errors.append(error)

    overall_mse = np.mean(errors)
    return overall_mse

#############################################
# Example usage:
#############################################

# Assume you have already loaded your full dataset df and your fitted scaler.
# For example:
# df = fetch_housing_data()   <-- Your function to load the data
# scaler = ...                <-- Your StandardScaler fitted on the input features

# For demonstration, here is a dummy setup:
# (Uncomment and replace with your actual data fetching logic)
# df = pd.read_csv("your_housing_data.csv")
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# df_features = df[['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']].fillna(df.median())
# scaler.fit(df_features)

# Define an arbitrary weight vector for the 6 input features.
# You can modify these numbers to adjust the relative importance.
weight_vector = [1.8, 1.0, 1.0, 1.0, 1.0, 1.0]

# Evaluate the matching MSE for this weight vector.
overall_mse = evaluate_weight_vector(weight_vector, df, scaler)
print("Overall Matching MSE for weight_vector {}: {:.2f}".format(weight_vector, overall_mse))


Overall Matching MSE for weight_vector [1.8, 1.0, 1.0, 1.0, 1.0, 1.0]: 834832.34
