Install Packages

In [1]:
!pip install supabase python-dotenv pandas tensorflow scikit-learn joblib

Collecting supabase
  Downloading supabase-2.15.0-py3-none-any.whl.metadata (11 kB)
Collecting python-dotenv
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Collecting gotrue<3.0.0,>=2.11.0 (from supabase)
  Downloading gotrue-2.12.0-py3-none-any.whl.metadata (6.1 kB)
Collecting postgrest<1.1,>0.19 (from supabase)
  Downloading postgrest-1.0.1-py3-none-any.whl.metadata (3.5 kB)
Collecting realtime<2.5.0,>=2.4.0 (from supabase)
  Downloading realtime-2.4.2-py3-none-any.whl.metadata (6.6 kB)
Collecting storage3<0.12,>=0.10 (from supabase)
  Downloading storage3-0.11.3-py3-none-any.whl.metadata (1.8 kB)
Collecting supafunc<0.10,>=0.9 (from supabase)
  Downloading supafunc-0.9.4-py3-none-any.whl.metadata (1.2 kB)
Collecting pytest-mock<4.0.0,>=3.14.0 (from gotrue<3.0.0,>=2.11.0->supabase)
  Downloading pytest_mock-3.14.0-py3-none-any.whl.metadata (3.8 kB)
Collecting deprecation<3.0.0,>=2.1.0 (from postgrest<1.1,>0.19->supabase)
  Downloading deprecation-2.1.0-py2.py3-no

Bring in data

In [2]:
import os
os.environ['SUPABASE_URL'] = 'https://lgcrogvgnqphznuwdopu.supabase.co'
os.environ['SUPABASE_KEY'] = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImxnY3JvZ3ZnbnFwaHpudXdkb3B1Iiwicm9sZSI6ImFub24iLCJpYXQiOjE3NDQ0MTQwMzcsImV4cCI6MjA1OTk5MDAzN30.2lozGgOq70UbrCm1_7Y1p38WbCqOMTjQ8Cs_ZSvNhSs'

Import PsiKit Learn

In [3]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

Database Setup

In [4]:
%%writefile db.py
import os
from dotenv import load_dotenv
from supabase import create_client, Client
import pandas as pd

# Load environment variables
load_dotenv()

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")

def get_supabase_client() -> Client:
    return create_client(SUPABASE_URL, SUPABASE_KEY)

def fetch_housing_data() -> pd.DataFrame:
    client = get_supabase_client()
    # Replace "House" with your actual table name if different
    response = client.table("House").select("*").execute()
    data = response.data  # a list of dictionaries
    return pd.DataFrame(data)



Writing db.py


Database link

In [5]:
%%writefile db.py
import os
from dotenv import load_dotenv
from supabase import create_client, Client
import pandas as pd

# Load environment variables from a .env file or your environment
load_dotenv()

SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")

def get_supabase_client() -> Client:
    return create_client(SUPABASE_URL, SUPABASE_KEY)

def fetch_housing_data() -> pd.DataFrame:
    client = get_supabase_client()
    # Replace "House" with your actual table name if needed.
    response = client.table("House").select("*").execute()
    data = response.data  # a list of dictionaries
    return pd.DataFrame(data)


Overwriting db.py


Model Script

In [6]:
%%writefile train_model.py
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from db import fetch_housing_data
import joblib

def train_model():
    # 1. Fetch data from Supabase.
    df = fetch_housing_data()
    print("Data loaded from Supabase:")
    print(df.head())

    # 2. Create a binary target from "QualityOfLifeTotalScore".
    # Here, a score above the median is set to 1 (high quality) and otherwise 0.
    threshold = df['QualityOfLifeTotalScore'].median()
    df['target'] = (df['QualityOfLifeTotalScore'] > threshold).astype(int)

    # Input feature selection.
    input_features = ['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']
    target_feature = 'target'

    # Impute missing values.
    X = df[input_features].fillna(df[input_features].median())
    y = df[target_feature]

    # 3. Scale the input features.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 4. Split the data into training and testing sets.
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # 5. Build the logistic regression model.
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(X_train.shape[1],)),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    # 6. Compile the model with binary_crossentropy and accuracy as metric.
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # 7. Train the model. (Using 10% of training data for validation.)
    model.fit(X_train, y_train, epochs=50, validation_split=0.1)

    # 8. Evaluate the model.
    loss, accuracy = model.evaluate(X_test, y_test)
    print("Test loss:", loss)
    print("Test accuracy:", accuracy)

    # 9. Save the model and scaler.
    model.save("trained_model.h5")
    joblib.dump(scaler, "scaler.pkl")

if __name__ == "__main__":
    train_model()


Writing train_model.py


Add-on

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Predictions on Model

In [8]:
%%writefile prediction.py
import numpy as np
import tensorflow as tf
import joblib
from db import fetch_housing_data

# Load the trained logistic regression model and scaler.
model = tf.keras.models.load_model("trained_model.h5")
scaler = joblib.load("scaler.pkl")

def predict_house_quality(user_input):
    """
    Predicts the binary house quality class.

    Expected keys in user_input:
      - house_price: corresponds to 'ListedPrice'
      - income: corresponds to 'MeanIncome'
      - beds: corresponds to 'Bedroom'
      - baths: corresponds to 'Bathroom'
      - sq_ft: corresponds to 'Area'
      - population: corresponds to '2022 Population'

    Returns:
      - predicted_class: 0 (low quality) or 1 (high quality)
    """
    data = np.array([[
        user_input["house_price"],
        user_input["income"],
        user_input["beds"],
        user_input["baths"],
        user_input["sq_ft"],
        user_input["population"]
    ]])
    scaled_data = scaler.transform(data)
    probability = model.predict(scaled_data)
    predicted_class = (probability > 0.5).astype("int32")
    return predicted_class[0][0]

def find_closest_match(user_input, df, scaler):
    """
    Finds the closest matching record in the DataFrame to the user inputs based on the training features.

    Parameters:
      - user_input: dictionary with keys "house_price", "income", "beds", "baths", "sq_ft", "population"
      - df: DataFrame containing the housing records with at least the following columns:
            ['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']
      - scaler: a fitted StandardScaler instance used on the training data.

    Returns:
      - A Pandas Series corresponding to the closest matching record.
    """
    input_features = ['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']
    user_vector = np.array([[
        user_input["house_price"],
        user_input["income"],
        user_input["beds"],
        user_input["baths"],
        user_input["sq_ft"],
        user_input["population"]
    ]])

    user_vector_scaled = scaler.transform(user_vector)
    data_features = df[input_features].fillna(df[input_features].median())
    data_features_scaled = scaler.transform(data_features)
    distances = np.linalg.norm(data_features_scaled - user_vector_scaled, axis=1)
    idx = np.argmin(distances)
    closest_match = df.iloc[idx]
    return closest_match


Writing prediction.py


Find closest match

In [9]:
import numpy as np

def find_closest_match(user_input, df, scaler):
    """
    Given a user_input dictionary, a DataFrame df with the housing records,
    and a scaler used for the training data, this function finds the record
    in df that is closest to the user's desired features.

    Parameters:
      - user_input: dict with keys "house_price", "income", "beds",
                    "baths", "sq_ft", "population"
      - df: DataFrame of housing records containing at least the following columns:
            ['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']
            plus location columns like "State", "City".
      - scaler: A StandardScaler fitted on the training data.

    Returns:
      - closest_match: A Pandas Series that represents the record closest to the input.
    """
    # Define the features used for matching (same as used in training)
    input_features = ['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']

    # Construct the user vector from the dictionary
    user_vector = np.array([[user_input["house_price"],
                              user_input["income"],
                              user_input["beds"],
                              user_input["baths"],
                              user_input["sq_ft"],
                              user_input["population"]]])

    # Scale the user input using the same scaler
    user_vector_scaled = scaler.transform(user_vector)

    # Extract the features from df and handle missing values if needed
    data_features = df[input_features].fillna(df[input_features].median())

    # Scale all these feature rows
    data_features_scaled = scaler.transform(data_features)

    # Compute Euclidean distances between the user vector and each row in data_features_scaled
    distances = np.linalg.norm(data_features_scaled - user_vector_scaled, axis=1)

    # Find the index of the closest match
    idx = np.argmin(distances)
    closest_match = df.iloc[idx]

    return closest_match


Model Training Protocol

In [10]:
!python train_model.py

2025-04-24 07:28:51.477290: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745479731.502809    1343 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745479731.510481    1343 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-04-24 07:28:51.536371: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Data loaded from Supabase:
  State        City  ...  QualityOfLifeSafety      id
0    az  phoenix,az  ...                   4

User Input and Output

In [11]:
import numpy as np
import pandas as pd
from db import fetch_housing_data
from prediction import predict_house_quality, find_closest_match, scaler

# Load the full dataset from Supabase.
df = fetch_housing_data()  # Ensure that df is defined.
print("Data loaded:")
print(df.head())

# 1. Collect user input.
house_price = float(input("Enter your desired house price: "))
income = float(input("Enter your income: "))
beds = int(input("Enter number of beds: "))
baths = int(input("Enter number of baths: "))
sq_ft = float(input("Enter square footage: "))
population = float(input("Enter desired population: "))

user_input = {
    "house_price": house_price,
    "income": income,
    "beds": beds,
    "baths": baths,
    "sq_ft": sq_ft,
    "population": population
}

# 2. Get the model prediction.
prediction = predict_house_quality(user_input)
print("Predicted house quality class (0 = low, 1 = high):", prediction)

# 3. Reload the full dataset if needed.
df = fetch_housing_data()  # or load from a CSV if applicable.
print("Full dataset loaded (first 5 rows):")
print(df.head())

# 4. Find the closest matching record.
closest_match = find_closest_match(user_input, df, scaler)

# 5. Print out details from the best match.
print("\nClosest Matching House Record:")
print("State:", closest_match.get("State", "N/A"))
print("City:", closest_match.get("City", "N/A"))
print("Listed Price:", closest_match.get("ListedPrice", "N/A"))
print("Mean Income:", closest_match.get("MeanIncome", "N/A"))
print("Bedrooms:", closest_match.get("Bedroom", "N/A"))
print("Bathrooms:", closest_match.get("Bathroom", "N/A"))
print("Area:", closest_match.get("Area", "N/A"))
print("2022 Population:", closest_match.get("2022 Population", "N/A"))





Data loaded:
  State        City  Bedroom  Bathroom    Area  ListedPrice Temperature  \
0    az  phoenix,az      4.0       3.0  2465.0       980000         Hot   
1    az  phoenix,az      4.0       4.0  3695.0       975000         Hot   
2    az  phoenix,az      4.0       2.0  2581.0       890000         Hot   
3    az  phoenix,az      4.0       2.0  2217.0      1144000         Hot   
4    az  phoenix,az      4.0       3.0  2578.0       850000         Hot   

   2022 Population  2016 Crime Rate  Unemployment  ...  Cost of Living  \
0          4551524            0.032          3.46  ...        82847.38   
1          4551524            0.032          3.46  ...        82847.38   
2          4551524            0.032          3.46  ...        82847.38   
3          4551524            0.032          3.46  ...        82847.38   
4          4551524            0.032          3.46  ...        82847.38   

   AVG C2I  MeanIncome  QualityOfLifeTotalScore  QualityOfLifeQualityOfLife  \
0    105.1  



Predicted house quality class (0 = low, 1 = high): 1
Full dataset loaded (first 5 rows):
  State        City  Bedroom  Bathroom    Area  ListedPrice Temperature  \
0    az  phoenix,az      3.0       1.0  1095.0       300000         Hot   
1    az   tucson,az      NaN       NaN     NaN        65000         Hot   
2    az   tucson,az      3.0       3.0  2645.0       825000         Hot   
3    az   tucson,az      NaN       NaN     NaN        80000         Hot   
4    az   tucson,az      3.0       2.0  1344.0       214500         Hot   

   2022 Population  2016 Crime Rate  Unemployment  ...  Cost of Living  \
0          4551524            0.032          3.46  ...        82847.38   
1          1057597            0.046          3.95  ...        70794.96   
2          1057597            0.046          3.95  ...        70794.96   
3          1057597            0.046          3.95  ...        70794.96   
4          1057597            0.046          3.95  ...        70794.96   

   AVG C2I  Mea



Evaluate Model

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import os
import pandas as pd
from dotenv import load_dotenv
from supabase import create_client, Client

def fetch_housing_data() -> pd.DataFrame:
    load_dotenv()  # Make sure your environment variables are set
    SUPABASE_URL = os.getenv("SUPABASE_URL")
    SUPABASE_KEY = os.getenv("SUPABASE_KEY")
    client: Client = create_client(SUPABASE_URL, SUPABASE_KEY)
    response = client.table("House").select("*").execute()
    data = response.data
    return pd.DataFrame(data)


# Import additional metrics from scikit-learn
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# scripts/train_model.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

def train_model():
    # 1. Fetch data
    df = fetch_housing_data()

    # 2. Define features/targets
    input_features = ['ListedPrice', 'MeanIncome', 'Bedroom', 'Bathroom', 'Area', '2022 Population']
    target_features = ['QualityOfLifeTotalScore', 'Cost of Living', '2016 Crime Rate']

    X = df[input_features].fillna(df[input_features].median())
    y = df[target_features].fillna(df[target_features].median())

    # 3. Scale
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 4. Split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    # 5. Build model
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(len(target_features), input_shape=(X_train.shape[1],))
    ])

    # Change: Since there is only one Dense layer (output),
    # use a single loss function and remove loss_weights
    model.compile(optimizer='adam', loss='mean_squared_error')

    # 6. Train
    model.fit(X_train, y_train, epochs=50, validation_split=0.1)

    # 7. Evaluate
    loss = model.evaluate(X_test, y_test)
    print("Test loss:", loss)

    # 8. Save model & scaler if needed
    model.save("trained_model.h5")
    # Optionally pickle the scaler for predictions
    import joblib
    joblib.dump(scaler, "scaler.pkl")

if __name__ == "__main__":
    train_model()

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 1670638976.0000 - val_loss: 1670643840.0000
Epoch 2/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1670642560.0000 - val_loss: 1670642688.0000
Epoch 3/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1670637568.0000 - val_loss: 1670641408.0000
Epoch 4/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1670641152.0000 - val_loss: 1670640384.0000
Epoch 5/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1670638080.0000 - val_loss: 1670639360.0000
Epoch 6/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1670636288.0000 - val_loss: 1670638336.0000
Epoch 7/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1670633216.0000 - val_loss: 1670637312.0000
Epoch 8/50
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37



Test loss: 1670590976.0
