In [118]:
class KNNClassifier():
  def fit(self,X,y):
    self.X=X
    self.y=y.astype(int)
  def predict(self,X,K,epsilon=1e-8):
    N=len(X)
    y_hat=np.zeros(N)

    for i in range(N):
      dist2 = np.sum((self.X - X[i])**2, axis=1)
      idxt  = np.argsort(dist2)
      gamma_k = 1/(np.sqrt(dist2[idxt]+epsilon))
      y_hat[i] = np.bincount(self.y[idxt], weights=gamma_k).argmax()
    return y_hat

In [119]:
def price_sq(x):
  #if div by 20 then x 50
  return x//50

In [120]:
def accuracy(y,y_hat):
  return np.mean(y==y_hat)

In [121]:
# -------------------------------------------------------------
# 1. Imports
# -------------------------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Drive (Colab)
from google.colab import drive
drive.mount('/content/drive')

# -------------------------------------------------------------
# 2. Load data
# -------------------------------------------------------------
df = pd.read_csv("/content/drive/MyDrive/ds/cleaned_final.csv")

# -------------------------------------------------------------
# 3. Basic cleaning
# -------------------------------------------------------------
df = df.drop(columns=["MLS"])

df['kitchen_features'] = df['kitchen_features'].fillna("Unknown")
df['floor_covering'] = df['floor_covering'].fillna("Unknown")

df['fireplaces'] = df['fireplaces'].apply(lambda x: 0 if str(x).lower() in ["none", "0"] else 1)

# -------------------------------------------------------------
# 4. Define features and target
# -------------------------------------------------------------
target = "cat"

numeric_features = ["longitude", "latitude"]

df['price/sqrft'] = round(df['sold_price']/df['sqrt_ft'], 2)

X = df[numeric_features]
y = df[target] = price_sq(df['price/sqrft'])

# -------------------------------------------------------------
# 5. Train-test split
# -------------------------------------------------------------
X_train, X_test = X.iloc[:4000], X.iloc[4000:]
y_train, y_test = y.iloc[:4000], y.iloc[4000:]

# -------------------------------------------------------------
# 6. Hybrid Scaler: log1p + robust + standard (pure Pandas)
# -------------------------------------------------------------
X_train = pd.DataFrame(X_train, columns=numeric_features, index=X.iloc[:4000].index)
X_test  = pd.DataFrame(X_test,  columns=numeric_features, index=X.iloc[4000:].index)

# -------------------- STEP 1: Detect skewed positive columns --------------------
# skew_vals = X_train.skew()
# log_cols = [c for c in X_train.columns if abs(skew_vals[c]) > 1 and X_train[c].min() >= 0]

# print("Log-transform columns:", log_cols)

# # Apply log1p to train & test on detected columns
# X_train_h = X_train.copy()
# X_test_h  = X_test.copy()

# if log_cols:
#     X_train_h[log_cols] = np.log1p(X_train_h[log_cols])
#     X_test_h[log_cols]  = np.log1p(X_test_h[log_cols])

# # -------------------- STEP 2: Detect outlier-heavy columns (for robust scaling) --------------------
# # A big IQR suggests strong outliers
# iqr_vals = X_train_h.quantile(0.75) - X_train_h.quantile(0.25)
# iqr_threshold = iqr_vals.median() * 2   # adaptive threshold

# robust_cols = [c for c in X_train_h.columns if iqr_vals[c] > iqr_threshold]

# print("Robust-scale columns:", robust_cols)

# # The remaining columns will be standard scaled
# standard_cols = [c for c in X_train_h.columns if c not in robust_cols]
# print("Standard-scale columns:", standard_cols)

# # -------------------- STEP 3A: Fit robust scaler on TRAIN only --------------------
# median_vals = X_train_h[robust_cols].median()
# iqr_vals_robust = (X_train_h[robust_cols].quantile(0.75) -
#                     X_train_h[robust_cols].quantile(0.25)).replace(0, 1.0)

# # Apply robust scaling
# X_train_h[robust_cols] = (X_train_h[robust_cols] - median_vals) / iqr_vals_robust
# X_test_h[robust_cols]  = (X_test_h[robust_cols]  - median_vals) / iqr_vals_robust

# # -------------------- STEP 3B: Fit standard scaler on TRAIN only --------------------
# mean_vals = X_train_h[standard_cols].mean()
# std_vals  = X_train_h[standard_cols].std().replace(0, 1.0)

# # Apply standard scaling
# X_train_h[standard_cols] = (X_train_h[standard_cols] - mean_vals) / std_vals
# X_test_h[standard_cols]  = (X_test_h[standard_cols]  - mean_vals) / std_vals

# # -------------------- Final scaled training/test sets --------------------
# X_train = X_train_h
# X_test  = X_test_h

# -------------------------------------------------------------
# 7. Convert to numpy (as your KNN requires)
# -------------------------------------------------------------
X_train = X_train.values
X_test = X_test.values
y_train = y_train.values
y_test = y_test.values

# -------------------------------------------------------------
# 8. Fit and predict
# -------------------------------------------------------------
knn_instance = KNNClassifier()
knn_instance.fit(X_train, y_train)

y_pred = knn_instance.predict(X_test, K=5)
mape = np.mean(np.abs((y_test-y_pred)/y_test*100))
accuracy=100-mape
print(accuracy)
# ---------------------a----------------------------------------
# 9. Evaluate
# -------------------------------------------------------------
# print("Model Accuracy: {:.2f}%".format(accuracy(y_test, y_pred) * 100))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
74.52880952380953
