In [None]:

# Quick import + checks
required = ["pandas","numpy"]
import importlib, sys
missing = [p for p in required if importlib.util.find_spec(p) is None]
if missing:
    print("Missing packages:", missing)
    print("Install in notebook: %pip install " + " ".join(missing) + " (or run in terminal)")
    raise SystemExit("Install missing packages and re-run")

import pandas as pd, numpy as np

# robust file lookup (try data_path if present, common variants, and a glob search)
import os, glob

candidates = []
if 'data_path' in globals() and data_path:
    candidates.append(data_path)
# common path variants
candidates += [
    "../data/house_price.csv",
]
# search project for any csvs with 'house' in the name
candidates += glob.glob("**/*house*.csv", recursive=True)

# dedupe keeping order
seen = set()
candidates = [p for p in candidates if not (p in seen or seen.add(p))]

found = next((p for p in candidates if os.path.exists(p)), None)
if found is None:
    raise FileNotFoundError(f"No house_price csv found. Searched candidates: {candidates}")

path = found
print("Using dataset path:", path)

# read with fallback encoding
try:
    df = pd.read_csv(path, low_memory=False, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(path, low_memory=False, encoding="latin1")

# normalize column names and canonicalize geo cols
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")
if "longitude" in df.columns and "lng" not in df.columns:
    df["lng"] = pd.to_numeric(df["longitude"].astype(str).str.replace(r"[^0-9.-]", "", regex=True), errors="coerce")
if "latitude" in df.columns and "lat" not in df.columns:
    df["lat"] = pd.to_numeric(df["latitude"].astype(str).str.replace(r"[^0-9.-]", "", regex=True), errors="coerce")

print("Loaded:", path, "shape:", df.shape)
print("Geo columns present:", [c for c in ["lng","lat","longitude","latitude"] if c in df.columns])
df.head()


Using dataset path: ../data/house_price.csv
Loaded: ../data/house_price.csv shape: (318851, 26)
Geo columns present: ['lng', 'lat']


Unnamed: 0,url,id,lng,lat,cid,tradetime,dom,followers,totalprice,price,...,buildingtype,constructiontime,renovationcondition,buildingstructure,ladderratio,elevator,fiveyearsproperty,subway,district,communityaverage
0,https://bj.lianjia.com/chengjiao/101084782030....,101084782030,116.475489,40.01952,1111027376244,2016-08-09,1464.0,106,415.0,31680,...,1.0,2005,3,6,0.217,1.0,0.0,1.0,7,56021.0
1,https://bj.lianjia.com/chengjiao/101086012217....,101086012217,116.453917,39.881534,1111027381879,2016-07-28,903.0,126,575.0,43436,...,1.0,2004,4,6,0.667,1.0,1.0,0.0,7,71539.0
2,https://bj.lianjia.com/chengjiao/101086041636....,101086041636,116.561978,39.877145,1111040862969,2016-12-11,1271.0,48,1030.0,52021,...,4.0,2005,3,6,0.5,1.0,0.0,0.0,7,48160.0
3,https://bj.lianjia.com/chengjiao/101086406841....,101086406841,116.43801,40.076114,1111043185817,2016-09-30,965.0,138,297.5,22202,...,1.0,2008,1,6,0.273,1.0,0.0,0.0,6,51238.0
4,https://bj.lianjia.com/chengjiao/101086920653....,101086920653,116.428392,39.886229,1111027381174,2016-08-28,927.0,286,392.0,48396,...,4.0,1960,2,2,0.333,0.0,1.0,1.0,1,62588.0


In [11]:

# Stage 1: Basic preprocessing
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
# drop obvious irrelevant columns if they exist
drop_cols = [c for c in ['url','id','cid','link'] if c in df.columns]
if drop_cols:
    df = df.drop(columns=drop_cols)
# canonicalize geo column names and preserve latitude/longitude
# create canonical `lng`/`lat` if raw columns exist (keep numeric)
if 'longitude' in df.columns and 'lng' not in df.columns:
    df['lng'] = pd.to_numeric(df['longitude'].astype(str).str.replace(r'[^0-9.-]', '', regex=True), errors='coerce')
if 'latitude' in df.columns and 'lat' not in df.columns:
    df['lat'] = pd.to_numeric(df['latitude'].astype(str).str.replace(r'[^0-9.-]', '', regex=True), errors='coerce')
# list of geo columns we want to keep
geo_cols = [c for c in ['lng','lat','longitude','latitude'] if c in df.columns]
# convert common price/area columns to numeric (include geo cols if present)
def to_numeric_col(s):
    return pd.to_numeric(s.astype(str).str.replace(r'[^0-9.-]', '', regex=True), errors='coerce')

for col in ['totalprice','price','square','area','buildingarea','lng','lat','longitude','latitude']:
    if col in df.columns:
        df[col] = to_numeric_col(df[col])


# create binary target using median totalPrice if available (fall back to price)

target_col = 'totalprice' if 'totalprice' in df.columns else ('price' if 'price' in df.columns else None)
if target_col is None:
    raise RuntimeError("No price column found. Add totalPrice or price to proceed.")
median_price = df[target_col].median()
df['target'] = (df[target_col] >= median_price).astype(int)
print("Target defined using:", target_col, "median:", median_price)
# confirm geo columns present (if any)
print("Geo columns preserved:", geo_cols)

Target defined using: totalprice median: 294.0
Geo columns preserved: ['lng', 'lat']


In [12]:

# Stage 2: Missing values, dtypes, derived features
# drop exact duplicates
df = df.drop_duplicates().reset_index(drop=True)

# impute numeric columns with median, categorical with mode
num_cols = df.select_dtypes(include=['number']).columns.tolist()
# exclude geo columns from the generic numeric imputation (handle them separately)
num_impute_cols = [c for c in num_cols if c not in geo_cols]
# exclude geo columns from categorical imputation too
cat_cols = df.select_dtypes(include=['object','category']).columns.difference(['address','title'] + geo_cols).tolist()

for c in num_impute_cols:
    if df[c].isna().any():
        df[c].fillna(df[c].median(), inplace=True)
# For geo cols, fill small gaps with median to avoid row loss in downstream steps (optional)
for c in geo_cols:
    if c in df.columns and df[c].isna().any():
        df[c].fillna(df[c].median(), inplace=True)

for c in cat_cols:
    if df[c].isna().any():
        df[c].fillna(df[c].mode().iloc[0] if not df[c].mode().empty else "missing", inplace=True)

# datetime parsing example (if present)
for date_col in ['publish_time','listing_date','built_year']:
    if date_col in df.columns:
        try:
            df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        except Exception:
            pass

# derived features (defensive checks)
if 'square' in df.columns and target_col in df.columns:
    df['price_per_sqm'] = df[target_col] / df['square']
    df['log_price'] = (df[target_col].clip(lower=1)).apply(np.log)

# age of building (if build year or date available)
if 'built_year' in df.columns and pd.api.types.is_datetime64_any_dtype(df['built_year']):
    df['building_age'] = pd.Timestamp.now().year - df['built_year'].dt.year
elif 'built_year' in df.columns:
    df['building_age'] = pd.to_numeric(df['built_year'], errors='coerce').apply(lambda x: pd.Timestamp.now().year - x if pd.notna(x) else np.nan)
    df['building_age'].fillna(df['building_age'].median(), inplace=True)

print("After imputation shape:", df.shape)
print("Geo columns after imputation:", {c: df[c].isna().sum() for c in geo_cols if c in df.columns})

After imputation shape: (318825, 26)
Geo columns after imputation: {'lng': 0, 'lat': 0}


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(df[c].median(), inplace=True)


In [13]:

# Stage 3: Outliers and encoding
num_cols = df.select_dtypes(include=['number']).columns.tolist()
# exclude target columns and geo columns from clipping
num_cols = [c for c in num_cols if c not in [target_col,'target'] + geo_cols]
for c in num_cols:
    qlow, qhigh = df[c].quantile(0.01), df[c].quantile(0.99)
    if pd.notna(qlow) and pd.notna(qhigh) and qlow < qhigh:
        df[c] = df[c].clip(qlow, qhigh)

# select top categorical cols by cardinality (small ones for get_dummies)
# ensure geo columns aren't treated as categoricals
cat_cols = [c for c in df.select_dtypes(include=['object','category']).nunique().sort_values().index.tolist() if c not in geo_cols]
cat_to_encode = [c for c in cat_cols if df[c].nunique() <= 20][:6]  # keep up to 6 small-cardinal cols
if cat_to_encode:
    df = pd.get_dummies(df, columns=cat_to_encode, drop_first=True)

print("Columns after encoding:", len(df.columns))


Columns after encoding: 51


In [17]:

# Stage 4: Feature selection (top 10)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np

# prepare feature matrix
features = df.select_dtypes(include=[np.number]).drop(columns=[target_col,'target'], errors='ignore').columns.tolist()

# corr ranking (abs correlation with continuous price if present)
corr_rank = pd.Series(0, index=features)
if target_col in df.columns:
    corr_vals = df[features + [target_col]].corr()[target_col].abs().drop(target_col)
    corr_rank = corr_vals.rank(ascending=False)

# RandomForest feature importance on binary target
X = df[features].fillna(0)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
imp = pd.Series(rf.feature_importances_, index=features)
imp_rank = imp.rank(ascending=False)

# combined rank average
combined_rank = (corr_rank.rank() + imp_rank.rank()) / 2

# initial top candidates
top10 = combined_rank.sort_values().head(10).index.tolist()

# ensure longitude/latitude are included among the top 10
geo_required = [c for c in ['lng','lat'] if c in df.columns]
for g in geo_required:
    if g not in top10 and g in combined_rank.index:
        top10.append(g)

# if we added geo cols and exceeded 10, drop the worst-ranked features among the current selection
if len(top10) > 10:
    ranks = combined_rank.loc[top10]
    to_drop = ranks.sort_values(ascending=False).head(len(top10) - 10).index.tolist()
    top10 = [f for f in top10 if f not in to_drop]

print("Top 10 features selected :", top10)

# create final dataframe with those features + target, but keep longitude/latitude if present
geo_cols = [c for c in ['lng','lat','longitude','latitude'] if c in df.columns]
final_cols = top10.copy()
for c in geo_cols:
    if c not in final_cols:
        final_cols.append(c)
# add target columns
if target_col in df.columns:
    final_cols += [target_col, 'target']
else:
    final_cols += ['target']
final_df = df[final_cols].copy()
print("Included geo columns (if present):", geo_cols)

# save (ensure directory exists) — save explicitly into repo `data/` folder
out_dir = os.path.abspath(os.path.join(os.getcwd(), "data"))
os.makedirs(out_dir, exist_ok=True)
out_path = os.path.join(out_dir, "house_price_top10.csv")
final_df.to_csv(out_path, index=False)
print("Saved final dataset to:", out_path, "shape:", final_df.shape)
# quick verification
import os as _os
print("File exists:", _os.path.exists(out_path), "size(bytes):", _os.path.getsize(out_path) if _os.path.exists(out_path) else "n/a")

Top 10 features selected : ['log_price', 'price_per_sqm', 'price', 'square', 'communityaverage', 'dom', 'ladderratio', 'renovationcondition', 'followers', 'elevator']
Included geo columns (if present): ['lng', 'lat']
Saved final dataset to: c:\Year 4\Quantum\Quantum_final_project\classical_ml\data\house_price_top10.csv shape: (318825, 14)
File exists: True size(bytes): 33766585
