In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score,mean_absolute_error,confusion_matrix,mean_squared_error
from sklearn.linear_model import LinearRegression

# Location of the raw CSV, relative to the notebook's working directory.
raw_csv_path = "../dataset/king_ country_ houses_aa.csv"

# Read the file into the master DataFrame used by the rest of the notebook,
# then leave it as the last expression so the cell renders it.
df_king_house_data = pd.read_csv(raw_csv_path)
df_king_house_data

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes).
df_king_house_data.info()

In [None]:
# Number of missing values per column (isna is the canonical name of isnull).
df_king_house_data.isna().sum()

In [None]:
# Show the dtype of every column.
df_king_house_data.dtypes

In [None]:
# Count the rows whose "id" already appeared earlier in the frame.
df_king_house_data.duplicated(subset="id").sum()

In [None]:
# Flag every row whose "id" repeats an earlier row, then keep just the id column.
is_repeat_id = df_king_house_data["id"].duplicated()
dup_ids = df_king_house_data.loc[is_repeat_id, "id"]

# Report each duplicated id once.
print("Duplicated IDs:")
print(dup_ids.unique())


In [None]:
# keep=False marks *every* member of a duplicate group, not only the repeats,
# so all rows that share an id with another row are shown.
shares_id = df_king_house_data["id"].duplicated(keep=False)
dup_rows = df_king_house_data.loc[shares_id]
dup_rows

In [None]:
# Parse the "date" column into pandas datetimes, overwriting the column in place,
# so it can be sorted chronologically and used with the .dt accessor later on.
df_king_house_data["date"] = pd.to_datetime(df_king_house_data["date"])
df_king_house_data["date"]

In [None]:
# De-duplicate by id: order rows chronologically and, for each id, keep only
# the row with the latest date.
df_clean = (
    df_king_house_data
    .sort_values("date")
    .drop_duplicates(subset="id", keep="last")
)
df_clean

In [None]:
# Pairwise correlations between the remaining columns (identifier and date excluded).
corr_all = df_king_house_data.drop(columns=["id", "date"]).corr()

# Boolean upper-triangle mask (diagonal included) so each pair is drawn only once.
mask = np.triu(np.ones(corr_all.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_all,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    ax=ax,
)
ax.set_title("Feature correlations")
plt.show()

In [None]:
# Draw the price histogram twice: first on a linear x-axis (log_scale=None is
# seaborn's default), then on a logarithmic one.
for use_log, plot_title, x_label in (
    (None, "House Price Distribution", "Price"),
    (True, "House Price Distribution (log scale)", "Price (log)"),
):
    plt.figure(figsize=(6, 4))
    sns.histplot(df_clean["price"], log_scale=use_log)
    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel("Count")
    plt.show()

Linear regression (baseline model)

In [None]:
# Target and feature matrix for the original (non de-duplicated) frame:
# "price" is the target; "id" and "date" are left out of the predictors.
y_original = df_king_house_data["price"]
X_original = df_king_house_data.drop(columns=["price", "id", "date"])

In [None]:
# Target and feature matrix for the de-duplicated frame:
# "price" is the target; "id" and "date" are left out of the predictors.
y_clean = df_clean["price"]
X_clean = df_clean.drop(columns=["price", "id", "date"])

In [None]:
# 80/20 train/test split of the original data with a fixed seed.
(X_train_lr, X_test_lr,
 y_train_lr, y_test_lr) = train_test_split(
    X_original,
    y_original,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit ordinary least squares on the training split (fit returns the estimator itself).
linear_model = LinearRegression().fit(X_train_lr, y_train_lr)

# Predictions on both splits, used to compare train vs. test fit.
y_train_pred_lr = linear_model.predict(X_train_lr)
y_test_pred_lr = linear_model.predict(X_test_lr)

In [None]:
# Report test and train R² for the linear model fitted on the original data.
# r2_score expects (y_true, y_pred); R² is not symmetric, so the observed prices
# go first and the model's predictions second.
print("Linear Regression model test R2 score \n",r2_score(y_test_lr,y_test_pred_lr),"\n"
      "Linear Regression model train R2 score \n",r2_score(y_train_lr,y_train_pred_lr))



In [None]:
# 80/20 train/test split of the de-duplicated data with a fixed seed.
(X_train_cl_lr, X_test_cl_lr,
 y_train_cl_lr, y_test_cl_lr) = train_test_split(
    X_clean,
    y_clean,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Refit ordinary least squares on the de-duplicated training split; this rebinds
# linear_model, replacing the estimator fitted on the original data.
linear_model = LinearRegression().fit(X_train_cl_lr, y_train_cl_lr)

# Predictions on both splits, used to compare train vs. test fit.
y_train_pred_cl_lr = linear_model.predict(X_train_cl_lr)
y_test_pred_cl_lr = linear_model.predict(X_test_cl_lr)

In [None]:
# Report test and train R² for the linear model fitted on the de-duplicated data.
# r2_score expects (y_true, y_pred); R² is not symmetric, so the observed prices
# go first and the model's predictions second.
print("Linear Regression model test R2 score (cleandf)\n",r2_score(y_test_cl_lr,y_test_pred_cl_lr),"\n"
      "Linear Regression model train R2 score (cleandf)\n",r2_score(y_train_cl_lr,y_train_pred_cl_lr))

Looping over all the models

In [None]:
# Benchmark every candidate regressor on one 80/20 split of the original
# (non de-duplicated) data, collecting test R², train R² and test RMSE per model.
X_train, X_test, y_train, y_test = train_test_split(X_original, y_original, test_size=0.2, random_state=42)

models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "Decision Tree": DecisionTreeRegressor(max_depth=10, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=120, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(objective="reg:squarederror", random_state=42),
    "LightGBM": LGBMRegressor(random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
}

results = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_train_pred = model.predict(X_train)

    # r2_score takes (y_true, y_pred); the order matters because R² is not
    # symmetric, so the observed prices always go first.
    r2_test = r2_score(y_test, y_pred)
    r2_train = r2_score(y_train, y_train_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    results.append({"Model": name, "R²_test": r2_test, "R²_train": r2_train, "RMSE": rmse})

# Best test R² first.
results_df = pd.DataFrame(results).sort_values(by="R²_test", ascending=False)

print(results_df)
plt.figure(figsize=(10,6))
plt.barh(results_df["Model"], results_df["R²_test"], color="skyblue")
plt.xlabel("R² Score")
plt.title("Model Comparison on King County House Prices")
# barh draws the first row at the bottom; invert so the best model is on top.
plt.gca().invert_yaxis()
plt.show()


In [None]:
# Benchmark the candidate regressors on the de-duplicated data (df_clean).
y_clean = df_clean["price"]
X_clean = df_clean.drop(columns=["price", "id", "date"])

X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "Decision Tree": DecisionTreeRegressor(max_depth=10, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(objective="reg:squarederror", random_state=42),
    "LightGBM": LGBMRegressor(verbose=0, random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
}

# One record per model: held-out R² and RMSE.
results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    r2 = r2_score(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    results.append({"Model": name, "R²": r2, "RMSE": rmse})

# Best R² first.
results_df = pd.DataFrame(results).sort_values("R²", ascending=False)
print(results_df)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(results_df["Model"], results_df["R²"], color="skyblue")
ax.set_xlabel("R² Score")
ax.set_title("Model Comparison on King County House Prices")
# barh draws the first row at the bottom; invert so the best model is on top.
ax.invert_yaxis()
plt.show()


In [None]:
# Derive the calendar month of each row from the parsed "date" column.
# This adds a column to df_king_house_data in place, so feature matrices built
# from this frame after this cell (dropping only price/id/date) include "month".
df_king_house_data["month"] = df_king_house_data["date"].dt.month
df_king_house_data["month"]

In [None]:
# Count how many rows fall in each month.
df_king_house_data["month"].value_counts()

In [None]:
# Rebuild target and features from the master frame (price is the target;
# id and date are excluded), then hold out 20% for testing.
y_data = df_king_house_data["price"]
X_data = df_king_house_data.drop(columns=["price", "id", "date"])
X_train_xgb, X_test_xgb, y_train_xgb, y_test_xgb = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

# Fit XGBoost on the training split; the trailing fit call is the cell's output.
model = XGBRegressor(objective="reg:squarederror", random_state=42)
model.fit(X_train_xgb, y_train_xgb)

In [None]:
# Score the XGBoost model on the held-out split.
y_xgb_test_pred = model.predict(X_test_xgb)
# r2_score expects (y_true, y_pred); R² is not symmetric, so the observed
# prices go first and the predictions second.
print(r2_score(y_test_xgb, y_xgb_test_pred))

In [None]:
# Score the XGBoost model on its own training split (to compare with the test score).
y_xgb_train_pred = model.predict(X_train_xgb)
# r2_score expects (y_true, y_pred); R² is not symmetric, so the observed
# prices go first and the predictions second.
print(r2_score(y_train_xgb, y_xgb_train_pred))

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# Target and features for the AdaBoost experiment (price is the target;
# id and date are excluded), with the same seeded 80/20 split as before.
y_data = df_king_house_data["price"]
X_data = df_king_house_data.drop(columns=["price", "id", "date"])
X_train_ada, X_test_ada, y_train_ada, y_test_ada = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

In [None]:
# List the feature column names that make up X_data.
X_data.columns

In [None]:
# AdaBoost with 250 estimators and a small learning rate.
# random_state is fixed because AdaBoostRegressor is stochastic; without a seed
# the scores and feature importances below change from run to run, unlike every
# other seeded model in this notebook.
abc = AdaBoostRegressor(n_estimators=250, learning_rate=0.05, random_state=42)
# fit returns the estimator itself, so `model` and `abc` refer to the same object.
model = abc.fit(X_train_ada, y_train_ada)

In [None]:
# AdaBoost predictions for both splits, used for the train/test R² comparison.
y_train_pred_ada_data = model.predict(X_train_ada)
y_test_pred_ada_data = model.predict(X_test_ada)

In [None]:
# Print train R² then test R² for AdaBoost; r2_score receives (y_true, y_pred).
for label, y_true, y_hat in (
    ("R2 ADAregressor Train:", y_train_ada, y_train_pred_ada_data),
    ("R2 ADAregressor Test:", y_test_ada, y_test_pred_ada_data),
):
    print(label, r2_score(y_true, y_hat))

In [None]:
# Pair each feature name with the fitted model's importance, most important first.
feature_importances = (
    pd.DataFrame(
        {
            "Feature": X_data.columns,
            "Importance": model.feature_importances_,
        }
    )
    .sort_values(by="Importance", ascending=False)
)

print(feature_importances)

In [None]:
# Total (summed) price for each "grade" value.
# NOTE(review): a sum grows with the number of rows in each grade — confirm a
# total, rather than a mean/median, is what is wanted here.
df_groupby = df_king_house_data.groupby("grade")["price"].sum()

In [None]:
# Display the per-grade price totals.
df_groupby

In [None]:
# Display the master frame in its current state.
df_king_house_data

In [None]:
# Final target and feature set (price is the target; id and date are excluded),
# with a seeded 80/20 train/test split.
y_final = df_king_house_data["price"]
x_final = df_king_house_data.drop(columns=["price", "date", "id"])
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    x_final,
    y_final,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Fit the final XGBoost model and report its held-out R².
# NOTE(review): random_state=4 differs from the 42 used elsewhere — confirm it is intentional.
xgb_final = XGBRegressor(objective="reg:squarederror", random_state=4)
# Train on the training split only: fitting on the full x_final/y_final would
# let the model see the held-out rows and inflate the test score below.
xgb_final.fit(X_train_final, y_train_final)
y_final_test_pred_xgb = xgb_final.predict(X_test_final)
# r2_score expects (y_true, y_pred); R² is not symmetric.
print(r2_score(y_test_final, y_final_test_pred_xgb))

In [None]:
# Pair each feature name with the XGBoost importance, most important first.
feature_importances = (
    pd.DataFrame(
        {
            "Feature": x_final.columns,
            "Importance": xgb_final.feature_importances_,
        }
    )
    .sort_values(by="Importance", ascending=False)
)

print(feature_importances)

In [None]:
# Benchmark every candidate regressor on the final feature set using a 90/10
# split, collecting test R², train R² and test RMSE per model.
X_train_final, X_test_final, y_train_final, y_test_final = train_test_split(
    x_final, y_final, test_size=0.1, random_state=42
)

models = {
    "Linear Regression": LinearRegression(),
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "Decision Tree": DecisionTreeRegressor(max_depth=10, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "XGBoost": XGBRegressor(objective="reg:squarederror", random_state=42),
    "LightGBM": LGBMRegressor(verbose=0, random_state=42),
    "CatBoost": CatBoostRegressor(verbose=0, random_state=42),
}

results_final = []

for name, model in models.items():
    model.fit(X_train_final, y_train_final)
    y_pred_test_final = model.predict(X_test_final)
    y_pred_train_final = model.predict(X_train_final)

    # r2_score takes (y_true, y_pred); the order matters because R² is not
    # symmetric, so the observed prices always go first.
    r2_test = r2_score(y_test_final, y_pred_test_final)
    r2_train = r2_score(y_train_final, y_pred_train_final)
    rmse = np.sqrt(mean_squared_error(y_test_final, y_pred_test_final))

    results_final.append({"Model": name, "R²_test": r2_test, "R²_train": r2_train, "RMSE": rmse})

# Best test R² first.
results_df_final = pd.DataFrame(results_final).sort_values(by="R²_test", ascending=False)

print(results_df_final)
plt.figure(figsize=(10,6))
# Plot this cell's own results; results_df belongs to an earlier comparison and
# has no "R²_test" column after the de-duplicated run.
plt.barh(results_df_final["Model"], results_df_final["R²_test"], color="skyblue")
plt.xlabel("R² Score")
plt.title("Model Comparison on King County House Prices")
# barh draws the first row at the bottom; invert so the best model is on top.
plt.gca().invert_yaxis()
plt.show()

In [None]:
# Fit a stand-alone CatBoost model (silent, seeded) on the final training split.
# The trailing fit call is the cell's last expression, so its return value is displayed.
cat = CatBoostRegressor(verbose=0,random_state=42)
cat.fit(X_train_final,y_train_final)


In [None]:
# Pair each feature name with the CatBoost importance, most important first.
feature_importances = (
    pd.DataFrame(
        {
            "Feature": x_final.columns,
            "Importance": cat.feature_importances_,
        }
    )
    .sort_values(by="Importance", ascending=False)
)

print(feature_importances)