<a href="https://colab.research.google.com/github/karandeep7/House_Price_Preditor/blob/main/House_Pricing_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import zipfile
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
from joblib import Parallel, delayed

In [7]:
# Download the dataset using Kaggle API
dataset_name = "ahmedshahriarsakib/usa-real-estate-dataset"
output_path = "../data/"

# Create the directory the archive is actually downloaded into, so the
# download and the later unzip/read steps all share one location.
os.makedirs(output_path, exist_ok=True)

# Construct the Kaggle command
download_command = f"kaggle datasets download -d {dataset_name} -p {output_path}"

# Execute the command and keep its exit status: a non-zero value means the
# download did not succeed (for example the CLI could not be run), which would
# otherwise only surface later as a missing file.
exit_status = os.system(download_command)
if exit_status != 0:
    print(f"WARNING: '{download_command}' exited with status {exit_status}; the dataset may not have been downloaded.")

# Show the exit status as the cell output (0 means success).
exit_status


0

In [8]:
# The Kaggle CLI names the archive after the dataset slug (the part after "/").
zip_path = os.path.join(output_path, f"{dataset_name.split('/')[-1]}.zip")
if os.path.exists(zip_path):
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(output_path)
    os.remove(zip_path)  # Remove the ZIP file after extraction
    # Only report success when an archive was really extracted.
    print("Dataset downloaded and unzipped to 'data/' folder.")
else:
    # Nothing to extract: either the download failed or the archive was
    # already extracted and removed by a previous run of this cell.
    print(f"No archive found at '{zip_path}'; nothing was extracted.")

Dataset downloaded and unzipped to 'data/' folder.


In [9]:
# Read the extracted CSV into a DataFrame
file_path = "../data/realtor-data.zip.csv"  # Update with the actual file name
df = pd.read_csv(filepath_or_buffer=file_path)

# Sanity check: overall dimensions followed by the first five rows
print("Dataset Shape:", df.shape)
print(df.head(5))

Dataset Shape: (2226382, 12)
   brokered_by    status     price  bed  bath  acre_lot     street  \
0     103378.0  for_sale  105000.0  3.0   2.0      0.12  1962661.0   
1      52707.0  for_sale   80000.0  4.0   2.0      0.08  1902874.0   
2     103379.0  for_sale   67000.0  2.0   1.0      0.15  1404990.0   
3      31239.0  for_sale  145000.0  4.0   2.0      0.10  1947675.0   
4      34632.0  for_sale   65000.0  6.0   2.0      0.05   331151.0   

         city        state  zip_code  house_size prev_sold_date  
0    Adjuntas  Puerto Rico     601.0       920.0            NaN  
1    Adjuntas  Puerto Rico     601.0      1527.0            NaN  
2  Juana Diaz  Puerto Rico     795.0       748.0            NaN  
3       Ponce  Puerto Rico     731.0      1800.0            NaN  
4    Mayaguez  Puerto Rico     680.0         NaN            NaN  


In [10]:
# Count the NaN entries in every column and report them
missing_per_column = df.isna().sum()
print("Missing Values:\n", missing_per_column)

Missing Values:
 brokered_by         4533
status                 0
price               1541
bed               481317
bath              511771
acre_lot          325589
street             10866
city                1407
state                  8
zip_code             299
house_size        568484
prev_sold_date    734297
dtype: int64


In [11]:
# Step 4: Drop Non-Critical Columns
columns_to_drop = ['brokered_by', 'street', 'prev_sold_date']
# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise a KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

# Check the updated dataset
print("Updated Dataset Shape:", df.shape)
print("Remaining Columns:", df.columns)

Updated Dataset Shape: (2226382, 9)
Remaining Columns: Index(['status', 'price', 'bed', 'bath', 'acre_lot', 'city', 'state',
       'zip_code', 'house_size'],
      dtype='object')


In [None]:
# 1. Identify important and non-important fields
important_numeric_cols = ['bed', 'bath', 'house_size']  # Most important numeric fields
# 'street' is deliberately not listed here: it is removed by the
# "Drop Non-Critical Columns" cell, so selecting it from df would raise a KeyError.
# NOTE(review): with a single column there are no other features to regress
# on, so the iterative imputer presumably falls back to its initial fill for
# 'acre_lot' - confirm this is acceptable.
other_numeric_cols = ['acre_lot']  # Less important numeric fields

important_categorical_cols = ['status', 'city', 'zip_code']  # Most important categorical fields
non_important_categorical_col = ['state']  # Less important categorical fields

# 2. Encode categorical columns
# NOTE(review): astype(str) turns missing values into the literal string
# 'nan', which LabelEncoder then treats as an ordinary class. After this loop
# the encoded columns hold integer codes with no NaN left, so the categorical
# imputation steps further down may have nothing to fill - verify this is intended.
label_encoders = {}
print("Encoding categorical columns...")
for col in tqdm(important_categorical_cols + non_important_categorical_col, desc="Encoding Categorical Columns"):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # Save encoders for later decoding

# 3. KNN Imputation for the most important numeric fields (Parallelized)
def knn_impute_chunk(chunk, cols, n_neighbors=5):
    """Fill missing values in selected columns of one chunk with a KNN imputer.

    A fresh ``KNNImputer`` is fitted on the chunk itself, so neighbours are
    only searched among the rows of that chunk.

    Args:
        chunk: Table-like object that supports ``chunk[cols]`` column selection.
        cols: Column labels to select and impute.
        n_neighbors: Number of neighbours passed to ``KNNImputer``.

    Returns:
        Whatever ``KNNImputer.fit_transform`` returns for the selected columns.
    """
    selected = chunk[cols]
    return KNNImputer(n_neighbors=n_neighbors).fit_transform(selected)

# Cut the frame into consecutive row blocks so each block can be imputed by
# its own worker.
# NOTE(review): KNN imputation compares rows against each other, so chunks of
# this size may be very slow - confirm the run time is acceptable.
chunk_size = 500000
row_starts = range(0, df.shape[0], chunk_size)
chunks = [df.iloc[start:start + chunk_size] for start in row_starts]

print("\nImputing missing values for important numeric fields (KNN)...")
numeric_jobs = (delayed(knn_impute_chunk)(part, important_numeric_cols) for part in chunks)
knn_imputed_chunks = Parallel(n_jobs=-1)(numeric_jobs)

# Write every imputed block back over the rows it was taken from
numeric_col_positions = df.columns.get_indexer(important_numeric_cols)
for start, imputed in zip(row_starts, knn_imputed_chunks):
    df.iloc[start:start + chunk_size, numeric_col_positions] = imputed

# 4. Iterative Imputation for the other numeric fields
print("\nImputing missing values for other numeric fields (Iterative)...")
# random_state is fixed so repeated runs use the same seed
iterative_imputer = IterativeImputer(max_iter=10, random_state=0)
other_numeric_filled = iterative_imputer.fit_transform(df[other_numeric_cols])
df[other_numeric_cols] = other_numeric_filled

# 5. KNN Imputation for important categorical fields
# NOTE(review): these columns were label-encoded from their string form
# earlier in this cell, so they may no longer contain NaN - verify the imputer
# has anything to fill here.
print("\nImputing missing values for important categorical fields (KNN)...")
categorical_jobs = (delayed(knn_impute_chunk)(part, important_categorical_cols) for part in chunks)
categorical_knn_imputed_chunks = Parallel(n_jobs=-1)(categorical_jobs)

# Write every imputed block back over the rows it was taken from
categorical_col_positions = df.columns.get_indexer(important_categorical_cols)
for offset, imputed in zip(range(0, df.shape[0], chunk_size), categorical_knn_imputed_chunks):
    df.iloc[offset:offset + chunk_size, categorical_col_positions] = imputed

# 6. Simple Imputation for non-important categorical column (`state`)
# NOTE(review): `state` was label-encoded from its string form earlier in this
# cell, so it may no longer contain NaN - verify this fill still has an effect.
print("\nImputing missing values for non-important categorical column (Mode)...")
most_common_state = df['state'].mode()[0]  # first entry of the mode result
df['state'] = df['state'].fillna(most_common_state)

# Final imputed dataframe is df
print("\nImputation complete. Dataset after imputation:")
print(df.head())


Encoding categorical columns...


Encoding Categorical Columns: 100%|██████████| 4/4 [00:04<00:00,  1.09s/it]


Imputing missing values for important numeric fields (KNN)...



