In [None]:
!kaggle datasets download -d ahmedshahriarsakib/usa-real-estate-dataset

Dataset URL: https://www.kaggle.com/datasets/ahmedshahriarsakib/usa-real-estate-dataset
License(s): other
Downloading usa-real-estate-dataset.zip to /home/jthesmar/DSIII/House_Price_Forecaster-DSIII
 97%|████████████████████████████████████▊ | 37.0M/38.2M [00:03<00:00, 12.7MB/s]
100%|██████████████████████████████████████| 38.2M/38.2M [00:03<00:00, 12.2MB/s]


In [None]:
import zipfile

with zipfile.ZipFile("usa-real-estate-dataset.zip", 'r') as zip_ref:
    zip_ref.extractall("usa_real_estate")

In [36]:
import pandas as pd

# Load dataset
data = pd.read_csv('./usa_real_estate/realtor-data.zip.csv', usecols=['bed', 'bath', 'acre_lot', 'house_size', 'city', 'state', 'zip_code', 'price'])

# Display the first five rows
print(data.head())


      price  bed  bath  acre_lot        city        state  zip_code  \
0  105000.0  3.0   2.0      0.12    Adjuntas  Puerto Rico     601.0   
1   80000.0  4.0   2.0      0.08    Adjuntas  Puerto Rico     601.0   
2   67000.0  2.0   1.0      0.15  Juana Diaz  Puerto Rico     795.0   
3  145000.0  4.0   2.0      0.10       Ponce  Puerto Rico     731.0   
4   65000.0  6.0   2.0      0.05    Mayaguez  Puerto Rico     680.0   

   house_size  
0       920.0  
1      1527.0  
2       748.0  
3      1800.0  
4         NaN  


In [None]:
# Data Overview:
print("\nDataset Info:\n")
data.info()

# Data Cleaning:
unique_states = data['state'].unique()                          # all states available in dataset
states_to_remove = ['Virgin Islands', 'New Jersey', 'Tennessee', 'Rhode Island', 'Delaware', 'Louisiana', 'Missouri', 'District of Columbia', 'Wisconsin', 'New Brunswick', 
                    'Arkansas', 'Idaho', 'Indiana', 'Iowa','South Dakota', 'Nebraska', 'North Dakota','Montana', 'Oklahoma', 'Guam', 'Alaska']
data = data[~data['state'].isin(states_to_remove)]              # removing all rows with states we dont wnat

top_states = data['state'].value_counts().head(10).index        # count the frequency of each state and keep the 10 most frequent ones
data = data[data['state'].isin(top_states)]                     # filter data to keep only the top 10 most frequent states


# data.info()


Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2226382 entries, 0 to 2226381
Data columns (total 8 columns):
 #   Column      Dtype  
---  ------      -----  
 0   price       float64
 1   bed         float64
 2   bath        float64
 3   acre_lot    float64
 4   city        object 
 5   state       object 
 6   zip_code    float64
 7   house_size  float64
dtypes: float64(6), object(2)
memory usage: 135.9+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 1260104 entries, 5217 to 2177858
Data columns (total 8 columns):
 #   Column      Non-Null Count    Dtype  
---  ------      --------------    -----  
 0   price       1259460 non-null  float64
 1   bed         1007211 non-null  float64
 2   bath        994568 non-null   float64
 3   acre_lot    1063400 non-null  float64
 4   city        1259408 non-null  object 
 5   state       1260104 non-null  object 
 6   zip_code    1259975 non-null  float64
 7   house_size  964203 non-null   float64
dtypes: float64(6), object(2

In [None]:
# Data Cleaning: 
print("\nMissing Values per Column:\n")
print(data.isnull().sum())                                                  # remove missing values 
data = pd.get_dummies(data, columns=['city', 'state'], drop_first=True, sparse=True)     # objects need to be converted to numerical format for machine to understand
data = data.dropna(subset=['price'])                                        # drop rows with missing `price` since it's the label

# Useful features for prediction
features = ['bed', 'bath', 'acre_lot', 'house_size', 'city', 'state', 'zip_code']
label = 'price'  # we choose to forecast house prices in area

# Memory optimization
data['price'] = data['price'].astype('float32')
data['bed'] = data['bed'].astype('float32')
data['bath'] = data['bath'].astype('float32')
data['acre_lot'] = data['acre_lot'].astype('float32')
data['zip_code'] = data['zip_code'].astype('float32')
data['house_size'] = data['house_size'].astype('float32')

data.info()  # Summary of the dataset
data.describe()  # Basic statistics for numerical features


Missing Values per Column:

price            644
bed           252893
bath          265536
acre_lot      196704
city             696
state              0
zip_code         129
house_size    295901
dtype: int64


MemoryError: Unable to allocate 11.1 GiB for an array with shape (1260104, 9436) and data type bool

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

features = data[features]
label = data[label]

X = data[features]
y = data[label]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

ValueError: Boolean array expected for the condition, not object