In [37]:
# Import the NumPy and Pandas libraries, which are commonly used for numerical operations and data manipulation
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#  Import the os module, which provides a way of using operating system-dependent functionality
import os

# Walk the directory tree rooted at the current working directory and print every file in it.
# The root must be spelled explicitly (os.curdir, i.e. '.'): os.walk('') yields nothing at all,
# because scanning the empty path fails and os.walk silently ignores that error by default.
# Each step yields a tuple: the directory name (dirname), its subdirectories (unused, hence _),
# and the filenames it contains.
for dirname, _, filenames in os.walk(os.curdir):
    for filename in filenames:
        # Join directory and file name into a path relative to the working directory.
        full_path = os.path.join(dirname, filename)
        print(full_path)

In [38]:
# Load the housing CSV once and keep two independent DataFrames holding the same data:
# `dff` is used later for the label-encoding variant, `df` is the working frame for the next cells.
dff = pd.read_csv('housing_price_dataset.csv')
df = dff.copy()

In [39]:
# One-hot encode 'Neighborhood': pd.get_dummies creates one indicator column per category value,
# and astype(int) converts those indicator values to integers.
one_hot_encoded = pd.get_dummies(df['Neighborhood']).astype(int)

# Append the indicator columns to the right of the existing columns (axis=1).
# The original 'Neighborhood' text column is still present after this step.
df = pd.concat([df, one_hot_encoded], axis=1)
print(df)

       SquareFeet  Bedrooms  Bathrooms Neighborhood  YearBuilt          Price  \
0            2126         4          1        Rural       1969  215355.283618   
1            2459         3          2        Rural       1980  195014.221626   
2            1860         2          1       Suburb       1970  306891.012076   
3            2294         2          1        Urban       1996  206786.787153   
4            2130         5          2       Suburb       2001  272436.239065   
...           ...       ...        ...          ...        ...            ...   
49995        1282         5          3        Rural       1975  100080.865895   
49996        2854         2          2       Suburb       1988  374507.656727   
49997        2979         5          3       Suburb       1962  384110.555590   
49998        2596         5          2        Rural       1984  380512.685957   
49999        1572         5          3        Rural       2011  221618.583218   

       Rural  Suburb  Urban

In [40]:
# Drop the 'Neighborhood' text column from df; drop() returns a new DataFrame that is rebound to df.
df = df.drop('Neighborhood', axis=1)
print(df)

       SquareFeet  Bedrooms  Bathrooms  YearBuilt          Price  Rural  \
0            2126         4          1       1969  215355.283618      1   
1            2459         3          2       1980  195014.221626      1   
2            1860         2          1       1970  306891.012076      0   
3            2294         2          1       1996  206786.787153      0   
4            2130         5          2       2001  272436.239065      0   
...           ...       ...        ...        ...            ...    ...   
49995        1282         5          3       1975  100080.865895      1   
49996        2854         2          2       1988  374507.656727      0   
49997        2979         5          3       1962  384110.555590      0   
49998        2596         5          2       1984  380512.685957      1   
49999        1572         5          3       2011  221618.583218      1   

       Suburb  Urban  
0           0      0  
1           0      0  
2           1      0  
3      

In [41]:
# using the train_test_split function from scikit-learn to split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

# Features are every column except 'Price'; 'Price' itself is the regression target.
X = df.drop(columns=['Price'])
y = df['Price']

# Hold out 20% of the rows for testing (test_size=0.2).
# random_state=42 fixes the random seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [42]:
# Display the shape (number of rows, number of columns) of the training feature set X_train.
X_train.shape

(40000, 7)

In [43]:
# Display the shape of the training target y_train.
y_train.shape

(40000,)

In [44]:
# using the XGBoost library to train a regression model and evaluate its performance on a test set.
import xgboost as xgb
from sklearn.metrics import mean_squared_error # the mean_squared_error function from scikit-learn

# Configure the XGBoost regressor.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',  # minimise the squared error (regression objective)
    colsample_bytree=0.3,          # fraction of features randomly sampled for each tree
    learning_rate=0.1,             # step-size shrinkage used to prevent overfitting
    max_depth=5,                   # maximum depth of a tree
    alpha=10,                      # L1 regularization term on weights
    n_estimators=100,              # number of boosting rounds
)

# Train on the training split, then predict the held-out test split.
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)

# Evaluate with the root mean squared error (RMSE = sqrt(MSE)).
# The `squared=False` flag of mean_squared_error is deprecated and removed in newer scikit-learn
# releases, so the square root is taken explicitly; this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

RMSE: 49470.04862622066


In [45]:
# using the scikit-learn library to create and train a Random Forest Regression model 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Random forest of 1000 trees, each limited to depth 5; random_state=42 fixes the random seed.
rf_reg = RandomForestRegressor(n_estimators=1000, max_depth=5, random_state=42)

# Fit on the training split, then predict the held-out test split.
rf_reg.fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_test)

# Report the root mean squared error (RMSE = sqrt(MSE)).
# The `squared=False` flag of mean_squared_error is deprecated and removed in newer scikit-learn
# releases, so the square root is taken explicitly; this form works on every version.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("Random Forest RMSE:", rmse_rf)

Random Forest RMSE: 49432.73602639071


In [46]:
# Print the Random Forest predictions made for the test set.
print(y_pred_rf)

[218573.92482078 129327.28142696 252282.38884532 ... 314612.0553523
 195951.62254363 244693.63000187]


In [47]:
# Use LabelEncoder from scikit-learn's preprocessing module to convert categorical labels into numerical values.
from sklearn.preprocessing import LabelEncoder
# Fit a LabelEncoder on the 'Neighborhood' values so each distinct label maps to an integer code.
label_encoder = LabelEncoder().fit(dff['Neighborhood'])

# Store the integer codes in a new 'Neighborhood_LabelEncoded' column of dff;
# the original 'Neighborhood' text column is kept alongside it.
dff['Neighborhood_LabelEncoded'] = label_encoder.transform(dff['Neighborhood'])
print(dff)

       SquareFeet  Bedrooms  Bathrooms Neighborhood  YearBuilt          Price  \
0            2126         4          1        Rural       1969  215355.283618   
1            2459         3          2        Rural       1980  195014.221626   
2            1860         2          1       Suburb       1970  306891.012076   
3            2294         2          1        Urban       1996  206786.787153   
4            2130         5          2       Suburb       2001  272436.239065   
...           ...       ...        ...          ...        ...            ...   
49995        1282         5          3        Rural       1975  100080.865895   
49996        2854         2          2       Suburb       1988  374507.656727   
49997        2979         5          3       Suburb       1962  384110.555590   
49998        2596         5          2        Rural       1984  380512.685957   
49999        1572         5          3        Rural       2011  221618.583218   

       Neighborhood_LabelEn

In [48]:
# Rebuild the working frame df from dff, leaving out the 'Neighborhood' text column.
# drop() returns a new DataFrame, so dff itself is not modified.
df = dff.drop(columns=['Neighborhood'])
print(df)

       SquareFeet  Bedrooms  Bathrooms  YearBuilt          Price  \
0            2126         4          1       1969  215355.283618   
1            2459         3          2       1980  195014.221626   
2            1860         2          1       1970  306891.012076   
3            2294         2          1       1996  206786.787153   
4            2130         5          2       2001  272436.239065   
...           ...       ...        ...        ...            ...   
49995        1282         5          3       1975  100080.865895   
49996        2854         2          2       1988  374507.656727   
49997        2979         5          3       1962  384110.555590   
49998        2596         5          2       1984  380512.685957   
49999        1572         5          3       2011  221618.583218   

       Neighborhood_LabelEncoded  
0                              0  
1                              0  
2                              1  
3                              2  
4       

In [49]:
from sklearn.model_selection import train_test_split
# Features are every column except 'Price'; 'Price' itself is the regression target.
X = df.drop(columns=['Price'])
y = df['Price']

# Same split settings as before: 20% of the rows held out for testing, seed fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [50]:
from sklearn.metrics import mean_squared_error

# Configure the XGBoost regressor.
xg_reg = xgb.XGBRegressor(
    objective='reg:squarederror',  # minimise the squared error (regression objective)
    colsample_bytree=0.3,          # fraction of features randomly sampled for each tree
    learning_rate=0.1,             # step-size shrinkage
    max_depth=5,                   # maximum depth of a tree
    alpha=10,                      # L1 regularization term on weights
    n_estimators=100,              # number of boosting rounds
)

# Train on the training split, then predict the held-out test split.
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)

# Evaluate with the root mean squared error (RMSE = sqrt(MSE)).
# The `squared=False` flag of mean_squared_error is deprecated and removed in newer scikit-learn
# releases, so the square root is taken explicitly; this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

RMSE: 49996.93623584677


In [51]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Random forest of 1000 trees, each limited to depth 5; random_state=42 fixes the random seed.
rf_reg = RandomForestRegressor(n_estimators=1000, max_depth=5, random_state=42)

# Fit on the training split, then predict the held-out test split.
rf_reg.fit(X_train, y_train)
y_pred_rf = rf_reg.predict(X_test)

# Report the root mean squared error (RMSE = sqrt(MSE)).
# The `squared=False` flag of mean_squared_error is deprecated and removed in newer scikit-learn
# releases, so the square root is taken explicitly; this form works on every version.
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print("Random Forest RMSE:", rmse_rf)

Random Forest RMSE: 49432.852932768066


In [52]:
# Path (relative to the working directory) where the XGBoost model is written.
# NOTE(review): XGBoost's documentation recommends the '.json' / '.ubj' extensions; confirm which
# on-disk format the installed version produces for a '.model' extension.
model_path = 'housePrediction.model'

# Save the XGBoost model currently bound to xg_reg to that path.
xg_reg.save_model(model_path)

# Print a message indicating where the model was saved.
print(f"XGBoost model saved to {model_path}")


XGBoost model saved to housePrediction.model




In [53]:
import joblib

# Path (relative to the working directory) where the Random Forest model is written.
rf_model_path = 'housePredictionModel.joblib'

# Serialize the Random Forest model currently bound to rf_reg to that path with joblib.
joblib.dump(rf_reg, rf_model_path)

# Print a message indicating where the model was saved.
print(f"Random Forest model saved to {rf_model_path}")



Random Forest model saved to housePredictionModel.joblib
