In [1]:
!pip install selenium webdriver-manager



In [2]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import re
import time
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [3]:
class RealtorScraper:
    """Scrape price, bed, bath and square-footage values from a listings page.

    The page is rendered with headless Chrome through Selenium, parsed with
    BeautifulSoup, and the extracted values are written to an Excel file and
    returned as a DataFrame.
    """

    # Defaults used when the constructor is called without arguments (or is
    # bypassed); they reproduce the values that used to be hard-coded.
    chromedriver_path = 'C:/Users/IBTIHEL/Downloads/chromedriver-win64/chromedriver-win64/chromedriver.exe'
    page_load_wait = 5

    # Class-attribute strings of the elements to look up. They are passed to
    # BeautifulSoup verbatim, so they must match the markup of the page.
    # NOTE(review): these look like build-generated class names and will
    # presumably change when the site is redeployed -- confirm before relying on them.
    LISTING_CLASS = 'CardContent__StyledCardContent-rui__sc-7ptz1z-0 kDqsxy card-content card-content'
    PRICE_CLASS = 'Pricestyles__StyledPrice-rui__btk3ge-0 kjbIiZ card-price'
    BEDS_CLASS = 'PropertyBedMetastyles__StyledPropertyBedMeta-rui__a4nnof-0 jkAoUn'
    BATHS_CLASS = 'PropertyBathMetastyles__StyledPropertyBathMeta-rui__sc-67m6bo-0 hGQdFx'
    SQFT_CLASS = 'PropertySqftMetastyles__StyledPropertySqftMeta-rui__sc-1gdau7i-0 cYyTDO'

    def __init__(self, chromedriver_path=None, page_load_wait=None):
        """Configure the scraper.

        Args:
            chromedriver_path: Path to the ChromeDriver executable. When None,
                the class-level default path is used.
            page_load_wait: Seconds to sleep after navigation before reading
                the page source. When None, the class-level default (5) is used.
        """
        if chromedriver_path is not None:
            self.chromedriver_path = chromedriver_path
        if page_load_wait is not None:
            self.page_load_wait = page_load_wait

    def scrape_houses_data(self, page_url, ds_file_name):
        """Scrape one listings page and persist the result.

        Args:
            page_url: URL of the page to load.
            ds_file_name: Path of the Excel file the scraped rows are written to.

        Returns:
            A DataFrame with columns 'num_beds', 'num_baths', 'sqft' and
            'price', one row per listing card found. A value that is missing
            or cannot be parsed as a number is stored as None.
        """
        page_source = self._fetch_page_source(page_url)

        # Use BeautifulSoup to parse the loaded content
        soup = BeautifulSoup(page_source, "html.parser")
        listings = soup.find_all('div', class_=self.LISTING_CLASS)

        prices = []
        beds = []
        baths = []
        sqfts = []
        for listing in listings:
            prices.append(self._extract_number(listing, 'div', self.PRICE_CLASS, r'[^\d]', int))
            beds.append(self._extract_number(listing, 'li', self.BEDS_CLASS, r'[^\d\.]', float))
            baths.append(self._extract_number(listing, 'li', self.BATHS_CLASS, r'[^\d\.]', float))
            # The square footage is read from the <span> inside the <li>.
            sqfts.append(self._extract_number(listing, 'li', self.SQFT_CLASS, r'[^\d]', float, from_span=True))

        # create a data frame with the scraped data
        data = {
            'num_beds': beds,
            'num_baths': baths,
            'sqft': sqfts,
            'price': prices
        }
        df = pd.DataFrame(data)
        df.to_excel(ds_file_name, index=False)
        return df

    def _fetch_page_source(self, page_url):
        """Load page_url in headless Chrome and return the rendered HTML."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        service = Service(self.chromedriver_path)
        driver = webdriver.Chrome(service=service, options=chrome_options)
        try:
            driver.get(page_url)
            # Wait for the content to load and then get the page source
            time.sleep(self.page_load_wait)
            return driver.page_source
        finally:
            # Always close the browser, even when loading the page fails,
            # so no Chrome process is left running.
            driver.quit()

    @staticmethod
    def _extract_number(listing, tag, css_class, strip_pattern, cast, from_span=False):
        """Find one element in a listing card and convert its text to a number.

        Args:
            listing: Parsed listing element exposing ``find(tag, class_=...)``.
            tag: Tag name of the element to look up.
            css_class: Value passed as ``class_`` to the lookup.
            strip_pattern: Regex of the characters to remove before conversion.
            cast: Callable (``int`` or ``float``) applied to the cleaned text.
            from_span: When True, read the text of the element's ``span``
                child instead of the element itself.

        Returns:
            The converted number, or None when the element (or its span) is
            absent or the cleaned text is not a valid number.
        """
        try:
            node = listing.find(tag, class_=css_class)
            raw_text = node.span.text if from_span else node.text
            return cast(re.sub(strip_pattern, '', raw_text))
        except (AttributeError, ValueError):
            # AttributeError: element not found (find returned None).
            # ValueError: nothing numeric left after cleaning, e.g. '' or '1.52.5'.
            return None

In [4]:
class DataFramePreprocessor:
    """Merge several scraped DataFrames and clean the result for modelling."""

    def __init__(self):
        # Starts out as an empty frame until combine_dataframes is called.
        self.combined_df = pd.DataFrame()

    def combine_dataframes(self, dataframes):
        """Concatenate `dataframes` row-wise into `combined_df` with a fresh 0..n-1 index."""
        stacked = pd.concat(dataframes, ignore_index=True)
        self.combined_df = stacked

    def prepare_great_df_for_regression(self, final_ds="final_ds.xlsx"):
        """Drop incomplete rows, save the frame to `final_ds`, and return it.

        Rows containing any missing value are removed from `combined_df` in
        place; the same object is written to Excel (without the index) and
        returned.
        """
        frame = self.combined_df
        frame.dropna(axis=0, how='any', inplace=True)
        frame.to_excel(final_ds, index=False)
        return frame

In [5]:
class PriceRegressor:
    """Fit a linear regression of price on beds, baths and square footage."""

    def regress(self, df):
        """Train and evaluate a LinearRegression model on `df`.

        `df` must provide the columns 'price', 'num_beds', 'num_baths' and
        'sqft'. 20% of the rows are held out (fixed seed 42) to compute the
        R-squared score, which is printed together with the fitted
        coefficients and intercept.

        Returns:
            The fitted LinearRegression instance.
        """
        target = df['price']
        features = df[['num_beds', 'num_baths', 'sqft']]

        split = train_test_split(features, target, test_size=0.2, random_state=42)
        features_train, features_test, target_train, target_test = split

        fitted_model = LinearRegression()
        fitted_model.fit(features_train, target_train)

        # Score the model on the held-out rows only.
        held_out_predictions = fitted_model.predict(features_test)
        score = r2_score(target_test, held_out_predictions)

        print("R-squared Score:", score)
        print("Coefficients:", fitted_model.coef_)
        print("Intercept:", fitted_model.intercept_)
        return fitted_model

In [6]:
class PricePredictor:
    """Produce price predictions from an already fitted regression model."""

    def __init__(self, trained_model):
        """Store the fitted model; it must expose ``predict``."""
        self.trained_model = trained_model

    def predictt(self, new_data, prediction_file="prediction_file.xlsx"):
        """Predict prices for `new_data` and save them to an Excel file.

        Args:
            new_data: DataFrame holding the feature columns the model was
                trained on (in this notebook: 'num_beds', 'num_baths',
                'sqft'). Extra columns and a different column order are
                tolerated when the model records its training feature names.
            prediction_file: Path of the Excel file the predictions are
                written to.

        Returns:
            A DataFrame with a single column 'predicted_price', one row per
            row of `new_data`.
        """
        features = self._align_features(new_data)
        predictions = self.trained_model.predict(features)
        predictions_df = pd.DataFrame({'predicted_price': predictions})

        # Save the predictions to an Excel file
        predictions_df.to_excel(prediction_file, index=False)
        return predictions_df

    def _align_features(self, new_data):
        """Select the model's training columns from `new_data`, in training order.

        Falls back to returning `new_data` unchanged when the model does not
        expose ``feature_names_in_``, when `new_data` has no ``columns``, or
        when a training column is missing (the model then reports the
        mismatch itself).
        """
        expected = getattr(self.trained_model, 'feature_names_in_', None)
        available = getattr(new_data, 'columns', None)
        if expected is None or available is None:
            return new_data
        expected = list(expected)
        if not all(name in available for name in expected):
            return new_data
        return new_data[expected]

In [7]:
class Manager:
    """Run the full pipeline: scrape pages, clean data, train, then predict."""

    def __init__(self):
        self.scraper = RealtorScraper()
        self.preprocessor = DataFramePreprocessor()
        self.regressor = PriceRegressor()
        self.trained_model = None
        self.predictor = None
        # Frames scraped so far. This list is only ever appended to, so
        # repeated calls on the same Manager keep earlier pages in the
        # training data.
        self.all_scraped_data = []

    def manage_complete_tasks(self, page_urls, ds_file_names, new_data, final_ds, prediction_file):
        """Scrape every page, train a price model, and predict for `new_data`.

        Args:
            page_urls: URLs of the listing pages to scrape.
            ds_file_names: Excel file name for each page, paired with
                `page_urls` by position.
            new_data: Feature DataFrame to predict prices for.
            final_ds: Excel file the combined, cleaned dataset is written to.
            prediction_file: Excel file the predictions are written to.

        Returns:
            The DataFrame of predictions returned by the predictor.

        Raises:
            ValueError: If `page_urls` and `ds_file_names` differ in length.
        """
        page_urls = list(page_urls)
        ds_file_names = list(ds_file_names)
        # zip() would silently drop the unmatched tail, so fail loudly instead
        # of training on fewer pages than the caller asked for.
        if len(page_urls) != len(ds_file_names):
            raise ValueError(
                "page_urls and ds_file_names must have the same length "
                f"(got {len(page_urls)} and {len(ds_file_names)})"
            )

        # scrape data from multiple pages
        for page_url, ds_file_name in zip(page_urls, ds_file_names):
            scraped_df = self.scraper.scrape_houses_data(page_url, ds_file_name)
            self.all_scraped_data.append(scraped_df)

        # combine all extracted dataframes
        self.preprocessor.combine_dataframes(self.all_scraped_data)
        great_df = self.preprocessor.prepare_great_df_for_regression(final_ds)

        # train the regression model
        self.trained_model = self.regressor.regress(great_df)

        # create price predictor - object from the PricePredictor class
        self.predictor = PricePredictor(self.trained_model)

        # predict using new data
        predictions_df = self.predictor.predictt(new_data, prediction_file)
        return predictions_df

In [8]:
# Run the whole pipeline on the three newest-listings pages for Birmingham, AL.
manager_1 = Manager()

# Page 1 has no suffix; later pages append '/pg-<n>' to the same search URL.
page_urls1 = [
    'https://www.realtor.com/realestateandhomes-search/Birmingham_AL/show-newest-listings/sby-6' + page_suffix
    for page_suffix in ('', '/pg-2', '/pg-3')
]
ds_file_names1 = [f"page_{page_number}.xlsx" for page_number in range(1, 4)]

# Three example houses to price once the model is trained.
new_data1 = pd.DataFrame(dict(
    num_beds=[3, 4, 1],
    num_baths=[2, 3, 2.5],
    sqft=[2000, 2500, 1905],
))

final_ds1 = "final_ds_1.xlsx"
prediction_file1 = "prediction_file1.xlsx"

prediction_result1 = manager_1.manage_complete_tasks(
    page_urls=page_urls1,
    ds_file_names=ds_file_names1,
    new_data=new_data1,
    final_ds=final_ds1,
    prediction_file=prediction_file1,
)
prediction_result1

R-squared Score: 0.768956105743252
Coefficients: [-66705.20546755  19858.14818696    245.20469509]
Intercept: -2872.560648723156


Unnamed: 0,predicted_price
0,327137.509505
1,402892.79977
2,447182.5485
