In [None]:
import requests
import lxml
from bs4 import BeautifulSoup
from pydantic import BaseModel
from typing import List, Optional, Dict

from abc import ABC, abstractmethod
import math
from tqdm import tqdm

In [None]:
class Flat(BaseModel):
    """Validated record describing one scraped flat listing.

    Every scraped value is kept as text; no numeric conversion is done here.

    NOTE(review): a SQLAlchemy model that is also named ``Flat`` is declared
    further down in this file. Once that definition has run, the bare name
    ``Flat`` no longer refers to this class - confirm which one callers expect.
    """
    description: str
    links_to_img: List[str]  # one URL per image
    city: str
    street: str
    area: str
    floors: str  # presumably the total number of floors in the building - TODO confirm
    floor: str  # presumably the floor the flat is on - TODO confirm
    number_of_rooms: str
    price_per_square_m: str
    total_price: str
    # The remaining attributes are optional because a listing may not state them.
    build_year: Optional[str]
    heating: Optional[str]
    furnishment: Optional[str]
    object_: Optional[str]  # trailing underscore presumably avoids the built-in name ``object``
    building: Optional[str]
    operation: Optional[str]
    
        
class ContactPerson(BaseModel):
    """Name, phone number and e-mail of a contact person, all kept as text.

    NOTE(review): not referenced by the code visible in this file - confirm it
    is still needed.
    """
    cp_name: str
    cp_phone_number: str
    cp_email: str
    

class Ad(BaseModel):
    """Status and added/modified dates of an advertisement, all kept as text.

    NOTE(review): the date format is not fixed anywhere in the visible code -
    confirm what the producer of these values emits.
    """
    ad_status: str
    ad_added_date: str
    ad_modified_date: str
    
class FlatUrl(BaseModel):
    """Wrapper around the URL of a single flat listing page."""
    url: str  # validated only as a string, not as a well-formed URL

In [None]:
class BaseScraper(ABC):
    """Template for site-specific flat scrapers.

    A subclass sets ``__items_per_page__`` and ``__domain__`` and implements
    ``_retrieve_flat_links`` and ``_retrieve_flat_info``; ``scrape`` then
    drives both to produce the list of flats.
    """

    # Number of listings shown on one result page; subclasses must set a positive value.
    __items_per_page__: int = 0
    # Base URL that every query handed to ``_get_page_content`` is appended to.
    __domain__: str = ""
    # Seconds to wait for the server before a request is abandoned.
    __request_timeout__: float = 30.0


    @abstractmethod
    def _retrieve_flat_links(self, pages_count: int, keyword: str) -> List[FlatUrl]:
        """Collect the listing URLs found on the first ``pages_count`` result pages."""
        pass


    def _get_page_content(self, query: str) -> Optional[BeautifulSoup]:
        """Download ``<__domain__>/<query>`` and parse it with the lxml parser.

        Args:
            query: Path and/or query string appended to ``__domain__``.

        Returns:
            The parsed document when the server answers with status 200.

        Raises:
            Exception: When the server answers with any other status code.
            requests.exceptions.RequestException: On connection problems,
                including a timeout after ``__request_timeout__`` seconds.
        """
        # Without a timeout a stalled server would block the scrape forever.
        resp = requests.get(
            f"{self.__domain__}/{query}",
            timeout=self.__request_timeout__,
        )
        if resp.status_code == 200:
            return BeautifulSoup(resp.content, 'lxml')
        raise Exception("Cannot reach the content")


    @abstractmethod
    def _retrieve_flat_info(self, link:FlatUrl) -> Optional[Flat]:
        """Download one listing and turn it into a flat record (or ``None`` to skip it)."""
        pass


    def scrape(self, flats_count: int, keyword: str) -> List[Optional[Flat]]:
        """Scrape at most ``flats_count`` flats matching ``keyword``.

        Args:
            flats_count: Upper bound on the number of flats to scrape.
            keyword: Site-specific search keyword passed to ``_retrieve_flat_links``.

        Returns:
            The flats for which ``_retrieve_flat_info`` returned a truthy value.

        Raises:
            AttributeError: When ``__items_per_page__`` is zero.
        """
        try:
            pages_count = math.ceil(flats_count / self.__items_per_page__)
        except ZeroDivisionError as exc:
            raise AttributeError("Flats per page is set to zero") from exc

        flat_links = self._retrieve_flat_links(pages_count, keyword)
        # Whole result pages are fetched, so the last page can push the number
        # of links past the requested amount; drop the surplus before the
        # (expensive) per-listing downloads start.
        flat_links = flat_links[:max(flats_count, 0)]

        scraped_flats: List[Optional[Flat]] = []
        for flat_link in tqdm(flat_links):
            scraped_flat = self._retrieve_flat_info(flat_link)
            if scraped_flat:
                scraped_flats.append(scraped_flat)
        return scraped_flats

In [None]:
class Realu_Lt(BaseScraper):
    """Scraper for flat listings published on realu.lt.

    Result pages are requested through ``_get_page_content``; every listing
    found there is then downloaded from its own absolute URL and parsed into
    a ``Flat``.
    """

    __items_per_page__: int = 16
    __domain__: str = 'https://www.realu.lt/nekilnojamasis-turtas'

    # Seconds to wait for a listing page before the request is abandoned.
    _flat_page_timeout: float = 30.0

    # Optional attributes in the order their <span> elements are read from the
    # first accordion body. The span in front of them holds the ad status,
    # which ``Flat`` does not store and which is therefore skipped.
    _extra_info_fields = (
        'object_',
        'build_year',
        'furnishment',
        'heating',
        'building',
        'operation',
    )


    def _retrieve_flat_links(self, pages_count: int, keyword: str) -> List[FlatUrl]:
        """Collect the listing URLs from result pages ``1..pages_count``.

        Args:
            pages_count: Number of result pages to walk through.
            keyword: Value sent as the ``estate_type`` query parameter.

        Returns:
            One ``FlatUrl`` per listing card that carries a link.
        """
        url_prefix: str = "https://www.realu.lt"
        links: List[FlatUrl] = []

        for page_num in range(1, pages_count + 1):
            query = f'?op=sale&estate_type={keyword}&page={page_num}'

            content = self._get_page_content(query=query)
            if not content:
                continue

            for div in content.find_all('div', class_='col-md-6 col-xl-3'):
                anchor = div.find('a', class_='info')
                href = anchor.get('href') if anchor is not None else None
                # A card without a usable link is skipped instead of aborting
                # the whole scrape with a TypeError/KeyError.
                if not href:
                    continue
                links.append(FlatUrl(url=url_prefix + href))
        return links


    def _get_flat_page(self, link: FlatUrl) -> BeautifulSoup:
        """Download and parse the listing page ``link.url`` points at.

        ``link.url`` is already absolute, so it must not go through
        ``_get_page_content``, which would prepend ``__domain__`` to it.

        Raises:
            Exception: When the server answers with a status other than 200.
        """
        resp = requests.get(link.url, timeout=self._flat_page_timeout)
        if resp.status_code == 200:
            return BeautifulSoup(resp.content, 'lxml')
        raise Exception("Cannot reach the content")


    def _retrieve_flat_info(self, link: FlatUrl) -> Flat:
        """Download one listing and extract its attributes.

        Args:
            link: Absolute URL of the listing page.

        Returns:
            A ``Flat`` built from the page. Optional attributes the page does
            not provide are ``None``.

        Raises:
            Exception: When the listing page cannot be downloaded.
            AttributeError, IndexError: When a mandatory element (address,
                data grid, prices) is missing from the page.
        """
        content = self._get_flat_page(link)

        # The address reads "<street>, ..., <city>".
        address = content.find('span', class_='address').text.split(",")
        city = address[-1].strip()
        street = address[0].strip()

        accordion_bodies = content.find_all('div', class_='accordion-body')

        # First accordion body: ad status followed by the optional attributes.
        # The values are positional; whatever is missing at the end stays None
        # and spans beyond the known attributes are ignored.
        extra_spans = accordion_bodies[0].find_all('span') if accordion_bodies else []
        extra_info = dict.fromkeys(self._extra_info_fields)
        extra_info.update(
            zip(self._extra_info_fields, (span.text for span in extra_spans[1:]))
        )

        # Second accordion body: free-text description, one <p> per paragraph.
        description_ps = accordion_bodies[1].find_all('p') if len(accordion_bodies) > 1 else []
        description = "".join(p.text + " " for p in description_ps)

        gallery = content.find('div', class_='swiper-wrapper')
        links_to_img: List[str] = (
            [img['src'] for img in gallery.find_all('img') if img.has_attr('src')]
            if gallery is not None
            else []
        )

        data_grid = content.find_all('span', class_='dd')
        # Only the leading token of the area field is kept; the rest is dropped.
        area = data_grid[0].text.split(" ")[0]
        floor_parts: List[str] = data_grid[1].text.split("/")
        floor = floor_parts[0]
        # A field without the "/" separator leaves the second part empty
        # instead of failing with an IndexError.
        floors = floor_parts[1] if len(floor_parts) > 1 else ""
        number_of_rooms: str = data_grid[2].text

        # Both prices lose their last whitespace-separated token and have the
        # remaining tokens glued together.
        price_per_square_m_parts: List[str] = content.find('span', class_='area').text.split()
        price_per_square_m = ''.join(price_per_square_m_parts[:-1])

        total_price_parts: List[str] = content.find('span', class_='price').text.split()
        total_price = ''.join(total_price_parts[:-1])

        flat: Flat = Flat(
            city=city,
            street=street,
            description=description,
            area=area,
            floors=floors,
            floor=floor,
            number_of_rooms=number_of_rooms,
            price_per_square_m=price_per_square_m,
            total_price=total_price,
            links_to_img=links_to_img,
            **extra_info,
        )

        return flat

In [None]:
from pydantic import BaseSettings
#from dotenv import dotenv_values # this one is not resolved yet

class Settings(BaseSettings):
    """Database connection settings.

    ``BaseSettings`` fills every field from the environment variable of the
    same name (matched case-insensitively by default), e.g.
    ``DATABASE_PASSWORD``. Values passed as keyword arguments still take
    precedence over the environment.
    """
    database_hostname: str = 'localhost'
    database_port: str = '5432'
    # Deliberately has no default: the password has to be supplied through the
    # environment and must never be committed together with the notebook.
    database_password: str
    database_name: str = 'scraper_db'
    database_username: str = 'postgres'


# Raises pydantic's ValidationError when DATABASE_PASSWORD is not set.
settings = Settings()

In [None]:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker



from urllib.parse import quote_plus

# User name and password are percent-encoded so that characters such as "@",
# ":" or "/" inside them cannot be mistaken for URL delimiters.
SQLALCHEMY_DATABASE_URL = (
    f'postgresql://{quote_plus(settings.database_username)}'
    f':{quote_plus(settings.database_password)}'
    f'@{settings.database_hostname}:{settings.database_port}'
    f'/{settings.database_name}'
)
engine = create_engine(SQLALCHEMY_DATABASE_URL)

# Session factory: commits and flushes only happen when requested explicitly.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Declarative base class the ORM models inherit from.
Base = declarative_base()

def get_db():
    """Yield a new ``SessionLocal`` session and close it once the consumer is done."""
    session = SessionLocal()
    try:
        yield session
    finally:
        # Reached on normal completion as well as when the consumer raises.
        session.close()


In [None]:
from sqlalchemy import Column, Integer, String, ForeignKey
from sqlalchemy.orm import relationship
from sqlalchemy.sql.expression import text
from sqlalchemy.sql.sqltypes import TIMESTAMP


class Flat(Base):
    """ORM model for the ``flats`` table; every attribute is a nullable text column.

    NOTE(review): this class reuses the name of the pydantic ``Flat`` model
    declared earlier in this file. After this definition runs, code that looks
    up ``Flat`` at call time gets this ORM class instead - confirm that this
    is intended.
    """
    __tablename__: str = 'flats'

    id = Column(Integer, primary_key=True, nullable=False)
    city = Column(String, nullable=True)
    street = Column(String, nullable=True)
    object_ = Column(String, nullable=True)
    area = Column(String, nullable=True)
    number_of_rooms = Column(String, nullable=True)
    floors = Column(String, nullable=True)
    floor = Column(String, nullable=True)
    price_per_square_m = Column(String, nullable=True)
    total_price = Column(String, nullable=True)
    description = Column(String, nullable=True)
    # NOTE(review): declared as a single String, while the scraper in this file
    # collects the image links as a list - confirm how the list is meant to be stored.
    links_to_img = Column(String, nullable=True)
    build_year = Column(String, nullable=True)
    heating = Column(String, nullable=True)
    furnishment = Column(String, nullable=True)
    building = Column(String, nullable=True)
    operation = Column(String, nullable=True)
    


In [None]:
from fastapi import FastAPI, status, HTTPException, Response, Depends
from sqlalchemy.orm import Session

# Application object the route decorators below register their handlers on.
app = FastAPI()


@app.get("/")
def root() -> Dict:
    """Return the static welcome payload served at the site root."""
    greeting = {'message': 'Welcome to the flat price predicting app!'}
    return greeting

@app.get("/create_flats")
def write_flats_data(db: Session = Depends(get_db)):
    """Scrape realu.lt flat listings and store one of them in the database.

    Args:
        db: Database session injected by FastAPI through ``get_db``.

    Returns:
        The stored flat, refreshed from the database after the commit.

    Raises:
        HTTPException: 404 when the scrape yields too few flats to pick from.

    NOTE(review): ``db.add`` needs an ORM instance, so this relies on
    ``Realu_Lt`` building its flats from the SQLAlchemy ``Flat`` model -
    confirm which ``Flat`` is bound when the scraper runs.
    """
    # Zero-based position of the scraped flat that gets stored; the original
    # code picked the second entry as well.
    flat_index = 1

    realu_lt: Realu_Lt = Realu_Lt()
    # scrape() already returns fully parsed flats, not links, so they must not
    # be passed to _retrieve_flat_info a second time.
    scraped_flats = realu_lt.scrape(flats_count=16, keyword='flat')

    if len(scraped_flats) <= flat_index:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Expected at least {flat_index + 1} scraped flats, got {len(scraped_flats)}",
        )

    new_flat = scraped_flats[flat_index]
    db.add(new_flat)
    db.commit()
    # Reload the row so database-generated values such as ``id`` are present.
    db.refresh(new_flat)

    return new_flat


In [None]:
import asyncio
import uvicorn

if __name__ == "__main__":
    config = uvicorn.Config(app)
    server = uvicorn.Server(config)
    await server.serve()