In [None]:
# https://github.com/let-robots-reign/real_estate_parsing
# https://github.com/OlegYurchik/cian  
# https://github.com/lenarsaitov/cianparser
    


In [None]:
%load_ext autoreload
%autoreload 2


import pandas as pd
from Utils import *


In [None]:
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Earlier, narrower search URLs kept for reference (they add is_by_homeowner / room* / region / type filters).
# url = 'https://www.cian.ru/cat.php?deal_type=rent&engine_version=2&is_by_homeowner=1&offer_type=flat&region=1&room1=1&room2=1&room3=1&room7=1&room9=1&type=4'
# url = 'https://www.cian.ru/cat.php?deal_type=rent&engine_version=2&offer_type=flat&region=1&room1=1&room2=1&room3=1&room9=1&type=4'
# Base search URL passed to scrap_cian below: deal_type=rent, offer_type=flat, no further filters.
url_base = 'https://www.cian.ru/cat.php?deal_type=rent&engine_version=2&offer_type=flat'



In [None]:
%%time
# Scrape listings starting from url_base and dump the result to CSV.
# NOTE(review): scrap_cian comes from the star-import of Utils; the meaning of the
# positional arguments 2 and 380 (presumably a page range) is not visible here — confirm.
full_df = scrap_cian(url_base, 2, 380)
# The path is relative to the notebook's working directory.
full_df.to_csv('flats_dump.csv')

In [None]:
# Size of the scraped table.
full_df.shape

In [None]:
# Size after dropping fully identical rows (compare with the cell above to see how many duplicates were scraped).
full_df.drop_duplicates().shape

In [None]:
# Mean of the 'price' column per value of 'rooms'.
full_df.groupby('rooms')['price'].mean()

In [None]:
# Median 'price' per ('zone', 'rooms') pair, sorted ascending.
full_df.groupby(['zone', 'rooms'])['price'].median().sort_values()

In [None]:
# Collect every <a> element found under `offer`.
# NOTE(review): `offer` is not defined in any cell above this one, so this cell fails on
# Restart & Run All; it presumably came from a since-deleted cell holding a parsed HTML node — confirm.
links = offer.find_all('a')

In [None]:
# Map every direct flat-listing URL found in `links` to the position of its anchor.
# Anchors that carry no href attribute are skipped instead of raising KeyError,
# and search-result links (containing '/cat.php?') are excluded.
flats_dict = {}
for i, a in enumerate(links):
    href = a.get('href') or ''
    if 'https://www.cian.ru/rent/flat' in href and '/cat.php?' not in href:
        print(href)
        flats_dict[href] = i
        print()

In [None]:
# Display the node the links above were extracted from.
offer

In [None]:
# Fetch a single listing page.
# NOTE(review): get_html_page comes from the star-import of Utils; presumably it returns
# a parsed BeautifulSoup document (the cells below call find_all on it) — confirm.
url = 'https://www.cian.ru/rent/flat/263160487/'
soup = get_html_page(url)

In [None]:
# Look for <div> elements with the class 'fotorama__img'.
soup.find_all('div', attrs={'class': 'fotorama__img'})

In [None]:
# Look for <content> elements.
soup.find_all('content')

In [None]:
# NOTE(review): displays the whole parsed page — very large output; consider clearing it before sharing.
soup

In [None]:
# Open a listing page in headless Chrome so the rendered DOM can be queried.
# `options=` is used instead of the deprecated `chrome_options=` keyword of webdriver.Chrome.
# NOTE(review): the driver is never quit in this notebook; call driver.quit() once done
# to avoid leaving a Chrome process behind.
url = 'https://www.cian.ru/rent/flat/263424762/'
options = webdriver.ChromeOptions()
options.add_argument('headless')
driver = webdriver.Chrome(options=options)
driver.get(url)


In [None]:
driver.find_elements_by_name('a')

In [None]:
images_list = driver.find_elements_by_class_name("fotorama__img")

In [None]:
images_list

In [None]:
images_links = [x.get_attribute("src") for x in images_list if "-2." in x.get_attribute("src")]

In [None]:
images = []
for image in images_links:
    link = image.replace("-2.", "-1.")
    images.append(link)

In [None]:
images

In [None]:
import cv2
import numpy as np
from urllib.request import urlopen
from matplotlib import pyplot as plt

def pseudo_download_image(url):
    """Fetch an image over HTTP and decode it in memory, without writing it to disk.

    Args:
        url: Address of the image to fetch.

    Returns:
        Whatever ``cv2.imdecode`` produces for the downloaded bytes when called
        with ``cv2.IMREAD_COLOR``.
    """
    print(f'[INFO] Downloading {url}')
    # Close the HTTP response deterministically instead of leaking the connection.
    with urlopen(url) as resp:
        raw = resp.read()
    # View the payload as a flat uint8 buffer; no intermediate bytearray copy is needed.
    buffer = np.frombuffer(raw, dtype="uint8")
    return cv2.imdecode(buffer, cv2.IMREAD_COLOR)


In [None]:
# Try the in-memory downloader on one hard-coded image URL from the cian CDN.
img = pseudo_download_image('https://cdn-p.cian.site/images/57/333/411/kvartira-moskva-1y-krasnogvardeyskiy-proezd-1143337524-2.jpg')

In [None]:

import asyncio
import yarl
from typing import NamedTuple, Sequence, Optional

import aiohttp
    
import enum


In [None]:
# nest_asyncio patches asyncio so that loop.run_until_complete() can be called while an
# event loop is already running (as in a notebook kernel); CianClient.request and
# Search.__next__ below rely on run_until_complete.
import nest_asyncio
nest_asyncio.apply()

In [None]:
class CianException(Exception):
    """Base class for exceptions"""


class CianStatusException(CianException):
    """Incorrect status in response from cian server.

    Raised by ``CianClient.a_request``; it has to be a real class (not commented
    out), otherwise that ``raise`` fails with ``NameError``.

    Attributes are documented inline: ``status`` keeps the raw status value.
    """

    def __init__(self, status):
        super().__init__(f"Status in response from cian is not 'ok'. Status: {status}")
        # Keep the raw status so callers can inspect it without parsing the message.
        self.status = status



class Region(enum.Enum):
    """Region codes; the member value is sent in the ``region`` field of a search query."""
    MOSCOW = 1
    SPB = 2


class AdType(enum.Enum):
    """Kinds of advertisement; the member value is sent as the ``_type`` field of a search query."""
    FLAT_SALE = "flatsale"
    HOME_SALE = "suburbansale"
    FLAT_RENT = "flatrent"
    HOME_RENT = "suburbanrent"
    COMMERCIAL_SALE = "commercialsale"
    COMMERCIAL_RENT = "commercialrent"


class Room(enum.Enum):
    """Room-count / layout codes; member values are sent in the ``room`` filter of a search query."""
    ROOM = 0
    ONE_ROOMED = 1
    TWO_ROOMED = 2
    THREE_ROOMED = 3
    FOUR_ROOMED = 4
    FIVE_ROOMED = 5
    SIX_ROOMED = 6
    FREE_LAYOUT = 7
    PART_FLAT = 8
    STUDIO = 9


class BuildingStatus(enum.Enum):
    """Building status codes; the member value is sent in the ``building_status`` filter."""
    NEW = 1
    OLD = 2


class ObjectType(enum.Enum):
    """Object type codes; member values are sent in the ``object_type`` filter."""
    HOUSE = 1
    HOUSE_PART = 2
    AREA = 3
    TOWNHOUSE = 4


class Advertiser(enum.Enum):
    """Advertiser codes; the member value is sent in the ``suburban_offer_filter`` filter."""
    DEVELOPER = 1
    OWNER_AND_AGENT = 2
    
    
class CianClient:
    """Small client for the cian.ru offer-search endpoint with async and blocking calls.

    The client owns one ``aiohttp.ClientSession`` and drives it on the event loop
    returned by ``asyncio.get_event_loop()`` at construction time. That loop is
    shared with other objects (``Search`` fetches the same loop), so the client
    releases only its session and never closes the loop.
    """

    API_URL = yarl.URL("https://api.cian.ru/search-offers/v2/search-offers-desktop/")

    def __init__(self):
        # Grab the loop first so the attributes needed for cleanup exist even if
        # creating the session raises.
        self._loop = asyncio.get_event_loop()
        self._session = aiohttp.ClientSession()

    def close(self):
        """Close the HTTP session. Safe to call repeatedly or on a half-built client."""
        session = getattr(self, "_session", None)
        loop = getattr(self, "_loop", None)
        if session is None or loop is None:
            return
        if session.closed or loop.is_closed():
            return
        loop.run_until_complete(session.close())

    def __del__(self):
        # Only the session is released here. Closing the loop (as was done before)
        # would break every other user of the same loop, including a notebook kernel.
        try:
            self.close()
        except Exception:
            # A finalizer may run at interpreter shutdown or while the loop cannot
            # be re-entered; there is nothing useful to do with the error here.
            pass

    async def a_request(self, data: dict):
        """POST ``data`` as the ``jsonQuery`` payload and return the reply's ``data`` field.

        Args:
            data: Query dictionary, e.g. the one built by ``Search._get_request_data``.

        Returns:
            The value stored under ``"data"`` in the JSON reply.

        Raises:
            CianStatusException: if the reply's ``"status"`` is not ``"ok"``.
                That name must be defined in the notebook namespace before this
                branch is reached.
        """
        request_args = {
            "url": self.API_URL,
            "json": {"jsonQuery": data},
        }
        async with self._session.post(**request_args) as response:
            payload = await response.json()
            status = payload["status"]
            if status != "ok":
                raise CianStatusException(status)
            return payload["data"]

    def request(self, data: dict):
        """Blocking wrapper around :meth:`a_request`, run on the client's event loop."""
        return self._loop.run_until_complete(self.a_request(data=data))

    def search(self, *args, **kwargs):
        """Create a ``Search`` bound to this client; arguments are forwarded unchanged."""
        return Search(self, *args, **kwargs)
    


class Search:
    """Lazy, page-by-page iterator over search results.

    Works both with ``for`` (blocking, via the event loop) and ``async for``.
    Pages are requested one at a time through ``client.a_request`` and iteration
    stops when a page comes back with no offers.

    Args:
        client: Object exposing ``async a_request(data)`` (normally a ``CianClient``).
        region: Region to search in (its ``.value`` is sent).
        ad_type: Advertisement type (its ``.value`` is sent as ``_type``).
        rooms: Optional room filters.
        building_status: Optional building status filter.
        object_types: Optional object type filters.
        advertiser: Optional advertiser filter.
    """

    # Annotations are written as strings so the class does not depend on the
    # order in which the surrounding definitions were executed.
    def __init__(
            self,
            client: "CianClient",
            region: "Region",
            ad_type: "AdType",
            rooms: Sequence["Room"] = (),
            building_status: Optional["BuildingStatus"] = None,
            object_types: Sequence["ObjectType"] = (),
            advertiser: Optional["Advertiser"] = None,
    ):
        self._client = client
        self._loop = asyncio.get_event_loop()
        self._cache_results = []
        self._results_count = None
        # Set once an empty page has been received; keeps the iterator exhausted
        # without issuing further requests.
        self._exhausted = False

        self.region = region
        self.ad_type = ad_type
        self.rooms = rooms
        self.building_status = building_status
        self.object_types = object_types
        self.advertiser = advertiser

        self.page = 0

    def __len__(self):
        """Return the ``offerCount`` reported by the server.

        If nothing has been fetched yet, the first page is requested (blocking)
        so that an integer is always returned instead of ``None``.
        """
        if self._results_count is None:
            self._loop.run_until_complete(self._fetch_next_page())
        return self._results_count

    def __aiter__(self):
        # Must be a plain method: ``async for`` expects the async iterator itself,
        # not a coroutine that resolves to it.
        return self

    def __iter__(self):
        return self

    async def _fetch_next_page(self):
        """Request the next page and store its offers in the local cache."""
        self.page += 1
        try:
            response_data = await self._client.a_request(self._get_request_data())
        except BaseException:
            # Do not skip a page when the request fails or is cancelled.
            self.page -= 1
            raise
        self._results_count = response_data["offerCount"]
        self._cache_results = [
            self._get_result(item) for item in response_data["offersSerialized"]
        ]
        if not self._cache_results:
            self._exhausted = True

    async def __anext__(self):
        """Return the next offer, fetching another page when the cache is empty.

        Raises:
            StopAsyncIteration: when a fetched page contains no offers.
        """
        if not self._cache_results:
            if not self._exhausted:
                await self._fetch_next_page()
            if not self._cache_results:
                raise StopAsyncIteration
        return self._cache_results.pop(0)

    def __next__(self):
        """Blocking counterpart of :meth:`__anext__` for use in plain ``for`` loops."""
        try:
            return self._loop.run_until_complete(self.__anext__())
        except StopAsyncIteration:
            # Translate the async end-of-iteration signal for the sync protocol.
            raise StopIteration from None

    @staticmethod
    def _get_result(data):
        """Convert one raw offer record; currently returns it unchanged."""
        return data

    def _get_request_data(self):
        """Build the query dictionary for the current page and the configured filters."""
        data = {
            "_type": self.ad_type.value,
            "engine_version": {"type": "term", "value": 2},
            "page": {"type": "term", "value": self.page},
            # List-valued filter: same {"type": "terms", "value": [...]} shape as the
            # other list-valued filters below (the key used to be misspelled "types").
            "region": {"type": "terms", "value": [self.region.value]},
        }
        if self.rooms:
            data["room"] = {
                "type": "terms",
                "value": [room.value for room in self.rooms],
            }
        if self.building_status:
            data["building_status"] = {"type": "term", "value": self.building_status.value}

        if self.object_types:
            data["object_type"] = {
                "type": "terms",
                "value": [object_type.value for object_type in self.object_types],
            }
        if self.advertiser:
            data["suburban_offer_filter"] = {"type": "term", "value": self.advertiser.value}

        return data


In [None]:
# Blocking usage example: create a client and take only the first offer of a
# Moscow flat-rent search (the loop breaks after one item).
cian_client = CianClient()

for offer in cian_client.search(region=Region.MOSCOW, ad_type=AdType.FLAT_RENT):
    print(offer)
    break

In [None]:
# Inspect which fields the fetched offer record carries.
# NOTE(review): `offer` is also used in earlier cells for an object with find_all();
# reusing the name for a different kind of value makes out-of-order runs confusing.
offer.keys()
