In [4]:
import scrapy
from scrapy import Selector
from datetime import date
import requests
import pandas as pd
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook
import time

In [5]:
def parse_price(str):
    """This function parses property price from str format to numeric. 
    It also strips price from spaces, currency id and dots

    Args:
        str (string): property price in string format scraped from ad

    Returns:
        int: property price in Ukrainian Hryvnias

    Examples:
        >>> parse_price('12 000 грн.')
        12000
    """
    str_nocurrency = str.replace('грн.', '')
    str_clean = str_nocurrency.replace(' ', '').replace('.', '')
    return int(str_clean)

def parse_tags(tags):
    """Takes list of tags and retrieves 
    property details: apartment area, kitchen area, rooms quantity, apartment floor, building height 
    and posting details: was the ad posted by private person or business

    Args:
        tags (list): list of tags scraped from ad

    Returns:
        dict: dictionaty of parsed tags
    """
    tags_dict={'other_tags':[]}
    for tag in tags:
        if tag == 'Бизнес':
            tags_dict['posted_by']='Business'
        elif tag == 'Частное лицо':
            tags_dict['posted_by']='Private'
        elif tag[:9] == "Этажность":
            try:
                tags_dict['building_height_floors']=int(tag.replace('Этажность: ', ''))
            except:
                tags_dict['other_tags'].append(tag)
        elif tag[:4] == "Этаж":
            try:
                tags_dict['floor']=int(tag.replace('Этаж: ', ''))
            except:
                tags_dict['other_tags'].append(tag)
        elif tag[:5] == 'Общая':
            try:
                tags_dict['apt_area_sqm']=float(tag.replace('Общая площадь: ', '').replace(' м²',''))
            except:
                tags_dict['other_tags'].append(tag)
        elif tag[:13] == 'Площадь кухни':
            try:
                tags_dict['kitchen_area_sqm'] = float(tag.replace('Площадь кухни: ', '').replace(' м²',''))
            except:
                tags_dict['other_tags'].append(tag)
        elif tag[:17] == 'Количество комнат':
            tags_dict['rooms'] = int(tag.replace('Количество комнат: ', '')[0])
        else:
            tags_dict['other_tags'].append(tag)
    return tags_dict  

def get_post(url):
    """This function takes link to ad and returns ad text and tags

    Args:
        url (str): link to ad

    Returns:
        tuple: list of tags, ad text
    """
    #retrieves data: text, and tags
    property = requests.get(url).text
    sel = Selector(text=property)
    tags_xpath = '//*[@id="root"]/div[1]/div[3]/div[2]/div[1]/div/ul/li/p/text()'
    text_xpath = '//*[@id="root"]/div[1]/div[3]/div[2]/div[1]/div/div[8]/div/text()'
    tags = sel.xpath(tags_xpath).extract()
    text = sel.xpath(text_xpath).extract()
    return (tags, process_post(text))
   
def process_post(post):
    """Takes post text as string or list. Combines it i single string and removes special characters

    Args:
        post (list or str): Olx ad post text

    Returns:
        str: cleaned olx ad post text
    """
    #Cleans post text. Joins into single string, removes /n
    post_str = "".join(post)
    return post_str.replace('\n', ' ').replace('\r', ' ')

def strip_links(links):
    """Removes trailing sharp and special information from the link

    Args:
        links (string): Standard olx link. Looks like this:
        https://www.olx.ua/d/uk/obyavlenie/sdam-2k-kvartiru-na-ul-knyazhiy-zaton-v-5-min-metro-osokorki-IDNC3Oo.html#a87b8ce1bb;promoted

    Returns:
        str: clean olx link

    Examples:
        >>> strip_links('https://www.olx.ua/d/uk/obyavlenie/sdam-2k-kvartiru-na-ul-knyazhiy-zaton-v-5-min-metro-osokorki-IDNC3Oo.html#a87b8ce1bb;promoted')
        'https://www.olx.ua/d/uk/obyavlenie/sdam-2k-kvartiru-na-ul-knyazhiy-zaton-v-5-min-metro-osokorki-IDNC3Oo.html'
    """
    return [st.split('#',1)[0] for st in links]

def get_links(page):
    """This functions takes content of olx summary page or search results and returns tuples of links and prices for each object on the page

    Args:
        page (str): Content of olx page

    Returns:
        zip: zip object generaiting tuples of (link, price) for each property shown on page
    """
    sel = Selector(text=page)
    item_xpath = '//*[@id="offers_table"]/tbody/tr/td/div[@class="offer-wrapper"]'
    item_link_xpath='.//h3/a/@href'
    item_price_xpath='./table/tbody/tr[1]/td[3]/div/p/strong/text()'
    links=sel.xpath(item_xpath).xpath(item_link_xpath).extract()
    prices=sel.xpath(item_xpath).xpath(item_price_xpath).extract()
    prices_clean = map(parse_price, prices)
    links_clean=strip_links(links)
    return zip(links_clean, prices_clean)

def get_pages_qty(page):
    """This function takes search results page and returns quantity of pages in search

    Args:
        page (str): content of the search page

    Returns:
        int: quantity of pages in search
    """
    sel = Selector(text=page)
    link_xpath='//*[@id="body-container"]/div[3]/div/div[6]/span[16]/a/span/text()'
    return int(sel.xpath(link_xpath).extract_first())


In [8]:
base_url = 'https://www.olx.ua/nedvizhimost/kvartiry/dolgosrochnaya-arenda-kvartir/' #Starting link
today = date.today()
try:
    data = pd.read_csv('Data/rent_offers.csv') #reading file
except:
    data=pd.DataFrame([],columns=['url', 'price', 'city', 'tags', 'description']) # empty df, used for the first run
new_data=[]
cities=['Kiev', 'Lvov', 'Odessa', 'Kharkov', 'Dnepr']

In [9]:
for city in cities:
    city_url = base_url + city + '/?page='
    r=requests.get(city_url+'1')
    last_page = get_pages_qty(r.text)
    for page in tqdm_notebook(range(1, last_page+1), desc = 'Scrapping ' + city):
        url = city_url + str(page)
        pages = get_links(requests.get(url).content)
        for url, price in pages:
            if url in data.url.values: #checking if record already esists
                if data[data.url == url].price.values[0] != price:
                    data[data.url == url].price = price
            else:
                new_data.append([url, price, city, today, *get_post(url)])

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

In [5]:
upd_data = pd.DataFrame(new_data, columns=['url', 'price','city', 'date', 'tags', 'description']) #converting to Dataframe 
tags_df = pd.DataFrame(upd_data.tags.map(parse_tags).tolist()) #parsing tags
parsed_data = pd.concat([upd_data, tags_df], axis=1).drop(['tags'], axis=1) #join together & drop initial tags

In [6]:
result=pd.concat([data,parsed_data],ignore_index=True)
result.to_csv('Data/rent_offers.csv', index=False)