In [1]:
import logging
from datetime import date, time
from pathlib import Path
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

In [2]:
def get_page(link, timeout=30):
    '''
    Download a page and parse it with the lxml parser.
    :param link: str. URL of the page to fetch
    :param timeout: seconds to wait for the server before giving up
    :return: BeautifulSoup object built from the response body
    :raises requests.HTTPError: when the server answers with a 4xx/5xx status
    :raises requests.Timeout: when the server does not answer within ``timeout``
    '''
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(link, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were a listing page.
    response.raise_for_status()
    return BeautifulSoup(response.text, 'lxml')

def get_next_page(page_soup, base_url='https://www.olx.ua'):
    '''
    Return the absolute link of the next page to scrape, or None when there is
    no next page.
    :param page_soup: bs4 object of the current results page
    :param base_url: str. Site root used to resolve relative links
    :return: str link to the next page, or None
    '''
    forward = page_soup.find('a', attrs={'data-testid': 'pagination-forward'})
    if forward is None:
        return None
    href = forward.get('href')
    if not href:
        return None
    # urljoin resolves relative links against the site root and leaves links
    # that are already absolute untouched.
    return urljoin(base_url, href)

def get_data(page_soup, base_url='https://www.olx.ua'):
    '''
    Return all ids, links, prices, negotiable notes and descriptions found on a page.

    Each field is selected independently and the results are combined by
    position, so the zip stops at the shortest of the five lists.
    :param page_soup: bs4 object of a results page
    :param base_url: str. Site root used to resolve relative listing links
    :return: zip object of (id, link, price, negotiable, description) tuples
    '''
    # Resolve every listing link against the site root; links that are already
    # absolute are kept as they are.
    links = [urljoin(base_url, anchor.get('href'))
             for anchor in page_soup.select("a.css-rc5s2u")]
    # .text is used rather than .string because .string is None for a tag with
    # several children.
    descriptions = [process_description(title.text)
                    for title in page_soup.select('h6.css-16v5mdi')]
    # NOTE(review): the last matching div is discarded - presumably it is not a
    # listing card; confirm against the page markup.
    ids = [card.get('id') for card in page_soup.select('div.css-1sw7q4x')][:-1]
    prices, negotiable = [], []
    for price_tag in page_soup.select('p.css-10b0gli'):
        # Split at the first dot only: text without a dot gives an empty
        # negotiable note instead of breaking the whole page.
        price, _, note = price_tag.text.partition('.')
        prices.append(price)
        negotiable.append(note)
    return zip(ids, links, prices, negotiable, descriptions)

def get_all_pages(category, city):
    '''
    Scrape every results page of a category for the given city.
    :param category: string. URL of the whole category without city specification
    :param city: string. Spelled the same way as in the site links; it is
        appended to ``category`` to form the first page URL
    :return: list with the rows produced by get_data for each page, in page order
    '''
    collected = []
    current_url = category + city
    while True:
        # get_next_page yields a falsy value once the last page is reached
        if not current_url:
            return collected
        page_soup = get_page(current_url)
        collected += list(get_data(page_soup))
        current_url = get_next_page(page_soup)

def process_description(post):
    '''
    Clean post text: join it into a single string and replace newline and
    carriage-return characters with spaces.
    :param post: str or iterable of str; None is treated as empty text
    :return: str without line breaks
    '''
    # bs4's .string is None for tags with several children; treat that as empty
    if post is None:
        return ''
    post_str = "".join(post)
    return post_str.replace('\n', ' ').replace('\r', ' ')

In [3]:
try:
    # Read ids as text: the scraper produces string ids, and letting pandas
    # infer a numeric dtype would make stored and scraped ids never compare equal.
    data = pd.read_csv('Data/rent_links.csv', dtype={'id': str})
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No stored data yet (first run or empty file): start from an empty table.
    # Any other read error is left to propagate so that a damaged file is not
    # silently replaced when the results are written back.
    data = pd.DataFrame([], columns=['id', 'url', 'price', 'negotiable', 'description', 'city', 'date'])
# Category URL; the city name is appended to it to build the first page link.
base_url = 'https://www.olx.ua/uk/nedvizhimost/kvartiry/dolgosrochnaya-arenda-kvartir/'
cities = ['Kiev', 'Lvov', 'Odessa', 'Kharkov', 'Dnepr']
today = date.today()
new_data = pd.DataFrame()

In [4]:
# Gather the rows of every city first and build the frame once: this avoids
# re-copying the frame on each iteration and makes the cell give the same
# result when it is re-run, because it no longer appends to the old new_data.
records = []
for city in tqdm(cities):
    pages = get_all_pages(base_url, city)
    records.extend((*row, city) for row in pages)
new_data = pd.DataFrame(
    records,
    columns=['id', 'url', 'price', 'negotiable', 'description', 'city'],
)
new_data['date'] = today

  0%|          | 0/5 [00:00<?, ?it/s]

In [5]:
# Keep only the scraped rows whose id is not already present in the stored data.
# Scraped ids are strings taken from HTML attributes, while ids loaded from the
# CSV may have been parsed as numbers, so ids are also compared in text form.
already_stored = (
    new_data['id'].isin(data['id'])
    | new_data['id'].astype(str).isin(data['id'].astype(str))
)
upd_data = new_data[~already_stored]

In [6]:
# Append the new rows to the stored ones, keeping the first row of each id.
result = pd.concat([data, upd_data], ignore_index=True).drop_duplicates(subset='id')
# On a first run the output folder may not exist yet; to_csv does not create it.
Path('Data').mkdir(parents=True, exist_ok=True)
result.to_csv('Data/rent_links.csv', index=False)

In [7]:
# Number of listings added to the stored file by this run.
qty = len(upd_data)
# Note: basicConfig has no effect if the root logger already has handlers,
# e.g. when this cell is run a second time in the same kernel.
logging.basicConfig(filename='links.log', encoding='utf-8', level=logging.DEBUG)
# Pass qty as a logging argument so the message is only formatted when emitted.
logging.info("File updated successfully. %s new entries added", qty)

In [8]:
# Display how many new entries this run added (the length of upd_data).
qty

6490