# Web scraping using BeautifulSoup

Web scraping experiment using BeautifulSoup.
Actions performed:
- Read all house postings according to the selected filter (e.g., rentals in Montreal)
- Read 10 pages of search results, clean the text, and put it in a DataFrame
- Save the result to a CSV file

In [6]:
from bs4 import BeautifulSoup
import requests
import urllib.request
import pandas as pd
from pandas import DataFrame
import datetime
import os

def _extract_text(listing, tag_name, css_class):
    """Return the text of the first matching child tag, or "" if it is absent."""
    node = listing.find(tag_name, {'class': css_class})
    return node.get_text() if node is not None else ""


def _normalize(text, drop):
    """Strip *text*, remove every *drop* substring and collapse whitespace runs."""
    return ' '.join(text.strip().replace(drop, '').split())


def pagebot(num_pages=10, output_path='HousingMontrealScrapping.csv'):
    """Scrape Kijiji rental listings for Montreal and save them to a CSV file.

    Downloads ``num_pages`` search-result pages, extracts title, price,
    bedrooms, location and description from every listing, and writes the
    rows to ``output_path``. If the file already exists the rows are
    appended without a header; otherwise a new file with a header is created.

    Args:
        num_pages: Number of result pages to read, starting at page 1.
        output_path: Path of the CSV file to create or append to.

    Returns:
        None. The result is the CSV file written to ``output_path``.
    """
    # Search filtered only by rentals in Montreal.
    # Full url - https://www.kijiji.ca/b-a-louer/ville-de-montreal/page-1/c30349001l1700281
    base_url = 'https://www.kijiji.ca/b-a-louer/ville-de-montreal/page-'
    url_separator = '/c30349001l1700281'
    columns = ['House Poster', 'Price', '# of Bedrooms', 'Location', 'Description', 'Record ts']
    # (tag name, CSS class, substring removed during cleaning), in column order.
    fields = [
        ('div', 'title', '\n'),
        ('div', 'price', '$'),
        ('span', 'bedrooms', '\n'),
        ('div', 'location', '\n'),
        ('div', 'description', '\n'),
    ]
    record_date = datetime.date.today()
    rows = []

    # The example URL above uses page-1 as the first page, so count from 1
    # (the previous range(10) requested page-0 and never reached page 10).
    for page_num in range(1, num_pages + 1):
        url = base_url + str(page_num) + url_separator
        # A timeout keeps a stalled connection from hanging the scrape forever.
        response = requests.get(url, timeout=30)
        if not response.ok:
            # An error page contains no listings; skip it.
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        # Lambda so that only <div> tags whose class list is exactly
        # ['clearfix'] match (not divs that merely include that class).
        listings = soup.find_all(
            lambda tag: tag.name == 'div' and tag.get('class') == ['clearfix']
        )

        for listing in listings:
            # Some listings omit fields (bedrooms, for instance); a missing
            # field becomes an empty string instead of aborting the run.
            record = [
                _normalize(_extract_text(listing, tag_name, css_class), drop)
                for tag_name, css_class, drop in fields
            ]
            record.append(record_date)
            rows.append(record)

    df = pd.DataFrame(rows, columns=columns)
    # Check the file that is actually written, so repeated runs append to it
    # rather than overwriting the earlier data.
    if not os.path.isfile(output_path):
        df.to_csv(output_path)
    else:
        df.to_csv(output_path, mode='a', header=False)



In [None]:
# Run the scraper: downloads the search-result pages and saves the listings to CSV.
pagebot()