In [2]:
import requests
import csv
import json
import random
from time import sleep

from tqdm.notebook import tqdm
from bs4 import BeautifulSoup
import pandas as pd

urls = pd.read_csv("kijiji_listings.csv")["0"]

In [23]:
# Main Scrape
def getData(url, req=None):
    if req == None:
        req = requests.get("https://kijiji.ca" + url)
    info = {}

    soup = BeautifulSoup(req.text)
    cost_text = soup.select("div[class^='priceWrapper'] > span")[0].text.replace(",", "")
    try:
        info["cost"] = int(cost_text.lstrip("$"))
    except:
        info["cost"] = -1

    info["address"] = soup.select("span[itemprop='address']")[0].text
    info["unitType"] = soup.select("svg:has(> use[xlink\:href='#icon-attributes-unittype']) + span")[0].text
    info['bedrooms'] = soup.select("span:contains('Bedrooms: ')")[0].text[len("Bedrooms: "):]
    info['bathrooms'] = float(soup.select("span:contains('Bathrooms: ')")[0].text[len("Bathrooms: "):])

    info["hydro"] = int(bool(soup.select("svg[aria-label='Yes: Hydro']")))
    info["heat"] = int(bool(soup.select("svg[aria-label='Yes: Heat']")))
    info["water"] = int(bool(soup.select("svg[aria-label='Yes: Water']")))

    info["wifi_info"] = soup.select("h4:contains('Wi-Fi and More') + ul")[0].text
    info["parking"] = soup.select("dt:contains('Parking Included') + dd")[0].text
    info["agreement"] = soup.select("dt:contains('Agreement Type') + dd")[0].text
    info["pet"] = int(bool(soup.select("dt:contains('Pet Friendly') + dd")[0].text == "Yes"))

    size_text = soup.select("dt:contains('Size (sqft)') + dd")[0].text.replace(",", "")
    try:
        info["size"] = int(size_text)
    except:
        info["size"] = -1
    info["furnished"] = int(bool(soup.select("dt:contains('Furnished') + dd")[0].text == "Yes"))
    info["appliances"] = soup.select("h4:contains('Appliances') + ul")[0].get_text(separator=',')
    info['ac'] = int(bool(soup.select("dt:contains('Air Conditioning') + dd")[0].text == "Yes"))
    info['outdoor'] = soup.select("h4:contains('Personal Outdoor Space') + ul")[0].get_text(separator=',')
    info['smoking'] = int(bool(soup.select("dt:contains('Smoking Permitted') + dd")[0].text == "Yes"))
    
    amenities_text = soup.select("h4:contains('Amenities') + ul")
    if amenities_text:
        info['amenities'] = amenities_text[0].get_text(separator=',')
    else:
        info['amenities'] = "Not Included"

    return info

# df = pd.DataFrame()
url = urls[1]

def repeatGetData(url, wait=30):
    req = requests.get("https://kijiji.ca" + url)
    try:
        return getData(url, req)
    except Exception as e:
        open("index.html", "w").write(req.text)
        if req.text.find("Hmm... Apparently this page no longer exists.") > 0:
            print(f"Page https://kijiji.ca{url} no longer exists")
            return None
        print(f"- Failed for url https://kijiji.ca{url}, with error {e} and request status {req}, waiting {wait}s")
        sleep(wait)
        return repeatGetData(url, wait=wait + 30)

fprintData = lambda url: print(json.dumps(repeatGetData(url), indent=4))
failedURLs = []

# skipped 649, 701
for url in tqdm(urls[702:]):
    data = repeatGetData(url)
    sleep(5)
    if data == None:
        continue
    data["url"] = url
    df = df.append(pd.DataFrame(data, index=[0]))
    df.to_csv("kijiji_listing_data.csv")

df

  0%|          | 0/151 [00:00<?, ?it/s]

Unnamed: 0,cost,address,unitType,bedrooms,bathrooms,hydro,heat,water,wifi_info,parking,agreement,pet,size,furnished,appliances,ac,outdoor,smoking,amenities,url
0,933,"22-41 Munroe Pl., Regina, SK, S4S 6A7",Apartment,2,1.0,0,1,1,Cable / TV,1,1 Year,1,860,0,"Laundry (In Building),Fridge / Freezer",1,Balcony,1,"24 Hour Security,Storage Space",/v-apartments-condos/regina/2-bedroom-22-41-mu...
0,850,"2175 Rae Street, Regina, SK, S4T 2E8",Apartment,Bachelor/Studio,1.0,0,1,1,Not Included,0,1 Year,1,380,0,"Laundry (In Building),Fridge / Freezer",1,Not Included,0,Not Included,/v-apartments-condos/regina/beautiful-bachelor...
0,799,"1424 Victoria Ave, Regina, SK, S4P 0P3",Apartment,1,1.0,0,1,1,Not Included,0,1 Year,1,520,0,Fridge / Freezer,0,Not Included,0,Not Included,/v-apartments-condos/regina/downtown-apartment...
0,814,"3864 Rae Street, Regina, SK, S4S 3A1",Apartment,1,1.0,0,1,1,Not Included,0,1 Year,1,675,0,Fridge / Freezer,0,Not Included,0,Not Included,/v-apartments-condos/regina/parliament-place-a...
0,1030,"2240 Robinson Street, Regina, SK, S4T 2P9",Apartment,2,1.0,0,1,1,Not Included,0,1 Year,1,870,0,"Laundry (In Building),Fridge / Freezer",1,Not Included,0,Not Included,/v-apartments-condos/regina/pet-friendly-2-bed...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,-1,"3951 3rd Ave N, S4R 8K3, Regina, SK",Apartment,1,1.0,0,1,1,Not Included,1,1 Year,0,500,0,Laundry (In Building),0,Not Included,0,Elevator in Building,/v-apartments-condos/regina/silver-sage-housin...
0,895,"309 Belfast St, North Portal, SK S0C 1W0, Canada",House,2 + Den,1.0,0,0,1,Not Included,3+,Month-to-month,1,1300,0,"Laundry (In Unit),Dishwasher,Fridge / Freezer",0,Yard,0,Not Included,/v-apartments-condos/regina/house-for-rent/156...
0,900,"2030 Queen Street, Regina s4t 4c1 SK",House,3,1.0,0,0,0,Not Included,2,1 Year,0,1,0,"Laundry (In Unit),Fridge / Freezer",0,Yard,0,Not Included,/v-apartments-condos/regina/3-bedroom-house-fo...
0,1750,"Trinity Way, Regina, SK S0G 4G0, Canada",Condo,2,2.0,0,0,0,Not Included,2,1 Year,1,1400,0,"Laundry (In Unit),Dishwasher,Fridge / Freezer",1,Balcony,0,Elevator in Building,/v-apartments-condos/regina/luxury-harbour-lan...
