# Data Crawling
This small project shows two methods of data crawling based on selenium and beautifulsoup.

In [24]:
import numpy as np
import pandas as pd

from selenium import webdriver
from selenium.common import NoSuchElementException
import time

#We know that BeautifulSoup is also a powerful pachage in data crawling, hence we'll compare it with selenium to discuss their differences.
from bs4 import BeautifulSoup
import lxml

import re

import matplotlib.pyplot as plt
from pylab import mpl

Let's crawl the data of second-hand house from https://esf.fang.com/ as a prerequisite of further data analyses.

In [25]:
#Enter the targeted website through selenium.
driver = webdriver.Edge()
#To access the target url
url = "https://esf.fang.com/"
driver.get(url)

In [26]:
#Enter the page of targeted region, the region I was allocated is Guanyin Temple in Daxing.
address1 = '大兴'
address2 = '观音寺'
driver.find_element('partial link text',address1).click()
driver.find_element('partial link text',address2).click()

Through self learning I know there is a package called BeautifulSoup,</br>
while Selenium could interact with the web page which enable it to scape dynamic web pages,</br>
BeautifulSoup could scape a static web page more efficiently.</br>
Hence I'll construct two functions, while one is contructed on Selenium individually and another combines Selenium and BeautifulSoup, and compare their efficiency.
#### Selenium Edition

In [27]:
def get_data(driver):
    #Get the dataset with the same class name
    basic_info = driver.find_elements('class name','tel_shop')
    loc_info = driver.find_elements('class name','add_shop')
    price_info = driver.find_elements('class name','price_right')
    info_list = []
    
    #Split the information from different houses
    for i in range(len(basic_info)):
        #The location and the community of houses
        title,loc = loc_info[i].text.split('\n')
        block = loc.split('-')[0]

        #Several basic information of the houses, use regular expressions to help me get the them while some might be absent.
        basic = basic_info[i].text.replace('|',' ')
        size = re.findall('\d+[.\d+]{0,}㎡',basic)[0].strip('㎡') if re.findall('\d+㎡',basic) else None
        year = re.findall('\d+年建',basic)[0].strip('年建') if re.findall('\d+年建',basic) else None

        #Price information of the houses
        total_p,per_p = price_info[i].text.split('\n')
        total_p = total_p.strip('万')
        per_p = per_p.strip('元/㎡')

        #Sort the information into a dictionary.
        house_info = {'title':title,
                      'size(㎡)':size,
                      'year':year,
                      'location':loc,
                      'total price(万)':total_p,
                      'price per square(元/㎡)':per_p,
                      'block':block}
        
        info_list.append(house_info)
        
    return pd.DataFrame(info_list)

In [28]:
%%time
house_list = pd.DataFrame()
i = 0
while i == 0:
    house_list = pd.concat([house_list,get_data(driver)])
    
    #Turn to the next page
    try:
        next_page = driver.find_element('partial link text','下一页')
        driver.execute_script("arguments[0].click();",next_page)
    except NoSuchElementException:
        break

CPU times: total: 766 ms
Wall time: 37.3 s


#### BeautifulSoup Edition

In [24]:
def get_data_s(driver):
    #Transfer the driver from Selenium to BeautifulSoup
    source = driver.page_source
    soup = BeautifulSoup(source, 'lxml')
    
    #Get all the data in one time
    all_data = soup.find(class_='shop_list shop_list_4').find_all('dl')
    info_list = []
    
    #Split the information from different houses
    for house in all_data:
        ##The location and the community of houses
        title = house.find(class_='add_shop').find('a').text
        loc = house.find(class_='add_shop').find('span').text
        block = loc.split('-')[0]
        
        #Several basic information of the houses, use regular expressions to help me get the them while some might be absent.
        basic = house.find(class_='tel_shop').text.replace('\n','').replace('\t','').replace('|',' ')
        size = re.findall('\d+[.\d+]{0,}㎡',basic)[0].strip('㎡') if re.findall('\d+㎡',basic) else None
        year = re.findall('\d+年建',basic)[0].strip('年建') if re.findall('\d+年建',basic) else None
    
        #Price information of the houses
        price = house.find(class_='price_right').text.split()
        total_p = price[0]
        per_p = price[1]
        
        #Sort the information into a dictionary.
        house_info = {'title':title,
                      'size(㎡)':size,
                      'year':year,
                      'location':loc,
                      'total price(万)':total_p,
                      'price per square(元/㎡)':per_p,
                      'block':block}
        
        info_list.append(house_info)
        
    return pd.DataFrame(info_list)

In [26]:
%%time
house_list_s = pd.DataFrame()
j = 0
while j == 0:
    house_list_s = pd.concat([house_list_s,get_data_s(driver)])
    
    #Turn to the next page
    try:
        next_page = driver.find_element('partial link text','下一页')
        driver.execute_script("arguments[0].click();",next_page)
    except NoSuchElementException:
        break

CPU times: total: 1.78 s
Wall time: 17.1 s


We could see that BeautifulSoup is more efficient then Selenium in scape a static page,</br>
hence a combination of BeautifulSoup and Selenium would be more powerful.

In [29]:
house_list.head(10)

Unnamed: 0,title,size(㎡),year,location,total price(万),price per square(元/㎡),block
0,双河南里,99.36,1999,观音寺-(大兴)双河南里,330,33212,观音寺
1,海子角西里,82.72,2000,观音寺-(大兴)海子角西里,249,30101,观音寺
2,观音寺南里,84.64,2005,观音寺-(大兴)观音寺南里,275,32490,观音寺
3,观音寺小区,83.12,1994,观音寺-观音寺街20号,328,39461,观音寺
4,观音寺南里,130.04,2001,观音寺-(大兴)观音寺南里,411,31605,观音寺
5,海子角南里,77.75,2006,观音寺-(大兴)海子角南里,190,24437,观音寺
6,观音寺南里,130.04,2001,观音寺-(大兴)观音寺南里,405,31144,观音寺
7,观音寺新区,86.58,2005,观音寺-(大兴)观音寺北里,370,42735,观音寺
8,海子角南里,77.0,1995,观音寺-(大兴)海子角南里,220,28571,观音寺
9,观音寺小区,70.65,2006,观音寺-观音寺街20号,218,30856,观音寺


In [32]:
#Turn the information we crawl into a csv file and share them with the groupmates.
house_list.to_csv('观音寺二手房房价.csv',index=False)