### Remax

In [1]:
import os
import re
import time
import warnings

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser
from sqlalchemy import create_engine
# Silence every warning category for the rest of the session, then confirm
# that the setup cell ran to completion.
warnings.simplefilter('ignore')
print('Libraries imported!')

Libraries imported!


In [2]:
# Scrape the RE/MAX Calgary listing pages.
# house_address collects the text of every 'left-content flex-one' div and
# house_details the text of every 'property-details' div, in page order.
house_address = []
house_details = []

base_url = 'https://www.remax.ca/ab/calgary-real-estate?page='
urls = [base_url + str(x) for x in range(1, 301)]

for url in urls:
    # Pause between requests so the site is not hammered.
    time.sleep(5)

    # A timeout keeps one stalled connection from hanging the whole crawl, and
    # a failed page is reported and skipped instead of aborting the loop.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as err:
        print(f'Skipping {url}: {err}')
        continue

    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(response.text, 'html.parser')

    # find_all()/.text do not raise on parsed markup, so the old bare
    # try/except wrappers are gone; their single 'N/A' placeholder would
    # also have put the two lists out of step.
    addresses = soup.find_all('div', class_='left-content flex-one')
    details = soup.find_all('div', class_='property-details')

    # The two lists are later concatenated side by side by position, so flag
    # any page where the element counts disagree.
    if len(addresses) != len(details):
        print(f'Warning: {url} returned {len(addresses)} addresses '
              f'but {len(details)} detail blocks')

    house_address.extend(address.text for address in addresses)
    house_details.extend(detail.text for detail in details)

In [3]:
# Turn the raw listing strings into a numeric price plus the remaining
# address text.
address_df = pd.DataFrame(house_address)

# Split on the first two spaces only: column 1 holds the '$'-prefixed price
# and column 2 keeps the rest of the string.
# `n` must be passed by keyword: it is keyword-only in pandas >= 2.0.
new_df = address_df[0].str.split(' ', n=2, expand=True)

# regex=False makes both replacements literal on every pandas version
# ('$' is an end-of-string anchor when treated as a regular expression).
new_df["price"] = pd.to_numeric(
    new_df[1]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
)

# Drop the two helper columns; only column 2 and "price" are used downstream.
new_df = new_df.drop(columns=[0, 1])
new_df.head()

Unnamed: 0,2,price
0,"9803 ELBOW DR SW, Calgary, AB, T2V 1M4",489900
1,"106 - 790 KINGSMERE CRES SW, Calgary, AB, T2V 2G9",239900
2,"4508 16A ST SW, Calgary, AB, T2T 4L7",789900
3,"230 20 AVE NW, Calgary, AB, T2M 1C2",659900
4,"24 SHAWNEE WAY SW, Calgary, AB, T2Y 2V4",499900


In [4]:
# Sanity check: show the dtype of the converted price column.
new_df.dtypes["price"]

dtype('int64')

In [5]:
# Break the remaining address text at the literal city/province marker:
# the part before it lands in column 0 and the part after it in column 1.
final_df = new_df[2].str.split(
    pat=', Calgary, AB, ',
    expand=True,
)
final_df.head(5)

Unnamed: 0,0,1
0,9803 ELBOW DR SW,T2V 1M4
1,106 - 790 KINGSMERE CRES SW,T2V 2G9
2,4508 16A ST SW,T2T 4L7
3,230 20 AVE NW,T2M 1C2
4,24 SHAWNEE WAY SW,T2Y 2V4


In [6]:
# Put the price next to the split address columns, discard the unsplit
# address text (column 2) and give the remaining columns readable names.
df_add = pd.concat([new_df, final_df], axis="columns").drop(columns=2)
df_add.columns = [
    "price",
    "address",
    "postal_code",
]
df_add.head(5)

Unnamed: 0,price,address,postal_code
0,489900,9803 ELBOW DR SW,T2V 1M4
1,239900,106 - 790 KINGSMERE CRES SW,T2V 2G9
2,789900,4508 16A ST SW,T2T 4L7
3,659900,230 20 AVE NW,T2M 1C2
4,499900,24 SHAWNEE WAY SW,T2Y 2V4


In [7]:
# Split each scraped details string on '|' into separate columns, drop the
# piece at position 2, and name what is left.
details = pd.DataFrame(house_details)

details_df = (
    details[0]
    .str.split('|', expand=True)
    .drop(columns=2)
)
details_df.columns = [
    "bedrooms",
    "bath",
    "property_type",
]
details_df.head(5)

Unnamed: 0,bedrooms,bath,property_type
0,4 bed,2 bath,house
1,2 bed,1 bath,condo
2,1 bed,1 bath,house
3,5 bed,3 + 1 bath,house
4,3 bed,2 bath,house


In [8]:
# Join the address/price columns with the details columns by row position,
# then keep only the first occurrence of each fully identical row.
calgary_df_dup = pd.concat(
    [df_add, details_df],
    axis="columns",
)
calgary_df = calgary_df_dup.drop_duplicates(keep="first")
calgary_df.head(5)

Unnamed: 0,price,address,postal_code,bedrooms,bath,property_type
0,489900,9803 ELBOW DR SW,T2V 1M4,4 bed,2 bath,house
1,239900,106 - 790 KINGSMERE CRES SW,T2V 2G9,2 bed,1 bath,condo
2,789900,4508 16A ST SW,T2T 4L7,1 bed,1 bath,house
3,659900,230 20 AVE NW,T2M 1C2,5 bed,3 + 1 bath,house
4,499900,24 SHAWNEE WAY SW,T2Y 2V4,3 bed,2 bath,house


In [9]:
# Persist the cleaned listings (without the index) so later sections can
# reload them from disk.
calgary_df.to_csv(path_or_buf='calgary_df.csv', index=False)

----------------

### Walk Score

In [10]:
# Reload the listings written by the RE/MAX section.
calgary_df = pd.read_csv(filepath_or_buffer='calgary_df.csv')
calgary_df.head(5)

Unnamed: 0,price,address,postal_code,bedrooms,bath,property_type
0,489900,9803 ELBOW DR SW,T2V 1M4,4 bed,2 bath,house
1,239900,106 - 790 KINGSMERE CRES SW,T2V 2G9,2 bed,1 bath,condo
2,789900,4508 16A ST SW,T2T 4L7,1 bed,1 bath,house
3,659900,230 20 AVE NW,T2M 1C2,5 bed,3 + 1 bath,house
4,499900,24 SHAWNEE WAY SW,T2Y 2V4,3 bed,2 bath,house


In [11]:
# One entry per listing row, in row order (repeated postal codes are kept).
post_code_list = list(calgary_df["postal_code"])

In [12]:
# Look up walk / bike / transit scores on walkscore.com for every postal code.
# Each of the three lists receives exactly one entry per item in
# post_code_list, so they always stay the same length.
scores_walk = []
scores_bike = []
scores_transit = []


def _extract_badge_score(page_html, kind):
    """Return the score found in the page's badge URL for `kind`.

    `kind` is 'walk', 'bike' or 'transit'. The result is the run of digits
    that follows 'pp.walk.sc/badge/<kind>/score/' in `page_html`, as a
    string, or 'N/A' when no such badge URL is present.
    """
    # Capture the whole digit run: a fixed two-character slice would
    # truncate a score of 100 to "10".
    match = re.search(
        r'pp\.walk\.sc/badge/' + re.escape(kind) + r'/score/(\d+)',
        page_html,
    )
    return match.group(1) if match else 'N/A'


def _fetch_scores(postal_code):
    """Download one score page and return (walk, bike, transit) strings."""
    url_score = "https://www.walkscore.com/score/" + postal_code.replace(" ", "%20")
    # Pause before every real request so the site is not hammered.
    time.sleep(5)
    response = requests.get(url_score, timeout=30)

    # Parse HTML with Beautiful Soup and serialise it once for all three lookups.
    page_html = str(BeautifulSoup(response.text, 'html.parser'))
    return tuple(
        _extract_badge_score(page_html, kind)
        for kind in ('walk', 'bike', 'transit')
    )


# Successful lookups are remembered, so a postal code shared by several
# listings is downloaded (and slept for) only once.
score_cache = {}

for i in post_code_list:
    try:
        if i not in score_cache:
            score_cache[i] = _fetch_scores(i)
        ws, bs, ts = score_cache[i]
    except Exception as err:
        # Best effort: a missing/non-string postal code or a failed request
        # yields placeholders. `Exception` (not a bare except) lets
        # KeyboardInterrupt stop the crawl.
        print(f'Could not get scores for {i!r}: {err}')
        ws = bs = ts = 'N/A'

    # Append once per postal code, after all three values are known, so a
    # failure part-way through can never leave the lists with different lengths.
    scores_walk.append(ws)
    scores_bike.append(bs)
    scores_transit.append(ts)

In [14]:
# Assemble the per-listing score lists into a table and collapse rows that
# are identical in every column.
score_df_trans = dict(
    postal_code=post_code_list,
    walk_score=scores_walk,
    bike_score=scores_bike,
    transit_score=scores_transit,
)
score_df_dup = pd.DataFrame(data=score_df_trans)
score_df = score_df_dup.drop_duplicates(keep="first")
score_df.head(5)

Unnamed: 0,postal_code,walk_score,bike_score,transit_score
0,T2V 1M4,58,61,55
1,T2V 2G9,57,75,57
2,T2T 4L7,38,82,38
3,T2M 1C2,77,86,56
4,T2Y 2V4,19,60,45


In [15]:
# Persist the score table (without the index) for the database section.
score_df.to_csv(path_or_buf='score_df.csv', index=False)

-------------------

### SQL

In [16]:
# Reload both CSV files written by the earlier sections.
calgary_df, score_df = (
    pd.read_csv(csv_name) for csv_name in ('calgary_df.csv', 'score_df.csv')
)

In [17]:
# Write both tables to PostgreSQL, replacing any existing table of the same name.
#
# The "user:password@host:port/database" part of the URL can be supplied via
# the REALESTATE_DB_CONNECTION environment variable, so real credentials do
# not have to live in the notebook. The fallback is the previous hard-coded
# value, which keeps an existing setup working unchanged.
rds_connection_string = os.environ.get(
    "REALESTATE_DB_CONNECTION",
    "postgres:1@localhost:5432/realestate_db",
)
engine = create_engine(f'postgresql://{rds_connection_string}')

calgary_df.to_sql(name= "calgary_df", con=engine, if_exists="replace", index=False)
score_df.to_sql(name= "score_df", con=engine, if_exists="replace", index=False)

In [18]:
import pymongo
from pymongo import MongoClient

In [19]:
# Render each frame as an HTML table string.
calgary_df_html, score_df_html = calgary_df.to_html(), score_df.to_html()

### MongoDB

In [20]:
# Connect to the local MongoDB server and select the target database.
conn = 'mongodb://localhost:27017'
client = MongoClient(host=conn)
db = client['realestate_db']

# Listings: one document per DataFrame row.
calgary_dict = calgary_df.to_dict(orient='records')
collection = db['calgary_df_html']
collection.insert_many(calgary_dict)

# Scores: one document per DataFrame row.
score_dict = score_df.to_dict(orient='records')
collection = db['score_df_html']
collection.insert_many(score_dict)

<pymongo.results.InsertManyResult at 0x2957256a908>