## My Project - Condos.CA

In [2]:
# imports

from bs4 import BeautifulSoup # For HTML parsing
import requests # Website connections
from time import sleep # To prevent overwhelming the server between connections
from collections import Counter # Keep track of our term counts
import pandas as pd # For converting results to a dataframe and bar chart plots
import json # For parsing json
%matplotlib inline

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome('/Users/karansingh/WeCloud/Python/Lecture8_Webscraping/chromedriver')

In [3]:
driver.get("https://condos.ca")

In [4]:
element = driver.find_element_by_class_name('_2PKdn') 
element

<selenium.webdriver.remote.webelement.WebElement (session="e952b33f30d4d589d0641e347d113ac6", element="62daaf02-de31-4212-b37f-f87ab7f9c672")>

### Making a request

In [4]:
# Using Selenium
page_source = driver.page_source
soup = BeautifulSoup(page_source, 'lxml')

In [5]:
soup


<html lang="en" xmlns="http://www.w3.org/1999/xhtml"><head>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="0AK6GHtHYeOBq-KHW1YWfFdtBGmn7p4sU0bYr1YM5c4" name="google-site-verification"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="yes" name="mobile-web-app-capable"/>
<title>Condos.ca | Search &amp; Analyze all Toronto Condos | Buy Sell Rent Invest</title>
<meta charset="utf-8" content="en" data-react-helmet="true" http-equiv="Content-Language"/><meta content="Search &amp; Analyze All condos for Sale &amp; Rent in Toronto.
          See historical SQFT trends &amp; market analysis for Toronto condos." data-react-helmet="true" name="description"/><meta content="Toronto,Toronto condos for sale,
            Toronto condos for rent" data-react-helmet="true" name="keywords"/><meta content="Condos.ca | Search &amp; Analyze all Toronto Condos | Buy Sell Rent Invest" data-react-helmet="true"

### Request Status

In [5]:
# anything other than 200 is an error
#result.status_code
#Gives an Error 403

403

### Get the page content

In [6]:
# Finding price of one listing
soup.find('div', class_="_2PKdn").get_text()

'$2,700,000'

## Defining function to get relevant results


In [7]:
def get_page(city, mode,page):
    url= f'https://condos.ca/{city}/condos-for-{mode}?mode={mode}&page={page}'
    driver.get(url) 
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, 'lxml')
    return soup

## Defining Soup

### Performing Analysis for 1st Page 

In [8]:
soup = get_page('toronto', 'Sale',1)
soup

<html lang="en" xmlns="http://www.w3.org/1999/xhtml"><head>
<meta content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no" name="viewport"/>
<meta content="0AK6GHtHYeOBq-KHW1YWfFdtBGmn7p4sU0bYr1YM5c4" name="google-site-verification"/>
<meta content="yes" name="apple-mobile-web-app-capable"/>
<meta content="yes" name="mobile-web-app-capable"/>
<title data-react-helmet="true">Condos for Sale in Toronto | Condos.ca</title>
<meta content="Condos.ca" data-react-helmet="true" property="og:site_name"/><meta content="Condos for Sale in Toronto | Condos.ca" data-react-helmet="true" property="og:title"/><meta content="Condos for Sale in Toronto. Browse all Toronto Real Estate and MLS listings, find out about past prices and the latest market data for Toronto Condos." data-react-helmet="true" property="og:description"/><meta content="Condos for Sale in Toronto. Browse all Toronto Real Estate and MLS listings, find out about past prices and the latest market data for To

In [11]:
#total Condo Listings in Toronto
soup.find('div',class_ = 'sc-AxjAm dWkXrE').find('span',class_ = '_5FYo1').get_text()

'3560'

In [12]:

#Returns top 5 listings from the first page
soup.find_all('div',class_ = '_2PKdn')[0:5]

[<div class="_2PKdn">$1,350,000</div>,
 <div class="_2PKdn">$799,900</div>,
 <div class="_2PKdn">$519,000</div>,
 <div class="_2PKdn">$1,099,000</div>,
 <div class="_2PKdn">$1,288,000</div>]

In [13]:
#These are the total number of listings on the first page
len(soup.find_all('div',class_ = '_3O0GU'))

43

In [14]:
#Obtaining all prices on page 1
prices=[]
for tag in soup.find_all('div',class_ = '_2PKdn'):
    prices.append(tag.get_text())
prices

['$1,350,000',
 '$799,900',
 '$519,000',
 '$1,099,000',
 '$1,288,000',
 '$599,000',
 '$949,000',
 '$780,000',
 '$679,000',
 '$629,900',
 '$595,000',
 '$999,000',
 '$999,000',
 '$999,000',
 '$939,900',
 '$799,900',
 '$599,000',
 '$1,319,900',
 '$629,900',
 '$882,000',
 '$739,000',
 '$570,000',
 '$628,000',
 '$474,900',
 '$495,000',
 '$749,000',
 '$799,900',
 '$999,000',
 '$699,000',
 '$779,000',
 '$968,888',
 '$649,000',
 '$499,800',
 '$615,000',
 '$489,000',
 '$585,000',
 '$679,000',
 '$749,900',
 '$749,000',
 '$529,000',
 '$1,179,900',
 '$888,000',
 '$659,800']

## Defining variable that stores all listings on a page

In [68]:
# We start by defining a variable house_container for page 1 which has information on all the parameters that we need
house_container = soup.find_all('div','_3SMhd') 
house_container

[<div class="_3SMhd"><div class="_3O0GU"><div class="_2PKdn">$799,000</div><div class="_3980O">5 hours</div></div><div><span class="_1Gfb3">605 Dufferin St</span></div><div class="_3FIJA">3BD<span></span>2BA<span></span>1 Parking<div class="_2HUK2"></div></div><div class="YjyI8"></div></div>,
 <div class="_3SMhd"><div class="_3O0GU"><div class="_2PKdn">$608,000</div><div class="_3980O">8 minutes</div></div><div><span class="_1Gfb3">707 - 68 Shuter St N</span></div><div class="_3FIJA">1+1BD<span></span>1BA<span></span>1 Parking<div class="_2HUK2"><span></span>500-599 sqft</div></div><div class="YjyI8"><div>Maint. Fee $555</div></div></div>,
 <div class="_3SMhd"><div class="_3O0GU"><div class="_2PKdn">$539,000</div><div class="_3980O">8 minutes</div></div><div><span class="_1Gfb3">807 - 28 Olive Ave</span></div><div class="_3FIJA">1BD<span></span>1BA<span></span>1 Parking<div class="_2HUK2"><span></span>500-599 sqft</div></div><div class="YjyI8"><div>Maint. Fee $496</div></div></div>,
 <

In [69]:
#Defining Price
Price = int(house_container[0].find('div',class_ = '_2PKdn').get_text().split('$')[1].replace(',',''))
Price

799000

In [71]:
#Obtaining prices from all listings on page 1
Price_List=[]
for i in range(len(house_container)):
    for tag in house_container[i].find('div',class_ = '_2PKdn'):
        formatted_tag = int(tag.split('$')[1].replace(',',''))
        Price_List.append(formatted_tag)
Price_List

[799000,
 608000,
 539000,
 390000,
 719900,
 799900,
 460000,
 629900,
 1825000,
 1048000,
 669900,
 779900,
 780000,
 699000,
 489900,
 399000,
 635000,
 599900,
 399000,
 699000,
 808800,
 849900,
 798000,
 699900,
 839000,
 999000,
 569000,
 379900,
 1025990,
 590000,
 679000,
 499900,
 549000,
 579990,
 549000,
 799000,
 1169000,
 539000,
 579900,
 599000,
 868800,
 846000,
 948800,
 1099000]

In [72]:
Location = house_container[0].find('span',class_='_1Gfb3').get_text()
Location

'605 Dufferin St'

In [73]:
Location_list=[]
for i in range(len(house_container)):
    for tag in house_container[i].find('span',class_='_1Gfb3'):
        Location_list.append(tag)
Location_list

['605 Dufferin St',
 '707 - 68 Shuter St N',
 '807 - 28 Olive Ave',
 '1605 - 735 Don Mills Rd',
 '1003 - 260 Sackville St',
 '608 - 284 Bloor St W',
 '308 - 120 Dundalk Dr',
 '1909 - 181 Dundas St E',
 '328 - 40 Oaklands Ave',
 '1101 - 101 Charles St E',
 '709 - 260 Merton St',
 '30 Holly St',
 '3407 - 763 Bay St',
 '4208 - 295 Adelaide St W',
 '703 - 3845 Lake Shore Blvd W',
 '419 - 160 Flemington Rd',
 '503 - 18 Stafford St',
 '1710 - 66 Forest Manor Rd',
 '715 - 660 Eglinton Ave W',
 '414 - 1190 Dundas St E',
 '1409 - 1 Scott St',
 '128 Dairy Dr',
 '1505 - 59 Annie Craig Dr',
 '804 - 25 Stafford St',
 '1106 - 55 East Liberty St',
 '98 Mitchell Ave',
 '608 - 530 Indian Grve',
 '#208 - 2500 Bridletowne Circ',
 '21 - 12-16 Dervock Cres',
 '600 Fleet St',
 '421 - 1169 Queen St W',
 '226 - 1837 Eglinton Ave E',
 '435 - 1091 Kingston Rd',
 '403 - 200 Manitoba St',
 '515 - 105 George St',
 '813 - 3237 Bayview Ave',
 '406 - 10 Old York Mills Rd',
 '203 - 8 Fieldway Rd',
 '1501 - 33 Mill St'

In [None]:
#Floor List can be obtained from the Unit address for further analysis

In [22]:
Size = house_container[0].find('div',class_='_2HUK2').get_text()
Size

'1,400-1,599 sqft'

In [74]:
Size_list=[]
for i in range(len(house_container)):
    for tag in house_container[i].find('div',class_='_2HUK2'):
        print(tag)
        #Size_list.append(tag.get_text())
#Size_list

<span></span>
500-599 sqft
<span></span>
500-599 sqft
<span></span>
600-699 sqft
<span></span>
700-799 sqft
<span></span>
700-799 sqft
<span></span>
1,200-1,399 sqft
<span></span>
500-599 sqft
<span></span>
1,800-1,999 sqft
<span></span>
900-999 sqft
<span></span>
700-799 sqft
<span></span>
1,200-1,399 sqft
<span></span>
700-799 sqft
<span></span>
600-699 sqft
<span></span>
700-799 sqft
<span></span>
0-499 sqft
<span></span>
700-799 sqft
<span></span>
600-699 sqft
<span></span>
700-799 sqft
<span></span>
600-699 sqft
<span></span>
600-699 sqft
<span></span>
2,000-2,500 sqft
<span></span>
800-899 sqft
<span></span>
600-699 sqft
<span></span>
800-899 sqft
<span></span>
500-599 sqft
<span></span>
700-799 sqft
<span></span>
1,000-1,199 sqft
<span></span>
600-699 sqft
<span></span>
700-799 sqft
<span></span>
800-899 sqft
<span></span>
700-799 sqft
<span></span>
700-799 sqft
<span></span>
500-599 sqft
<span></span>
1,000-1,199 sqft
<span></span>
1,200-1,399 sqft
<span></span>
600-699 sqft
<s

In [75]:
Mainttenance_Fees = int(house_container[1].find('div',class_='YjyI8').get_text().split()[2].split('$')[1])
Mainttenance_Fees

555

In [76]:
Maintenance_list = []
for i in range(len(house_container)):
    for tag in house_container[i].find('div',class_='YjyI8'):
        Maintenance_Fees = int(tag.get_text().split()[2].split('$')[1].replace(',',''))
        Maintenance_list.append(Maintenance_Fees)
len(Maintenance_list)
#Result implies some values are missing. Not adding up to 43

41

In [112]:
Bedrooms = house_container[0].find('div',class_='_3FIJA')
Bedrooms
#Does not give us the intended result

<div class="_3FIJA">2+1BD<span></span>2BA<span></span>1 Parking<div class="_2HUK2"><span></span>1,000-1,199 sqft</div></div>

In [78]:
#To get information on bedrooms, bathrooms, parking, size we create the following list
big_list=[]
for i in range(len(house_container)):
    listing_list = []
    for tag in house_container[i].find('div',class_='_3FIJA'):
        listing_list.append(tag)
        #bedroom_list.append(listing[0])
        #Bathroom_list.append(listing[2])
        #Parking_list.append(listing[4])
        #Size_list.append(listing[6])
    big_list.append(listing_list)
big_list

[['3BD',
  <span></span>,
  '2BA',
  <span></span>,
  '1 Parking',
  <div class="_2HUK2"></div>],
 ['1+1BD',
  <span></span>,
  '1BA',
  <span></span>,
  '1 Parking',
  <div class="_2HUK2"><span></span>500-599 sqft</div>],
 ['1BD',
  <span></span>,
  '1BA',
  <span></span>,
  '1 Parking',
  <div class="_2HUK2"><span></span>500-599 sqft</div>],
 ['1BD',
  <span></span>,
  '1BA',
  <span></span>,
  '1 Parking',
  <div class="_2HUK2"><span></span>600-699 sqft</div>],
 ['2BD',
  <span></span>,
  '1BA',
  <span></span>,
  '1 Parking',
  <div class="_2HUK2"><span></span>700-799 sqft</div>],
 ['2BD',
  <span></span>,
  '1BA',
  <span></span>,
  '0 Parking',
  <div class="_2HUK2"><span></span>700-799 sqft</div>],
 ['3BD',
  <span></span>,
  '2BA',
  <span></span>,
  '1 Parking',
  <div class="_2HUK2"><span></span>1,200-1,399 sqft</div>],
 ['1+1BD',
  <span></span>,
  '1BA',
  <span></span>,
  '0 Parking',
  <div class="_2HUK2"><span></span>500-599 sqft</div>],
 ['2+2BD',
  <span></span>,
  '3B

In [89]:
Bedroom_list = []
Bathroom_list = []
Parking_list = []
Size_list = []
for x in big_list:
    
    bedroom = x[0]
    Bedroom_list.append(bedroom)
    
    Bathroom = x[2]
    Bathroom_list.append(Bathroom)
    
    parking = int(x[4].split()[0])
    Parking_list.append(parking)
    
    size = x[5].get_text()
    Size_list.append(size)

In [80]:
Size_list[0:5]

['', '500-599 sqft', '500-599 sqft', '600-699 sqft', '700-799 sqft']

In [29]:
#Taking mean of the size element for later cleaning purposes
int(Size_list[0].split()[0].split('-')[0].replace(',','')) + int(Size_list[0].split()[0].split('-')[1].replace(',',''))/2

2199.5

In [81]:
len(Bedroom_list)

44

## Filter Selection

In [11]:
# click on More_Filters button
more_filters = driver.find_element_by_css_selector('#heap-Search-Sale > div.styles___Flex-sc-1lfxfux-0.iHWtyF._27l5J > div:nth-child(2) > svg') 
more_filters.click()

## Price Filter

In [17]:
send_max_price = driver.find_element_by_css_selector('#heap-Search-Sale > div.ddT2Z.efN8r > div > div:nth-child(3) > div._3A8gT > div._3sd-t > div > section > div._1J3eX > div > div > div > div.Ua4ju.sliderValues > div > div:nth-child(3) > div > div > input[type=text]')

In [18]:
send_max_price.send_keys("2000000")

In [19]:
send_min_price = driver.find_element_by_css_selector('#heap-Search-Sale > div.ddT2Z.efN8r > div > div:nth-child(3) > div._3A8gT > div._3sd-t > div > section > div._1J3eX > div > div > div > div.Ua4ju.sliderValues > div > div:nth-child(1) > div > div > input[type=text]')

In [20]:
send_min_price.send_keys("1000000")

## Bed Filter 

In [22]:
# Studio
studio = driver.find_element_by_css_selector('#heap-Search-Sale > div.ddT2Z.efN8r > div > div:nth-child(3) > div._3A8gT > div:nth-child(3) > section > div._1J3eX > div > div:nth-child(1) > button > div')
studio.click()

In [23]:
# 2 Bed
two_bed = driver.find_element_by_css_selector('#heap-Search-Sale > div.ddT2Z.efN8r > div > div:nth-child(3) > div._3A8gT > div:nth-child(3) > section > div._1J3eX > div > div:nth-child(4) > button > div')
two_bed.click()

## HomeType filter

In [87]:
high_rise = driver.find_element_by_css_selector('#heap-Search-Sale > div > div > div._1FhvV._2KG7q > div:nth-child(3) > div:nth-child(2) > div > div._3A8gT > div:nth-child(6) > section > div._1J3eX > div > div:nth-child(2) > button > div')
high_rise.click()

In [92]:
Loft = driver.find_element_by_css_selector('#heap-Search-Sale > div > div > div._1FhvV._2KG7q > div:nth-child(3) > div:nth-child(2) > div > div._3A8gT > div:nth-child(6) > section > div._1J3eX > div > div:nth-child(5) > button > div')
Loft.click()

## Amenities

In [98]:
Gym = driver.find_element_by_css_selector('#heap-Search-Sale > div > div > div._1FhvV._2KG7q > div:nth-child(3) > div:nth-child(2) > div > div._3A8gT > div:nth-child(13) > section > div._1J3eX > div > div:nth-child(1) > button > div')
Gym.click()

In [99]:
Pool = driver.find_element_by_css_selector('#heap-Search-Sale > div > div > div._1FhvV._2KG7q > div:nth-child(3) > div:nth-child(2) > div > div._3A8gT > div:nth-child(13) > section > div._1J3eX > div > div:nth-child(2) > button > div')
Pool.click()

## Final Model

In [None]:
#For Analysis purposes, I have taken all the listings without any filters

In [82]:
# We define a data dictionary
data = { 'Location':[],
        'Date_listed':[],
        'Bedrooms':[],
        'Bathrooms':[],
        'Maint_Fees':[],
        'Size':[],
        'Parking':[]
       }

In [83]:
from random import random
random()

0.9259873227781932

In [98]:
data.clear()
data

{}

In [136]:
date_list=[]
date_tag = house_container[0].find('div',class_ = '_3980O').get_text()
       # data['Prices'].append(formatted_tag)
date_tag

'1 day'

In [90]:
listing.clear()
final_list.clear()

In [100]:
data = {'Prices':[],
       'Location':[],
        'Date_listed':[],
        'Bedrooms':[],
        'Bathrooms':[],
        'Maint_Fees':[],
        'Size':[],
        'Parking':[]
       }
final_list=[]
for page in range(50):
    
    soup = get_page('toronto', 'Sale',page)
    house_container = soup.find_all('div','_3SMhd')
    
    sleep(random()) 
    print(page)
    
    for i in range(len(house_container)):
        listing = []
        
        price_tag = house_container[i].find('div',class_ = '_2PKdn').get_text()
        formatted_tag = int(price_tag.split('$')[1].replace(',',''))
        data['Prices'].append(formatted_tag)
        
        location_tag = house_container[i].find('span',class_='_1Gfb3').get_text()
        data['Location'].append(location_tag) 
        
        date_tag = house_container[i].find('div',class_ = '_3980O').get_text()
        data['Date_listed'].append(date_tag)
        
        maint_tag = house_container[i].find('div',class_='YjyI8').get_text()
            
        if maint_tag != '':
                
            Maintenance_Fees = int(maint_tag.split('$')[1].replace(',',''))
            data['Maint_Fees'].append(Maintenance_Fees)
            
        else:
                
            data['Maint_Fees'].append('error')
        
        for info_tag in house_container[i].find('div',class_='_3FIJA'):
            listing.append(info_tag)
       
        final_list.append(listing)
        
        

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


In [188]:
final_list

[['6BD',
  <span></span>,
  '3BA',
  <span></span>,
  '2 Parking',
  <div class="_2HUK2"><span></span>1,400-1,599 sqft</div>],
 ['2BD',
  <span></span>,
  '1BA',
  <span></span>,
  '0 Parking',
  <div class="_2HUK2"><span></span>700-799 sqft</div>],
 ['3BD',
  <span></span>,
  '2BA',
  <span></span>,
  '1 Parking',
  <div class="_2HUK2"><span></span>1,200-1,399 sqft</div>],
 ['1BD',
  <span></span>,
  '1BA',
  <span></span>,
  '1 Parking',
  <div class="_2HUK2"><span></span>700-799 sqft</div>],
 ['1BD',
  <span></span>,
  '1BA',
  <span></span>,
  '0 Parking',
  <div class="_2HUK2"><span></span>500-599 sqft</div>],
 ['3BD',
  <span></span>,
  '2BA',
  <span></span>,
  '1 Parking',
  <div class="_2HUK2"></div>],
 ['1+1BD',
  <span></span>,
  '1BA',
  <span></span>,
  '1 Parking',
  <div class="_2HUK2"><span></span>500-599 sqft</div>],
 ['1BD',
  <span></span>,
  '1BA',
  <span></span>,
  '1 Parking',
  <div class="_2HUK2"><span></span>500-599 sqft</div>],
 ['1BD',
  <span></span>,
  '1B

In [102]:
for x in final_list:
    
    bedroom = x[0]
    data['Bedrooms'].append(bedroom)

   
    bathroom = x[2]
    data['Bathrooms'].append(bathroom)
    
    parking = int(x[4].split()[0])
    data['Parking'].append(parking)
    
    size = x[5].get_text()
    data['Size'].append(size)

In [194]:
bed = [x[0] for x in final_list] 
set(bed)

{'1+1BD',
 '1+2BD',
 '1BD',
 '2+1BD',
 '2+2BD',
 '2BD',
 '3+1BD',
 '3+2BD',
 '3BD',
 '4+1BD',
 '4+2BD',
 '4BD',
 '5+1BD',
 '5BD',
 '6BD',
 'Studio'}

In [195]:
#No. of elements in every list
for i in data:
    print(i,len(data[i]))

Prices 2031
Location 2031
Date_listed 2031
Bedrooms 2031
Bathrooms 2031
Maint_Fees 2031
Size 2031
Parking 2031


## Defining DataFrame

In [3]:
condos = pd.DataFrame(data)
condos.head()

NameError: name 'data' is not defined

In [125]:
condos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2031 entries, 0 to 2030
Data columns (total 8 columns):
Prices         2031 non-null int64
Location       2031 non-null object
Date_listed    2031 non-null object
Bedrooms       2031 non-null object
Bathrooms      2031 non-null object
Maint_Fees     2031 non-null object
Size           2031 non-null object
Parking        2031 non-null int64
dtypes: int64(2), object(6)
memory usage: 127.1+ KB


## Data Cleaning

In [None]:
for x in condos['Size']:
    #int((x.split('-')[0]).replace(',','')) 
    #int((x.split('-')[1]).replace(',','').split()[0]) 
    print((int((x.split('-')[0]).replace(',','')) + int((x.split('-')[1]).replace(',','').split()[0]))/2 )
    #print(int((x.split('-')[0]).replace(',','')) + int((x.split('-')[1]).replace(',','').split()[0]))/2

In [198]:
set(condos['Size'])

{'',
 '+5,000 sqft',
 '0-499 sqft',
 '1,000-1,199 sqft',
 '1,100-1,500 sqft',
 '1,200-1,399 sqft',
 '1,400-1,599 sqft',
 '1,500-2,000 sqft',
 '1,600-1,799 sqft',
 '1,800-1,999 sqft',
 '2,000-2,249 sqft',
 '2,000-2,500 sqft',
 '2,250-2,499 sqft',
 '2,500-2,749 sqft',
 '2,500-3,000 sqft',
 '2,750-2,999 sqft',
 '3,000-3,249 sqft',
 '3,000-3,500 sqft',
 '3,250-3,499 sqft',
 '3,500-5,000 sqft',
 '3,750-3,999 sqft',
 '4,500-4,749 sqft',
 '4,750-4,999 sqft',
 '495 sqft',
 '500-599 sqft',
 '527 sqft',
 '600-699 sqft',
 '700-799 sqft',
 '751 sqft',
 '800-899 sqft',
 '900 sqft',
 '900-999 sqft'}

In [199]:
condos['Size'].value_counts().head()

600-699 sqft        331
700-799 sqft        274
500-599 sqft        265
1,000-1,199 sqft    222
800-899 sqft        196
Name: Size, dtype: int64

In [200]:
#Filling the missing value with the most frequent observation
condos['Size'] = condos['Size'].replace('','600-699 sqft')

In [140]:
for x in condos['Size']:
    print(x)

1,400-1,599 sqft
700-799 sqft
1,200-1,399 sqft
700-799 sqft
500-599 sqft
600-699 sqft
500-599 sqft
500-599 sqft
600-699 sqft
700-799 sqft
500-599 sqft
1,800-1,999 sqft
900-999 sqft
700-799 sqft
1,200-1,399 sqft
700-799 sqft
600-699 sqft
700-799 sqft
0-499 sqft
700-799 sqft
600-699 sqft
700-799 sqft
600-699 sqft
600-699 sqft
2,000-2,500 sqft
800-899 sqft
600-699 sqft
800-899 sqft
600-699 sqft
500-599 sqft
700-799 sqft
1,000-1,199 sqft
600-699 sqft
700-799 sqft
800-899 sqft
700-799 sqft
1,000-1,199 sqft
1,200-1,399 sqft
600-699 sqft
500-599 sqft
600-699 sqft
1,000-1,199 sqft
1,000-1,199 sqft
1,400-1,599 sqft
700-799 sqft
1,200-1,399 sqft
700-799 sqft
500-599 sqft
600-699 sqft
500-599 sqft
500-599 sqft
600-699 sqft
700-799 sqft
500-599 sqft
1,800-1,999 sqft
900-999 sqft
700-799 sqft
1,200-1,399 sqft
700-799 sqft
600-699 sqft
700-799 sqft
0-499 sqft
700-799 sqft
600-699 sqft
700-799 sqft
600-699 sqft
600-699 sqft
2,000-2,500 sqft
800-899 sqft
600-699 sqft
800-899 sqft
600-699 sqft
500-599 

In [148]:
condos[condos['Size'] == '751 sqft']

Unnamed: 0,Prices,Location,Date_listed,Bedrooms,Bathrooms,Maint_Fees,Size,Parking
307,923888,3707 - 12 YORK Street,1 day,2BD,1BA,658,751 sqft,1


In [150]:
condos['Size'] = condos['Size'].replace('751 sqft','700-799 sqft')

In [152]:
set(condos['Size'])

{'+5,000 sqft',
 '0-499 sqft',
 '1,000-1,199 sqft',
 '1,100-1,500 sqft',
 '1,200-1,399 sqft',
 '1,400-1,599 sqft',
 '1,500-2,000 sqft',
 '1,600-1,799 sqft',
 '1,800-1,999 sqft',
 '2,000-2,249 sqft',
 '2,000-2,500 sqft',
 '2,250-2,499 sqft',
 '2,500-2,749 sqft',
 '2,500-3,000 sqft',
 '2,750-2,999 sqft',
 '3,000-3,249 sqft',
 '3,000-3,500 sqft',
 '3,250-3,499 sqft',
 '3,500-5,000 sqft',
 '3,750-3,999 sqft',
 '4,500-4,749 sqft',
 '4,750-4,999 sqft',
 '495 sqft',
 '500-599 sqft',
 '527 sqft',
 '600-699 sqft',
 '700-799 sqft',
 '800-899 sqft',
 '900 sqft',
 '900-999 sqft'}

In [201]:
condos['Size'].replace({'495 sqft': '400-499 sqft','751 sqft':'700-799 sqft','527 sqft':\
                                         '500-599 sqft','900 sqft':'900-999 sqft','+5,000 sqft':'5,000-5,100 sqft'}\
                       , inplace = True
                                       )

In [202]:
#Calculating Avg Size from size range
condos['Avg_Size'] = [(int((x.split('-')[0]).replace(',','')) + int((x.split('-')[1]).replace(',','').split()[0]))/2 for x in condos['Size']]

In [203]:
condos.head()

Unnamed: 0,Prices,Location,Date_listed,Bedrooms,Bathrooms,Maint_Fees,Size,Parking,Avg_Size
0,648000,20 - 375 Cook Rd,20 minutes,6BD,3BA,322,"1,400-1,599 sqft",2,1499.5
1,799900,608 - 284 Bloor St W,3 hours,2BD,1BA,790,700-799 sqft,0,749.5
2,460000,308 - 120 Dundalk Dr,4 hours,3BD,2BA,874,"1,200-1,399 sqft",1,1299.5
3,549000,435 - 1091 Kingston Rd,4 hours,1BD,1BA,568,700-799 sqft,1,749.5
4,549000,515 - 105 George St,5 hours,1BD,1BA,402,500-599 sqft,0,549.5


In [204]:
condos['Bedrooms'] = [x[0] for x in condos['Bedrooms']]
condos['Bathrooms'] = [x[0] for x in condos['Bedrooms']]

In [205]:
condos.head()

Unnamed: 0,Prices,Location,Date_listed,Bedrooms,Bathrooms,Maint_Fees,Size,Parking,Avg_Size
0,648000,20 - 375 Cook Rd,20 minutes,6,6,322,"1,400-1,599 sqft",2,1499.5
1,799900,608 - 284 Bloor St W,3 hours,2,2,790,700-799 sqft,0,749.5
2,460000,308 - 120 Dundalk Dr,4 hours,3,3,874,"1,200-1,399 sqft",1,1299.5
3,549000,435 - 1091 Kingston Rd,4 hours,1,1,568,700-799 sqft,1,749.5
4,549000,515 - 105 George St,5 hours,1,1,402,500-599 sqft,0,549.5


In [159]:
condos['Maint_Fees'].isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
2026    False
2027    False
2028    False
2029    False
2030    False
Name: Maint_Fees, Length: 2031, dtype: bool

In [111]:
import numpy as np

In [206]:
condos['Maint_Fees']=condos['Maint_Fees'].replace('error',np.NaN)

In [207]:
avg_maint_fees = round(condos['Maint_Fees'].mean(),2)

In [208]:
condos['Maint_Fees'] = condos['Maint_Fees'].fillna(avg_maint_fees)
condos['Maint_Fees']

0       322.0
1       790.0
2       874.0
3       568.0
4       402.0
        ...  
2026    915.0
2027    427.0
2028    403.0
2029    452.0
2030    397.0
Name: Maint_Fees, Length: 2031, dtype: float64

In [209]:
set(condos['Bedrooms'])

{'1', '2', '3', '4', '5', '6', 'S'}

In [210]:
condos['Bedrooms'] = condos['Bedrooms'].replace('S',np.NaN)

In [211]:
condos['Bedrooms'] = condos['Bedrooms'].astype(float)

In [212]:
set(condos['Bathrooms'])

{'1', '2', '3', '4', '5', '6', 'S'}

In [213]:
condos['Bathrooms'] = condos['Bathrooms'].replace('S',np.NaN)

In [214]:
condos['Bathrooms'] = condos['Bathrooms'].astype(float)

In [215]:
set(condos['Parking'])

{0, 1, 2, 3, 4, 5}

In [217]:
condos['Parking'] = condos['Parking'].astype(float)

In [216]:
condos['Location']

0                     20 - 375 Cook Rd
1                 608 - 284 Bloor St W
2                 308 - 120 Dundalk Dr
3               435 - 1091 Kingston Rd
4                  515 - 105 George St
                     ...              
2026                 2504 - 12 York St
2027           1401 - 88 Blue Jays Way
2028    204 - 100 Harrison Garden Blvd
2029             S421 - 455 Front St E
2030            1921 - 233 Beecroft Rd
Name: Location, Length: 2031, dtype: object

In [221]:
unit_list=[]
street_list=[]
for x in condos['Location']:
    #unit = x.split('-')[0].split(' ')[0])
    length = len(x.split('-')) 
    if length == 1:
        street_list.append(x.split('-')[0])
    else:
        unit_list.append(x.split('-')[0])
        street_list.append(x.split('-')[1])
    
    

In [222]:
len(unit_list)

1407

In [223]:
len(street_list)

2031

In [224]:
street_list = [x.split(',')[0] for x in street_list]
street_list

[' 375 Cook Rd',
 ' 284 Bloor St W',
 ' 120 Dundalk Dr',
 ' 1091 Kingston Rd',
 ' 105 George St',
 '605 Dufferin St',
 ' 68 Shuter St N',
 ' 28 Olive Ave',
 ' 735 Don Mills Rd',
 ' 260 Sackville St',
 ' 181 Dundas St E',
 ' 40 Oaklands Ave',
 ' 101 Charles St E',
 ' 260 Merton St',
 '30 Holly St',
 ' 763 Bay St',
 ' 295 Adelaide St W',
 ' 3845 Lake Shore Blvd W',
 ' 160 Flemington Rd',
 ' 18 Stafford St',
 ' 66 Forest Manor Rd',
 ' 660 Eglinton Ave W',
 ' 1190 Dundas St E',
 ' 1 Scott St',
 '128 Dairy Dr',
 ' 59 Annie Craig Dr',
 ' 25 Stafford St',
 ' 55 East Liberty St',
 '98 Mitchell Ave',
 ' 530 Indian Grve',
 ' 2500 Bridletowne Circ',
 ' 12',
 '600 Fleet St',
 ' 1169 Queen St W',
 ' 1837 Eglinton Ave E',
 ' 200 Manitoba St',
 ' 3237 Bayview Ave',
 ' 10 Old York Mills Rd',
 ' 8 Fieldway Rd',
 ' 33 Mill St',
 ' 159 Dundas St E',
 ' 205 The Donway W',
 ' 100 Quebec Ave',
 ' 375 Cook Rd',
 ' 284 Bloor St W',
 ' 120 Dundalk Dr',
 ' 1091 Kingston Rd',
 ' 105 George St',
 '605 Dufferin St

In [225]:
address = pd.DataFrame(street_list,columns = ['Street Address'])
address

Unnamed: 0,Street Address
0,375 Cook Rd
1,284 Bloor St W
2,120 Dundalk Dr
3,1091 Kingston Rd
4,105 George St
...,...
2026,12 York St
2027,88 Blue Jays Way
2028,100 Harrison Garden Blvd
2029,455 Front St E


In [226]:
final_condos = pd.merge(left = condos,right=address,left_index= True, right_index=True)
final_condos.head()

Unnamed: 0,Prices,Location,Date_listed,Bedrooms,Bathrooms,Maint_Fees,Size,Parking,Avg_Size,Street Address
0,648000,20 - 375 Cook Rd,20 minutes,6.0,6.0,322.0,"1,400-1,599 sqft",2.0,1499.5,375 Cook Rd
1,799900,608 - 284 Bloor St W,3 hours,2.0,2.0,790.0,700-799 sqft,0.0,749.5,284 Bloor St W
2,460000,308 - 120 Dundalk Dr,4 hours,3.0,3.0,874.0,"1,200-1,399 sqft",1.0,1299.5,120 Dundalk Dr
3,549000,435 - 1091 Kingston Rd,4 hours,1.0,1.0,568.0,700-799 sqft,1.0,749.5,1091 Kingston Rd
4,549000,515 - 105 George St,5 hours,1.0,1.0,402.0,500-599 sqft,0.0,549.5,105 George St


In [229]:
final_condos.to_csv('final_condos.csv')

## Visualization in separate file