# Web-Scraping using Regular Expressions

###  Steps 
1. Import packages
2. Use BeautifulSoup to get the text data of www.apartments.com webpages from the years 2015 to 2023
3. Remove HTML tags and clean the text data
4. Use regular expressions to find address, bed range, price range and contact number for each rental property from years 2015 to 2023
5. Convert extracted information to a Data Frame and save the Data Frame

## Step 1: Import Packages

In [1]:
import requests
import time
import os
import pandas as pd
import numpy as np
import random
import re
import regex
from time import sleep
from os import path
from pandas import DataFrame
from bs4 import BeautifulSoup
from random import randint

## Steps 2 and 3: Get the text data from webpages using BeautifulSoup, remove HTML tags, and clean the text data

In [2]:
def get_contents(soup, content_text):
    """Collect the visible text of a parsed page and append it to ``content_text``.

    Every text node returned by ``soup.find_all(text=True)`` is kept when its
    parent tag is not one of the skipped structural tags and the node is
    longer than five characters. Kept nodes are concatenated, each followed
    by a single space, and the resulting string is appended to
    ``content_text``.

    This is best-effort: if anything goes wrong while walking the document,
    an empty string is appended instead, so the caller always finds an entry.

    Args:
        soup: Parsed document exposing ``find_all(text=True)``.
        content_text: List that receives the extracted text (mutated in place).
    """
    # Text directly under these tags is treated as markup/layout noise.
    skipped_parents = (
        '[document]', 'html', 'head',
        'style', 'script', 'body',
        'section', 'tr',
        'td', 'label', 'ul', 'header',
        'aside',
    )
    try:
        pieces = []
        for node in soup.find_all(text=True):
            # The parent check comes first so a node without a usable parent
            # still triggers the fallback below.
            if node.parent.name in skipped_parents or len(node) <= 5:
                continue
            pieces.append(node + ' ')
        content_text.append(''.join(pieces))
    except Exception:
        # Fall back to an empty document rather than aborting the caller.
        content_text.append('')

## Step 4: Use regular expressions to find address, bed range, price range and contact number 

In [3]:
# Ten-digit phone numbers with optional parentheses and separators,
# e.g. "(312) 555-0199" or "312-555-0199".
_CONTACT_RE = re.compile(
    r'\(?[0-9]{3}\)?\/?\.?-?\s?[0-9]{3}-?\.?-?\s?[0-9]{4}\b'
)
# A street number (optionally a "12 - 14" range), up to five further tokens,
# then "Chicago"/"CHICAGO" with an optional short suffix and digits.
_ADDRESS_RE = re.compile(
    r'(?<!\S)\b([1-9][0-9]{,4}\s*-\s*)?[1-9][0-9]{,4} ([A-Za-z0-9&]*,?\s*){,5} (Chicago|CHICAGO),?[\w\s,]{,3}\s*[0-9]{,6}?\b'
)
# Bedroom ranges such as "Studio - 2 Beds" or "1-3 Beds".
_BED_RANGE_RE = re.compile(
    r'((?<!\S)\b(Studio|[1-9])\s*-\s*[1-9]\s*(Bed|Br|Beds|Bedrooms)\b)'
)
# A "$low - high" range, a single price, or the literal "Call for Rent".
# NOTE(review): the single-price alternative ends in `$`, so it only matches
# at the very end of the searched window - confirm this is intended.
_PRICE_RE = re.compile(
    r'\$[1-9]?[0-9]{,3},?[0-9]{,3}\s*-\s*\$?[1-9]?[0-9]{,3},?[0-9]{,3}|\$[1-9]?[0-9]{,3},?[0-9]{,3}$|Call for Rent\b'
)
# Fields looked up in the text that follows each address, in column order.
_DETAIL_PATTERNS = (
    ('Price', _PRICE_RE),
    ('Beds', _BED_RANGE_RE),
    ('Contact', _CONTACT_RE),
)


def localized(text, context_chars=250, min_trailing_chars=100):
    """Extract rental listings (address, price, beds, contact) from page text.

    Each address match anchors a window that starts at the address and
    extends ``context_chars`` characters past its end; price, bed range and
    contact number are taken from the first match of each pattern inside
    that window. A listing is kept only when, besides the address, at least
    two of the three other fields were found.

    Args:
        text: Cleaned page text to search.
        context_chars: Number of characters after the address that are
            searched for the remaining fields.
        min_trailing_chars: An address is ignored unless more than this many
            characters of text follow it.

    Returns:
        A ``pandas.DataFrame`` with columns ``Address``, ``Price``, ``Beds``
        and ``Contact`` (empty when no listing qualifies), or ``None`` when
        ``text`` contains no address at all. The result (or a "no match"
        message) is also printed.
    """
    address_matches = list(_ADDRESS_RE.finditer(text))
    if not address_matches:
        # Previously unreachable: `re.search(...)` returns None on no match,
        # so calling `.groups()` on it raised AttributeError instead.
        print("Pattern Doesn't Match")
        return None

    listings = []
    for match in address_matches:
        # Skip addresses too close to the end of the text to have a
        # meaningful amount of trailing context.
        if match.end() + min_trailing_chars >= len(text):
            continue
        window = text[match.start():match.end() + context_chars]

        property_info = {
            # Keep at most the last six words of the matched address.
            'Address': " ".join(match.group().split()[-6:]),
            'Price': None,
            'Beds': None,
            'Contact': None,
        }
        found = 1  # the address itself
        for field, pattern in _DETAIL_PATTERNS:
            hit = pattern.search(window)
            if hit:
                property_info[field] = hit.group()
                found += 1

        if found > 2:
            listings.append(property_info)

    merged = pd.DataFrame(listings)
    print(merged)
    return merged
        

## Step 5: Convert extracted information to a Data Frame and save the Data Frame

In [5]:
# Scrape every page listed in results/links.csv and save the extracted
# listings as one CSV per page under results_4/.
links_df = pd.read_csv("results/links.csv")
# Make sure the output directory exists before the first save.
os.makedirs("results_4", exist_ok=True)
for _, row in links_df.iterrows():
    webpage = row['Links']
    # Characters 28-41 of the link are used as the output file name;
    # presumably a 14-digit snapshot timestamp embedded in the URL
    # (TODO confirm against links.csv).
    stamp = webpage[28:42]
    print(webpage, stamp)
    # A timeout keeps one unresponsive server from hanging the whole run.
    page = requests.get(webpage, timeout=60)
    soup = BeautifulSoup(page.text, 'html.parser')
    content_text = []
    get_contents(soup, content_text)
    output_df = localized(content_text[0])
    if output_df is None:
        # localized() found no address on this page; nothing to save.
        print("No listings found, skipping", webpage)
        continue
    file_name = "results_4/" + stamp + ".csv"
    output_df.to_csv(file_name)
    print("Saved to ", file_name)

                               Address            Price             Beds  \
0      1457 N Halsted St, Chicago, IL    $2,308 - 5,936  Studio - 2 Beds   
1          234 W Polk St, Chicago, IL    $2,170 - 5,355  Studio - 3 Beds   
2     5630 N Sheridan Rd, Chicago, IL    $1,130 - 1,550   Studio - 1 Bed   
3       930 W Altgeld St, Chicago, IL   $2,080 - 14,700  Studio - 3 Beds   
4     5252 S Cornell Ave, Chicago, IL    $2,485 - 9,853  Studio - 3 Beds   
5    1950 N Campbell Ave, Chicago, IL    $2,400 - 3,900         1-2 Beds   
6      121 W Chestnut St, Chicago, IL   $2,081 - 11,979  Studio - 3 Beds   
7      1210 N State Pkwy, Chicago, IL    $2,990 - 6,423  Studio - 2 Beds   
8   340 E North Water St, Chicago, IL   $2,175 - 11,882  Studio - 3 Beds   
9          11 S Green St, Chicago, IL    $2,103 - 8,701  Studio - 2 Beds   
10  2552 N Milwaukee Ave, Chicago, IL    $2,008 - 4,620  Studio - 2 Beds   
11          8 E Huron St, Chicago, IL    $2,580 - 7,045         1-3 Beds   
12          