### Scraping the http://www.city-data.com/ website to get demographic details for each zip code

In [None]:
import time
import pandas as pd
import requests
import re
from bs4 import BeautifulSoup
# Module names are case-sensitive: the standard-library module is "warnings".
import warnings

#### Defining helper functions that extract the required data from the HTML

In [None]:
def _findLabelledTag(containers, label):
    """Return the first ``<b>`` tag whose text matches *label*.

    containers -- iterable of objects exposing a BeautifulSoup-style ``find``
                  (the whole soup or individual tags); searched in order.
    label      -- regular-expression source searched for in the tag text.

    Raises AttributeError when no container holds a matching tag. The
    exception type is the one the previous inline code produced in that
    situation (attribute access on a missing tag), so existing handlers keep
    working; the message now names the label that was not found.
    """
    pattern = re.compile(label)
    for container in containers:
        tag = container.find('b', text=pattern)
        if tag is not None:
            return tag
    raise AttributeError("no <b> tag matching %r found on the page" % label)


def getDemographics(soup,zipDict):
    """Fill *zipDict* with demographic figures read from one parsed zip-code page.

    soup    -- parsed page (BeautifulSoup object) for a single zip code.
    zipDict -- dict updated in place and also returned.

    Keys written, in this order: 'CLIndex', 'popDensity', 'MAge', 'MIncome',
    'BPovertyLine', 'MRent'. Apart from 'CLIndex', each value is the raw text
    found between the first and second ':' of the element holding the figure;
    no cleaning is done here.

    Raises AttributeError or IndexError when the page lacks an expected
    element. Keys assigned before the failure remain in *zipDict*.
    """
    # Cost of living index: the 12th text node sitting directly under the
    # first <div class="row">.
    rowTexts = soup.find('div',{'class':'row'}).findAll(text=True, recursive=False)
    zipDict['CLIndex'] = rowTexts[11]

    # Population density: label and figure share the same parent element.
    popTag = _findLabelledTag([soup], 'Population density:')
    zipDict['popDensity'] = popTag.find_parent().text.split(":")[1]

    # Median age and median income are looked up inside the "hgraph" blocks.
    hgraphDivs = soup.find_all("div",{"class":"hgraph"})

    # Median age: the figure is in the element that follows the label.
    ageTag = _findLabelledTag(hgraphDivs, "Median resident age:")
    zipDict['MAge'] = ageTag.find_next().text.split(":")[1]

    # Median income: calling a tag is BeautifulSoup shorthand for find_all(),
    # so this reads the first tag nested in the label's next sibling.
    incomeTag = _findLabelledTag(hgraphDivs, "Estimated median household income")
    zipDict['MIncome'] = incomeTag.next_sibling()[0].text.split(":")[1]

    # Below poverty line: the figure is in the third sibling after the label.
    povertyTag = _findLabelledTag([soup], "Residents with income below the poverty level")
    zipDict['BPovertyLine'] = list(povertyTag.next_siblings)[2].text.split(":")[1]

    # Median rent: match any four-digit year so the lookup keeps working when
    # the site publishes figures for a year other than 2019.
    rentTag = _findLabelledTag([soup], r"Median gross rent in \d{4}:")
    zipDict['MRent'] = rentTag.find_parent().text.split(":")[1]

    return zipDict


#This function is called after scraping is completed
    
def cleanZipDataset(zipCodedf):
    """Reduce the scraped text columns of *zipCodedf* to bare numeric strings.

    zipCodedf -- DataFrame of scraped values; modified in place and returned.

    Columns handled: 'MIncome', 'BPovertyLine', 'MRent', 'MAge', 'popDensity'.
    A column that is absent (for example because every scrape failed) is
    skipped, and missing values are left missing instead of being turned into
    the text "nan". Other columns are not touched.
    """
    def stripCurrency(text):
        # Drop "$" and thousands separators, then the surrounding whitespace
        # and a sentence-ending "." so only the number is left.
        return text.replace("$", "").replace(",", "").strip().strip(".").strip()

    cleaners = [
        ('MIncome', stripCurrency),
        # Keep only what precedes the first "%".
        ('BPovertyLine', lambda text: text.split("%")[0].strip()),
        ('MRent', stripCurrency),
        # Keep digits and the decimal point.
        ('MAge', lambda text: re.sub(r'[^\d.]+', '', text)),
        # Keep digits only.
        ('popDensity', lambda text: re.sub(r'[^\d]+', '', text)),
    ]

    for column, cleaner in cleaners:
        if column not in zipCodedf.columns:
            continue
        # cleaner is bound as a default argument so each lambda uses its own.
        zipCodedf[column] = zipCodedf[column].apply(
            lambda value, cleaner=cleaner: value if pd.isna(value) else cleaner(str(value))
        )

    return zipCodedf
    
    
    


#### Get unique zip codes from the data collected from the Trulia website.

In [None]:
# HousePriceDetails.csv holds the data collected from the Trulia website.
# It is read here to obtain the distinct zip codes and cities whose details
# are scraped below. Rows missing any of the listed fields are dropped first.
housedf=pd.read_csv("HousePriceDetails.csv").dropna(
    subset=[
        'Price','Bed','Bath','FloorSize','LotSize',
        'Crime','HOAfees','Shop&Eat','PropertyType','School',
    ]
)
uniqueZipCodes=housedf['ZipCode'].unique().tolist()
uniqueCities=housedf['City'].unique().tolist()

#### Writing the Scraping Logic

In [None]:
# Number of distinct zip codes; one page is requested for each in the next cell.
len(uniqueZipCodes)

In [None]:
# Fetch one city-data.com page per zip code and collect its demographics.
url="http://www.city-data.com/zips/"
zipDetailsList=[]
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
for zipCode in uniqueZipCodes:
    newUrl=url+str(zipCode)+".html"
    print(newUrl)
    zipDict={}
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(newUrl,headers=headers,timeout=30)
        # Pause between requests.
        time.sleep(10)
        soup = BeautifulSoup(response.content, 'lxml')
        zipDict=getDemographics(soup,zipDict)
    except Exception as exc:
        # Best effort: a zip code whose page cannot be fetched or parsed still
        # gets a row. "except Exception" (rather than a bare except) lets
        # Ctrl-C interrupt the long-running loop.
        print("Doesnt have details for this zipCOde")
        print("  reason:", repr(exc))
    zipDict['zipCode']=str(zipCode)
    print(zipDict)
    zipDetailsList.append(zipDict)


#Convert list of dictionaries into dataframe
zipCodedf=pd.DataFrame(zipDetailsList)

zipCodedf=cleanZipDataset(zipCodedf)

                           

#### Writing the dataframe to a CSV file, which will be used in data preprocessing because we need to merge these details with the house price details

In [10]:
# Save the per-zip details without the DataFrame index column.
zipCodedf.to_csv("zipDetails.csv",index=False)
# Preview the first rows as the cell output.
zipCodedf.head()

Unnamed: 0,CLIndex,popDensity,MAge,MIncome,BPovertyLine,MRent,zipCode
0,149.7,10251,36.7,117191,7.3,2376,95136
1,150.0,8331,37.8,122947,4.9,2608,95123
2,148.3,5151,36.1,103198,6.8,2075,95127
3,148.2,11536,35.2,76430,11.0,1631,95111
4,149.7,7916,39.8,162651,3.3,2403,95124


### Scraping Crime Details for each city


In [None]:
# City names as spelled in the keys are written as a single word
# (e.g. "SanJose"), while city-data.com URLs separate the words with hyphens.
# This map translates the former into the latter for the multi-word cities.
cityMap=dict(
    SanJose='San-Jose',
    CastroValley='Castro-Valley',
    SanRamon='San-Ramon',
    UnionCity='Union-City',
    PaloAlto='Palo-Alto',
    SantaClara='Santa-Clara',
    SanMateo='San-Mateo',
    WalnutCreek='Walnut-Creek',
    SantaCruz='Santa-Cruz',
    HalfMoonBay='Half-Moon-Bay',
)

In [None]:
# Fetch one city-data.com page per city and record its crime index.
url="http://www.city-data.com/city/"
cityDetailsList=[]
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
for city in uniqueCities:
    # Use the hyphenated spelling when one is defined, else the name as is.
    cityVal=cityMap.get(city,city)
    newUrl=url+str(cityVal)+"-California.html"
    print(newUrl)
    crimeIndex=None
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(newUrl,headers=headers,timeout=30)
        # Pause between requests.
        time.sleep(10)
        soup = BeautifulSoup(response.content, 'lxml')
    except Exception as exc:
        # Handled per city so that one failed request no longer discards all
        # the cities that come after it; this city keeps crimeIndex=None.
        print("Exception ", repr(exc))
    else:
        try:
            # The crime index is the text of the last cell of the first
            # <tr class="nosort"> row.
            trEle=soup.find('tr',{'class':'nosort'})
            crimeIndex=trEle.find_all('td')[-1].text
        except (AttributeError, IndexError):
            # No such row (find returned None) or the row has no cells.
            print("doesn't have crime Index details")
    cityDict={'crimeIndex':crimeIndex,'city':str(city)}
    print(cityDict)
    cityDetailsList.append(cityDict)


#Convert list of dictionaries into dataframe

cityCrimeIndexdf=pd.DataFrame(cityDetailsList)
cityCrimeIndexdf.head()




In [11]:
#Manually entering the crime index for Castro Valley and El Granada, as the website doesn't show it for them.
#Each is assigned the value of a neighbouring city:
#Crime Index of Castro Valley = Crime Index of Hayward
#Crime Index of El Granada = Crime Index of San Mateo

# .item() requires exactly one row for the neighbouring city and raises otherwise.
cityCrimeIndexdf.loc[cityCrimeIndexdf['city']=="CastroValley",'crimeIndex']= cityCrimeIndexdf.loc[cityCrimeIndexdf['city']=="Hayward",'crimeIndex'].item()
cityCrimeIndexdf.loc[cityCrimeIndexdf['city']=="ElGranada",'crimeIndex']= cityCrimeIndexdf.loc[cityCrimeIndexdf['city']=="SanMateo",'crimeIndex'].item()


# Top-level statements must start at column 0; the previous indentation made this cell fail to compile.
import sys


#### Writing the above dataframe to a csv file as this will be merged with HousePrice Details

In [12]:
# Save the per-city crime index without the DataFrame index column.
cityCrimeIndexdf.to_csv("CrimeDetailsperCity.csv",index=False)