# Zillow Webscraping + Text Analysis

## 1. Webscraping:

In this part of the code, I'll show you how to build a data frame of the houses for sale in Champaign from www.zillow.com. This code only extracts the first page of the static webpage; therefore, the resulting data frame will contain only 40 rows (houses/apts).

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import os
import re
import json
import matplotlib.pyplot as plt
import matplotlib
import random

In [None]:
#pd.set_option('display.max_rows', 40)
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.max_colwidth', None)
random.seed(10)

In [None]:
# Browser-like request headers sent with every request to Zillow.
req_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

# Fetch the first results page of the for-sale listings for the chosen city.
with requests.Session() as s:
    city = 'champaign/'
    url = 'https://www.zillow.com/homes/for_sale/' + city
    # requests waits forever by default; bound the wait so a stalled
    # connection cannot hang the notebook.
    r = s.get(url, headers=req_headers, timeout=30)

In [None]:
url

In [None]:
r.status_code

In [None]:
soup = BeautifulSoup(r.content, 'html.parser')

In [None]:
soup

## Create DataFrame based on classes

In [None]:
# Start from an empty frame and add one column per CSS class looked up in
# the parsed results page. Each cell holds the raw BeautifulSoup element at
# this point; the text is extracted in a later step.
df = pd.DataFrame()

df['price'] = soup.find_all(class_='list-card-price')
df['address'] = soup.find_all(class_='list-card-addr')
# The "list-card-details" <ul> is stored under 'beds' for now.
# NOTE(review): presumably it bundles beds, baths, square feet and listing
# type, since those are split out of this column later - confirm on a live page.
df['beds'] = soup.find_all("ul", class_="list-card-details")

#df['link']  = list(soup.find_all(class_= 'list-card-link'))
df;


In [None]:
# PRICES
df['price'] = [x.get_text() for x in df['price']]
# Keep only the digits (\D matches any non-digit character). regex=True has
# to be explicit: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the price untouched.
df['price'] = df['price'].str.replace(r'\D', '', regex=True)

# ADDRESSES:
df['address'] = [x.get_text() for x in df['address']]

# BEDS - BATHS - SQFEET - TYPE
# The details element is flattened to a single string and then cut at the
# " bds", " ba" and " sqft- " markers, peeling off one field per split.
df['beds'] = [x.get_text() for x in df['beds']]

df[['beds', 'baths']] = df.beds.str.split(" bds", expand=True)
df[['baths', 'sq_feet']] = df.baths.str.split(" ba", expand=True)
df[['sq_feet', 'type']] = df.sq_feet.str.split(" sqft- ", expand=True)


# There are alternative ways to get rid of html tags. Here is the manual way: 
# df['beds'] = df['beds'].astype('str')
# df['beds'] = df['beds'].replace('<ul class="list-card-details"><li class="">','', regex=True)
# df['beds'] = df['beds'].replace('<abbr class="list-card-label"> <!-- -->','',regex=True)
# df['beds'] = df['beds'].replace('</abbr></li><li class="">','-',regex=True)
# df['beds'] = df['beds'].replace('<abbr class="list-card-label"> <!-- -->','',regex=True)
# df['beds'] = df['beds'].replace('</abbr></li><li class="">','-',regex=True)
# df['beds'] = df['beds'].replace('</abbr></li><li class="list-card-statusText">','',regex=True)
# df['beds'] = df['beds'].replace('</li></ul>','',regex=True)
# df[['beds','baths','sq_feet','type','none1']] = df.beds.str.split("-",expand=True)


In [None]:
df

In [None]:
df = df[['price','address','beds','baths','sq_feet','type']]
df

In [None]:
# Convert the cleaned text columns to numeric dtypes.
df['price'] = df['price'].astype('int')
df['beds'] = df['beds'].astype('float')
df['baths'] = df['baths'].astype('float')
# Drop thousands separators and any other non-digit before converting.
# regex=True must be explicit: since pandas 2.0 the pattern is otherwise
# matched literally and values such as "1,500" would fail the float cast.
df['sq_feet'] = df['sq_feet'].str.replace(r'\D', '', regex=True).astype('float')
df

## Obtaining the link of the house/apt



In [None]:
link = soup.find_all(class_= 'list-card-link')
link

In [None]:
# Collect one listing URL per <article> card on the results page.
urls = []

for card in soup.find_all("article"):
    anchor = card.find('a', class_="list-card-link")
    # find() returns None when a card has no listing link; record None for
    # that card instead of crashing, so the list keeps one entry per card.
    urls.append(anchor.get('href') if anchor is not None else None)

# NOTE(review): assumes the page has exactly one <article> per row of df;
# pandas raises a ValueError here if the lengths differ.
df['urls'] = urls
df

## Webscraping each house/apt link.

From each specific link, we will get the descriptions and latitude/longitude of each house/apt for sale

In [None]:
# We'll use same req_header as before to avoid captchas from Zillow...
req_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}

#### I did a lot of trial and error here...

But in summary, I tried with the first link, and then looked inside the results to see where we can find what we want...

In [None]:
df['urls'][0]

In [None]:
# Fetch one listing's detail page to explore its structure.
with requests.Session() as s:
    url = 'https://www.zillow.com/homedetails/1508-S-Mattis-Ave-Champaign-IL-61821/3227757_zpid/'
    # Bound the wait: requests has no default timeout.
    r2 = s.get(url, headers=req_headers, timeout=30)
r2.status_code

In [None]:
r2.text

In [None]:
soup2 = BeautifulSoup(r2.content, 'html.parser')

#### Getting lat/lon

In [None]:
latlon = soup2.find('script', {'type':'application/ld+json'})
latlon = json.loads(latlon.contents[0])
latlon

In [None]:
latlon['geo']['longitude']

#### Getting Description

In [None]:
#description=soup2.find_all(class_='Text-c11n-8-18-0__aiai24-0 sc-qPwPv cZodDt')
description=soup2.find_all(class_='Text-c11n-8-18-0__aiai24-0 sc-qPwPv ielpMy')
description = [d.text for d in description]
description

## Using a loop to evaluate all the links:

In [None]:
descrip = []
descrip2 = []
lat = []
lon = []

# Two class strings are tried for the description text; results for each are
# kept in separate lists (descrip / descrip2).
DESCRIPTION_CLASS_V1 = 'Text-c11n-8-18-0__aiai24-0 sc-qPwPv cZodDt'
DESCRIPTION_CLASS_V2 = 'Text-c11n-8-18-0__aiai24-0 sc-qPwPv ielpMy'


def _texts_by_class(page_soup, css_class):
    """Return the text of every element of *page_soup* carrying *css_class*."""
    return [node.text for node in page_soup.find_all(class_=css_class)]


def _extract_geo(page_soup):
    """Return (latitude, longitude) from the page's first JSON-LD script.

    Returns (None, None) when the script tag is absent or empty, is not
    valid JSON, or has no 'geo' entry with both coordinates.
    """
    tag = page_soup.find('script', {'type': 'application/ld+json'})
    try:
        geo = json.loads(tag.contents[0])['geo']
        return geo['latitude'], geo['longitude']
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        # AttributeError: tag is None; IndexError: empty tag;
        # ValueError: invalid JSON; KeyError/TypeError: unexpected structure.
        return None, None


# Open a dedicated session for the loop rather than reusing one whose
# `with` block has already closed it.
with requests.Session() as s:
    for link in df['urls']:
        if not isinstance(link, str):
            # No URL for this row: keep all four lists aligned with df.
            descrip.append([])
            descrip2.append([])
            lat.append(None)
            lon.append(None)
            continue

        # Separate names so the results-page `soup` / `r` are not overwritten.
        page = s.get(link, headers=req_headers, timeout=30)
        page_soup = BeautifulSoup(page.content, 'html.parser')

        # Getting description (both class variants)
        descrip.append(_texts_by_class(page_soup, DESCRIPTION_CLASS_V1))
        descrip2.append(_texts_by_class(page_soup, DESCRIPTION_CLASS_V2))

        # Getting latitude and longitude:
        latitude, longitude = _extract_geo(page_soup)
        lat.append(latitude)
        lon.append(longitude)
    

In [None]:
descrip;

### Adding the new columns to our DataFrame:

In [None]:
df['lat'] = lat
df['lon'] = lon

# Each entry of descrip / descrip2 is a list of text fragments found on one
# listing page (possibly empty). Join the fragments into a plain string, so
# an empty list becomes ''. Joining keeps the text itself intact, whereas
# casting the list to str would leave repr quotes and backslash escapes in
# the description, and stripping '[' / ']' afterwards would also delete
# brackets that belong to the text.

# description 1
df['descrip'] = [' '.join(parts) for parts in descrip]

# description 2
df['descrip2'] = [' '.join(parts) for parts in descrip2]

In [None]:
# Flag, for each variant, the rows whose scraped text is non-empty.
usedescrip = (df['descrip'].str.len() > 0).tolist()
usedescrip2 = (df['descrip2'].str.len() > 0).tolist()

# Fill 'description' from variant 1 first, then from variant 2, so that
# variant 2 wins on rows where both are non-empty.
for mask, source in ((usedescrip, 'descrip'), (usedescrip2, 'descrip2')):
    df.loc[mask, 'description'] = df.loc[mask, source]

df

In [None]:
df.columns

In [None]:
df = df[['price', 'address', 'beds', 'baths', 'sq_feet', 'type', 'urls', 'lat','lon', 'description']]

In [None]:
df

In [None]:
# In case you want to save the resulting dataframe: 
#df.to_csv(r'ZillowWebscrap_Champaign_page1.csv',index=False)

## 2. Text Analysis

We'll do text analysis on the description of each listing. However, we will use a file that contains all 6 pages from Zillow (instead of only one, as we did above). 

### Remove punctuation and split words

In [None]:
df = pd.read_csv('ZillowWebscrap_Champaign.csv')

In [None]:
df.shape

In [None]:
import string
string.punctuation

In [None]:
# Create a lower case variable. Missing descriptions (NaN after reading the
# CSV) become empty strings so the string steps below do not fail on them.
# New lines are replaced by a space rather than removed, otherwise the last
# word of one line and the first word of the next are glued together.
df['desctiption_lower'] = (
    df['description'].fillna('').str.lower().str.replace("\n", " ", regex=False)
)

# Remove punctuation, plus the curly quotes that string.punctuation does not
# contain (opening and closing, single and double).
remv_punc = str.maketrans('', '', string.punctuation + '“' + "‘" + '”' + '’')

df['description_clean'] = df['desctiption_lower'].str.translate(remv_punc)
# Use of regular expression to remove digits (raw string: "\d" is an invalid
# escape sequence in a normal string literal).
df['description_clean'] = df['description_clean'].str.replace(r"\d+", "", regex=True)

In [None]:
words = [x.split(" ") for x in df['description_clean']]
words;

### Remove stop words

In [None]:
# You need to download the stopwords first. I don't need to do that again.
# import nltk
# nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords

In [None]:
print(stopwords.words('english'))

In [None]:
sw_list = stopwords.words('english') + [" ", '']
sw_list

In [None]:
# One value_counts Series per listing: index = word, value = occurrences.
words_list = [pd.Series(x).value_counts() for x in words]
# Put the per-listing counts side by side (aligned on the word index), fill
# words a listing does not contain with 0, then transpose so that rows are
# listings and columns are words.
word_df = pd.concat(words_list,axis=1).fillna(0).T
word_df

In [None]:
# removing stopwords: keep only the word columns that are not in sw_list.
is_stopword = word_df.columns.isin(sw_list)
words_nsw = word_df.loc[:, ~is_stopword]
words_nsw.head(10)

### Words counts


In [None]:
words_nsw.sum().sort_values(ascending=False).head(10)

In [None]:
# Horizontal bar chart of the 20 words with the highest total count.
words_nsw.sum().sort_values().tail(20).plot(kind='barh',figsize=(7,5));
plt.xlabel("Number of times");
# The listings analysed here are homes for sale, not rentals.
plt.title("Frequently used words in Zillow listings \n Houses for sale. Champaign",fontsize=14);

### An application of Term Frequency - Inverse Document Frequency

In this case, a document will be a row of our original data. (i.e., one listing)

**Term Frequency - Inverse Document Frequency (TF-IDF)** 

Term frequency: how often does a word appear in a document?

Document frequency: How many documents contain this word?

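We weight the term frequency by the (log) inverse document frequency, so that words appearing in many documents count less: $TFIDF = TF \times \log(N/DF)$, where $N$ is the total number of documents.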


In [None]:
words_nswT = words_nsw.T
words_nswT

In [None]:
def tf_calc(column):
    """Scale a vector of counts so that its entries sum to one.

    Parameters
    ----------
    column : pandas.Series or numpy.ndarray
        Word counts to normalise.

    Returns
    -------
    Same type as *column*: each count divided by the total. If the total is
    zero (no counts at all), a vector of zeros is returned instead of the
    NaNs that 0/0 would produce.
    """
    total = column.sum()
    if total == 0:
        # Multiplying by 0.0 keeps the index/shape and yields float zeros.
        return column * 0.0
    return column / total

# Term frequency is computed per document. In words_nswT every column is one
# listing (document) and every row a word, so the counts are normalised
# column by column (axis=0). axis=1 would instead divide each word's counts
# by that word's total over all listings, which is not a term frequency.
tf = words_nswT.apply(tf_calc, axis=0)
tf

In [None]:
#Now calculate IDF:
# tf.shape[1] is the number of columns of tf, i.e. the number of listings.
# (words_nswT!=0).sum(axis=1) counts, for every word (row), the listings in
# which it occurs at least once. IDF is the log of their ratio.
inv_doc_freq = np.log(tf.shape[1]/(words_nswT!=0).sum(axis=1))
inv_doc_freq

In [None]:
# Or, using vectorization method:
# turn the per-word IDF values into a column vector, repeat that column once
# per listing so it has the same shape as tf, then multiply element-wise.
idf_column = np.array(inv_doc_freq)[:, np.newaxis]
idf_mat = np.repeat(idf_column, tf.shape[1], axis=1)

tf_idf = tf * idf_mat

In [None]:
# Checking one listing:
listing_no= 10
tf_idf[listing_no][tf_idf[listing_no]<5.9].sort_values(ascending=False).head(10)

In [None]:
# Checking words with highest values of TF-IDF (overall)
tf_idf['mean'] = tf_idf.mean(axis=1)
tf_idf['mean'].describe()

In [None]:
tf_idf['mean'].sort_values(ascending=False).head(150)

In [None]:
words_high = tf_idf['mean'].sort_values(ascending=False).head(10)
words_high = list(words_high.index)
words_high

In [None]:
tf_idfT = tf_idf.T
tf_idfT_sub = tf_idfT[words_high]


In [None]:
df[words_high] = tf_idfT_sub
df

### Sentiment Analysis

In [None]:
# import nltk
# nltk.download('vader_lexicon')

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

In [None]:
df['sentiment'] = [sid.polarity_scores(x)['compound'] for x in df['description_clean']]

In [None]:
df.columns

### Regression Analysis

In [None]:
from econtools.metrics import reg

In [None]:
print(words_high)

In [None]:
reg(df,"price",["beds", "baths","sq_feet","sentiment"], addcons=True)