# Pre: Data Preparation

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd 
import descartes


## Read data
def _money_to_float(series):
    """Convert currency text such as '$1,234.00' into floats.

    '$' and ',' are removed as literal characters (regex=False), so the
    result does not depend on the pandas version's default for ``regex``.
    Missing values become the string 'nan' and are parsed back to NaN.
    """
    return (series.astype(str)
                  .str.replace('$', '', regex=False)
                  .str.replace(',', '', regex=False)
                  .astype(float))


lists = pd.read_csv('listings.csv')
pd.set_option('display.max_columns', 500)  # show up to 500 columns when a DataFrame is displayed

## Clean data - only for this case, using LA as example
# Remove the known junk fragments as literal text, trim the whitespace they
# leave behind so the anchored pattern can still match, then keep only a
# leading 5-digit ZIP code (anything else becomes NaN).
lists['zipcode'] = lists['zipcode'].astype(str).\
                                    str.replace('ca', '', regex=False).\
                                    str.replace('CA', '', regex=False).\
                                    str.replace('Near ', '', regex=False).\
                                    str.replace('139 S Valencia Ave, Glendora.', '', regex=False).\
                                    str.strip().\
                                    str.extract(r'^(\d{5})', expand=False)

lists['price'] = _money_to_float(lists['price'])

# .copy() makes the filtered frame independent, so the column assignments
# below do not write into a view of the unfiltered data.
lists = lists.query('price > 0').copy()  #filter out listings for free living
lists['cleaning_fee'] = _money_to_float(lists['cleaning_fee'])

# Missing dates become the string 'nan'; later cells filter on that value.
lists['host_since_year'] = lists['host_since'].astype(str).str.split('-').str[0] #extract host join year

lists['list_since_year'] = lists['first_review'].astype(str).str.split('-').str[0] #extract listing upload year

## Summarize data
#lists.describe()


# Part I: Description of Data

In [6]:
## Bar chart showing the number of new listings per first-review year
# Count distinct listing ids per year, then drop the 'nan' bucket that holds
# listings without a first review.
list_growth = (
    lists.groupby('list_since_year')['id']
         .nunique()
         .reset_index()
         .rename(columns={'id': 'new_listing_count'})
         .query('list_since_year != "nan"')
)
list_growth.plot(kind='bar', x='list_since_year', title='New Listing Growth Trend')

## Bar chart showing the number of new hosts per join year
# Same shape as above, counting distinct host ids per join year.
host_growth = (
    lists.groupby('host_since_year')['host_id']
         .nunique()
         .reset_index()
         .rename(columns={'host_id': 'new_host_count'})
         .query('host_since_year != "nan"')
)
host_growth.plot(kind='bar', x='host_since_year', title='New Host Growth Trend')

# Part II: Where to live?

In [5]:
import gmaps  #heatmap
import os
import requests
from requests import RequestException  #automatic extraction

## Preparation
# Per-ZIP summary table. Each column is selected *before* aggregating, so the
# mean is only computed on the column that is needed; averaging the whole frame
# first fails on text columns in recent pandas releases.
by_zipcode = lists.groupby('zipcode')
listzipcode = pd.DataFrame({
    'count': by_zipcode['id'].count(),                          # listings per ZIP
    'avgprice': by_zipcode['price'].mean(),                     # mean nightly price
    'avgscore': by_zipcode['review_scores_rating'].mean(),      # mean overall rating
    'location': by_zipcode['review_scores_location'].mean(),    # mean location score
    'value': by_zipcode['review_scores_value'].mean(),          # mean value score
})
listzipcode = listzipcode.reset_index()  # expose 'zipcode' as a regular column

## 1.Heat map of Airbnb listing density in Los Angeles
# link to get google map api_key: https://developers.google.com/maps/documentation/embed/get-api-key
# The key is read from the environment instead of being committed to the
# source file; export GOOGLE_MAPS_API_KEY before running this cell.
gmaps_api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not gmaps_api_key:
    raise RuntimeError(
        "GOOGLE_MAPS_API_KEY is not set; export your Google Maps API key "
        "before drawing the heat map."
    )
gmaps.configure(api_key = gmaps_api_key)

# One (latitude, longitude) tuple per listing feeds the heat-map layer.
location_columns = lists[['latitude', 'longitude']]
location_tuples = [tuple(x) for x in location_columns.values]
fig = gmaps.figure()
fig.add_layer(gmaps.heatmap_layer(location_tuples))
fig

## 2.Choropleth of Crime Rate by zip code in Los Angeles
# Load the ZIP Code Tabulation Area (ZCTA) geometries and expose the ZCTA code
# under the column name 'zipcode'.
shapefile = (
    gpd.read_file("tl_2018_us_zcta510.shp")[['ZCTA5CE10', 'geometry']]
       .rename(columns={'ZCTA5CE10': 'zipcode'})
)
# Keep only the ZIP codes that also appear in lazip.xlsx (all of its values are
# cast to str; the merge assumes it carries a 'zipcode' column).
la_zip = pd.read_excel('lazip.xlsx').astype(str)
shapefile = shapefile.merge(la_zip, how='inner', on='zipcode')

# Scrape the violent-crime figure per zip code from www.bestplaces.net and merge it into listzipcode
CRIME_URL_PREFIX = 'https://www.bestplaces.net/crime/zip-code/*/*'
CRIME_MARKER = "violent crime is"  # the figure is the first token after this phrase

procrime = {}
for i in listzipcode['zipcode']:
    # Build the URL with '/' explicitly; os.path.join would insert a
    # backslash on Windows.
    url = CRIME_URL_PREFIX + '/' + str(i)
    try:
        html = requests.get(url, timeout=30).text
    except RequestException:
        html = ''  # a failed request counts as "no data" for this zip code
    # Test for the same phrase that is split on, so a page lacking it cannot
    # raise an IndexError.
    tokens = html.split(CRIME_MARKER)[1].split() if CRIME_MARKER in html else []
    procrime[i] = tokens[0].replace('.<small>', '') if tokens else 'NaN'
crime_rate = pd.DataFrame({'crimerate': procrime}).reset_index()
crime_rate.rename(columns = {'index': 'zipcode'}, inplace = True)
# Store numbers rather than text; anything unparseable becomes NaN.
crime_rate['crimerate'] = pd.to_numeric(crime_rate['crimerate'], errors = 'coerce')
listzipcode = listzipcode.merge(crime_rate, on = 'zipcode', how = 'left')

# Merge geodata with cleaned crime rate dataset
# how='right' keeps every row of listzipcode, including zip codes that have no
# polygon in the shapefile.
listchoropleth = shapefile.merge(listzipcode, on='zipcode', how='right')

# Create choropleth of Crime Rate by zip code in Los Angeles
variable_crime = 'crimerate'
# The scraped values may still be text; classification needs numbers.
listchoropleth[variable_crime] = pd.to_numeric(listchoropleth[variable_crime], errors = 'coerce')
vmin, vmax = 50, 100
fig, ax = plt.subplots(1, figsize = (20, 10))
# Longitude / latitude window the map is cropped to.
ax.set_xlim([-119.1, -117.4])
ax.set_ylim([33.6, 34.9])
ax.axis('off')
ax.set_title('Violent Crime by Zip Code in Los Angeles', \
             fontdict = {'fontsize': '25', 'fontweight' : '3'})

ax.annotate('Source: www.bestplaces.net', xy = (0.1, .08), \
            xycoords = 'figure fraction', horizontalalignment = 'left', \
            verticalalignment = 'top', fontsize = 12, color = '#555555')

img_crime = listchoropleth.plot(column = variable_crime, scheme = 'fisher_jenks', \
                                cmap = 'Reds', linewidth = 0.8, ax = ax, edgecolor = '0.8')
# Create colorbar as legend
# NOTE(review): the colorbar uses a fixed linear vmin..vmax scale while the map
# is coloured by Fisher-Jenks classes - confirm the legend is meant to be
# indicative only.
sm_crime = plt.cm.ScalarMappable(cmap = 'Reds', norm = plt.Normalize(vmin = vmin, vmax = vmax))
sm_crime.set_array([])  # public replacement for poking the private _A attribute
# Pass ax explicitly: a mappable that is not attached to any Axes gives
# Matplotlib no way to infer where the colorbar should go.
cbar_crime = fig.colorbar(sm_crime, ax = ax)

## 3.Choropleth of Average Price by zip code in Los Angeles
variable_price = 'avgprice'
vmin, vmax = 0, 2000
fig, ax = plt.subplots(1, figsize=(20, 10))
# Longitude / latitude window the map is cropped to.
ax.set_xlim([-119.1, -117.4])
ax.set_ylim([33.6, 34.9])
ax.axis('off')
ax.set_title('Average Price by Zip Code in Los Angeles', \
             fontdict = {'fontsize': '25', 'fontweight' : '3'})

ax.annotate('Source: Inside Airbnb', xy = (0.1, .08), \
            xycoords = 'figure fraction', horizontalalignment = 'left', \
            verticalalignment = 'top', fontsize = 12, color = '#555555')

img_price = listchoropleth.plot(column = variable_price, scheme='fisher_jenks', \
                                cmap = 'Blues', linewidth = 0.8, ax = ax, edgecolor = '0.8')

# Create colorbar as legend
# NOTE(review): the colorbar uses a fixed linear vmin..vmax scale while the map
# is coloured by Fisher-Jenks classes - confirm the legend is meant to be
# indicative only.
sm_price = plt.cm.ScalarMappable(cmap = 'Blues', norm = plt.Normalize(vmin = vmin, vmax = vmax))
sm_price.set_array([])  # public replacement for poking the private _A attribute
# Pass ax explicitly: a mappable that is not attached to any Axes gives
# Matplotlib no way to infer where the colorbar should go.
cbar_price = fig.colorbar(sm_price, ax = ax)


## 4.Line chart showing rating distribution of most expensive listings
# listzipcode.query('avgprice > 1000')
# Review sub-score columns and the short labels used on the chart.
rating_labels = {
    'review_scores_accuracy': 'accuracy',
    'review_scores_cleanliness': 'cleanliness',
    'review_scores_checkin': 'checkin',
    'review_scores_communication': 'communication',
    'review_scores_location': 'location',
    'review_scores_value': 'value',
}
# Listings in the three selected zip codes, reduced to zipcode + sub-scores.
df_price = (
    lists.query('zipcode in ["90077","90210","90265"]')
         [['zipcode', *rating_labels]]
         .rename(columns=rating_labels)
)
# Mean of each sub-score across those listings, plotted as one line.
high_price = df_price[list(rating_labels.values())].mean()
high_price.plot(kind='line', title='Rating Distribution of Most Expensive Listings')

## 5.Horizontal bar chart for 20 neighbourhoods with highest score of value
# Select each column before aggregating so that only the needed column is
# averaged; averaging the whole frame first fails on text columns in recent
# pandas releases.
by_neighbourhood = lists.groupby('neighbourhood_cleansed')
listvalue = pd.DataFrame({
    'count': by_neighbourhood['id'].count(),
    'value': by_neighbourhood['review_scores_value'].mean(),
})
# Keep neighbourhoods with more than 100 listings, then the 20 best by value
# score (ascending sort + tail puts the best one at the top of the barh chart).
listvalue = listvalue.reset_index().\
                        query('count>100').\
                        sort_values(by = 'value').\
                        tail(20).\
                        rename(columns = {'neighbourhood_cleansed':'neighbourhood'})

listvalue.plot(kind = 'barh',
               figsize = (8, 8),
               y = 'value',
               x = 'neighbourhood',
               title = '20 Neighbourhoods with Highest Score of Value',
               xlim = (9.4,10))


# Part III: Which listing to select?
### 1. What kind of hosts can be trusted?

In [4]:
## 1.Airbnb authentication policies - "Superhost", identity verification, host profile pic
import scipy.stats as stats
from scipy.stats import ttest_ind
import seaborn as sns 
from statsmodels.stats.multicomp import (pairwise_tukeyhsd, MultiComparison)

# Per-host averages. The columns are selected with a list (tuple-style
# selection on a GroupBy is no longer accepted by pandas), and
# numeric_only=True skips any of them that is stored as text.
listhost = lists.groupby('host_id')[['host_total_listings_count',
                                     'host_response_rate',
                                     'number_of_reviews',
                                     'review_scores_rating',
                                     'reviews_per_month']].mean(numeric_only = True).reset_index()
# Host attributes, one row per host: without de-duplication the merge below
# would repeat every host once per listing and inflate the test samples.
df = lists[['host_id',
             'host_is_superhost',
             'host_has_profile_pic',
             'host_response_time',
             'host_identity_verified',
             'host_verifications',
             'host_neighbourhood']].drop_duplicates(subset = 'host_id')
listhost = listhost.merge(df, on = 'host_id', how = 'left')


def _compare_rating_by_flag(flag):
    """Box-plot and t-test the hosts' mean rating for flag == "t" vs "f".

    Draws the box plot on a new figure, prints the scipy ``ttest_ind``
    result and returns it. Only rows missing the flag or the rating are
    dropped, so unrelated missing fields do not shrink the samples.
    """
    plt.figure()  # separate figure so successive box plots do not overdraw each other
    sns.boxplot(x = flag, y = "review_scores_rating", data = listhost, showfliers = False)
    rated = listhost[[flag, 'review_scores_rating']].dropna()
    result = ttest_ind(rated.loc[rated[flag] == "t", 'review_scores_rating'],
                       rated.loc[rated[flag] == "f", 'review_scores_rating'])
    print(flag, result)
    return result


for host_flag in ("host_is_superhost", "host_has_profile_pic", "host_identity_verified"):
    _compare_rating_by_flag(host_flag)

## 2.Host's cancellation policy

# Two policies are excluded from the comparison (notes kept from exploration):
#   lists.query('cancellation_policy == "super_strict_30"')  -> only 9 listings
#   lists.query('cancellation_policy == "strict"')           -> only 63 listings; this policy
#       has changed to "strict_14_with_grace_period" from 2018

# Rating / policy pairs with no missing values, minus the two excluded policies.
list_cancel = (
    lists[['review_scores_rating', 'cancellation_policy']]
        .dropna()
        .query('cancellation_policy !=["super_strict_30","strict"]')
)
# Tukey HSD pairwise comparison of ratings across the remaining policies.
res = pairwise_tukeyhsd(endog=list_cancel['review_scores_rating'],
                        groups=list_cancel['cancellation_policy'])
res.plot_simultaneous()


### 2. What do ratings and reviews tell us?

In [9]:
import nltk
nltk.download('wordnet') 
import textblob
import langdetect
from textblob import TextBlob, Word
from nltk.tokenize import sent_tokenize
from langdetect import detect
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator 


# Pre: read review data and remove non-English comments
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException

# langdetect is randomised by default; a fixed seed makes runs reproducible.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return langdetect's language code for text, or "non-en" if detection fails.

    langdetect raises LangDetectException when it finds nothing to work with
    (for example an empty string).
    """
    try:
        return detect(text)
    except LangDetectException:
        return "non-en"


reviews = pd.read_csv('reviews.csv')

reviews['comments'] = reviews['comments'].astype(str)
# One column-wise assignment instead of writing cell by cell through chained
# indexing, which pandas does not guarantee to write back to the frame.
reviews['language'] = reviews['comments'].apply(_detect_language)
# .copy() so that columns added later do not write into a view of the full table.
reviews = reviews.query('language =="en" ').copy()


## 1.Scatter plot of reviews per month (y) against overall rating score (x)
lists.plot.scatter(x='review_scores_rating', y='reviews_per_month')


## 2.Wordcloud of reviews for each listing
# Load one stop word per line. The context manager closes the file even if
# reading fails.
with open('stopwords.txt','r') as stopword_file:
    # rstrip() drops the newline and trailing whitespace; strip(' ') then
    # removes any remaining surrounding spaces.
    stopwords = [line.rstrip().strip(' ') for line in stopword_file]
    
def comment(id):
    '''
    Return the reviews written for one listing.

    Reads the module-level ``reviews`` table and selects the rows whose
    'listing_id' equals ``id`` with a boolean mask (no Python-level row loop).

    Parameters: id - listing id to look up (the name shadows the builtin, but
                     is kept because callers may pass it by keyword).
    Returns:    a new DataFrame with columns 'listing_id' and 'comments',
                indexed like the matching rows of ``reviews``; empty when the
                listing has no reviews. It is a copy, so callers may modify it.
    '''
    is_match = reviews['listing_id'] == id
    return reviews.loc[is_match, ['listing_id', 'comments']].copy()
    
def wordcloud(id):
    '''
    Generate a word cloud for the listing with the given id.

    The listing's comments are lower-cased, stripped of punctuation and stop
    words, spell-corrected and lemmatized with TextBlob, then rendered with
    WordCloud and saved to "wordcloud.png" via matplotlib.

    Parameters: id - listing id whose comments are visualised.
    Returns:    None (the result is the drawn / saved image).
    '''
    df = comment(id)
    stop_set = set(stopwords)  # set membership instead of scanning the list per word

    # text cleaning for listing comments
    df['comments'] = df['comments'].apply(lambda text: " ".join(w.lower() for w in text.split()))   #lower cases
    # The pattern is a regular expression, so say so explicitly: with
    # regex=False it would be searched for as literal text and nothing would
    # be removed.
    df['comments'] = df['comments'].str.replace(r'[^\w\s]', '', regex=True)   #remove punctuation
    df['comments'] = df['comments'].apply(lambda text: " ".join(w for w in text.split() if w not in stop_set))  #remove stop words
    df['comments'] = df['comments'].apply(lambda text: str(TextBlob(text).correct()))  #correct spelling
    df['comments'] = df['comments'].apply(lambda text: " ".join([Word(w).lemmatize() for w in text.split()]))  #lemmatization

    # create wordcloud
    total_comments = " ".join(df['comments'])
    # Named 'cloud' so the local does not shadow this function.
    cloud = WordCloud(background_color = "white").generate(total_comments)
    plt.imshow(cloud, interpolation = 'bilinear')
    plt.axis("off")
    plt.savefig("wordcloud.png")


## 3.Clustering Summarization
def summarization(id):
    '''
    Print and return an extractive summary of a listing's reviews.

    The listing's comments are split into sentences, each sentence is embedded
    with the module-level ``encoder``, the embeddings are clustered with
    KMeans, and the sentence closest to each cluster centre is kept.

    Parameters: id - listing id whose comments are summarised.
    Returns:    the summary string (also printed); an empty string when the
                listing has no sentences to summarise.
    '''
    df = comment(id)
    # tokenize English sentences
    eng_comments = " ".join(df['comments'])
    tok_comments = sent_tokenize(eng_comments)
    if not tok_comments:
        # Nothing to embed or cluster; KMeans cannot be fitted on zero samples.
        summary = ""
        print(summary)
        return summary

    # skip thoughts sentence embedding
    enc_com = encoder.encode(tok_comments, verbose=False)   #pre-trained model can be checked in #Appendix#

    # KMeans clustering
    n_clusters = int(np.ceil(len(enc_com)**0.2))  #determine number of clusters
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(enc_com)

    # For every cluster centre, the index of the nearest embedded sentence.
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, enc_com)
    summary = " ".join(tok_comments[idx] for idx in closest)
    print(summary)
    return summary

    
## 4.Sentiment Analysis
# Store the first field of TextBlob's sentiment result (its polarity) for each comment.
reviews['sentiment'] = reviews['comments'].apply(lambda text: TextBlob(text).sentiment.polarity)


## Appendix: Import pre-trained Skip Thoughts encode model
import scipy.spatial.distance as sd
import configuration
import encoder_manager

# Files of the pre-trained bidirectional skip-thoughts model.
VOCAB_FILE = "../skip_thoughts_bi_2017_02_16/vocab.txt"
EMBEDDING_MATRIX_FILE = "../skip_thoughts_bi_2017_02_16/embeddings.npy"
CHECKPOINT_PATH = "../skip_thoughts_bi_2017_02_16/model.ckpt-500008"

# Create the encoder manager and load the model into it; summarization()
# reads the resulting module-level `encoder`.
encoder = encoder_manager.EncoderManager()
encoder.load_model(
    configuration.model_config(bidirectional_encoder=True),
    checkpoint_path=CHECKPOINT_PATH,
    embedding_matrix_file=EMBEDDING_MATRIX_FILE,
    vocabulary_file=VOCAB_FILE,
)
