In [3]:
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import umap
from nltk.tokenize import word_tokenize
import nltk
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
import string

In [4]:
# note scipy==1.7.3 has the function 'triu', which is needed in Word2Vec
# pip uninstall scipy -y
# !pip install scipy==1.7.3

In [5]:
import scipy.linalg
print(dir(scipy.linalg))



In [6]:
listings = pd.read_csv('./data/listings.csv')
listings.isnull().sum()
selection = ['id','name','description','neighborhood_overview','host_is_superhost', 'host_response_time',
             'host_response_rate','host_acceptance_rate','host_total_listings_count',  
             'neighbourhood','latitude','longitude','property_type','room_type','accommodates','bathrooms',
             'bathrooms_text','bedrooms','beds','amenities','price','number_of_reviews','review_scores_rating', 
             'review_scores_accuracy','review_scores_cleanliness','review_scores_checkin',
             'review_scores_communication', 'review_scores_location','review_scores_value',
             'instant_bookable']  
selected_listings = listings[selection]
selected_listings.loc[:,'description'] = selected_listings['description'].astype(str).str.replace('<br />', '').apply(lambda x: re.sub(r'[^\w\s]', '', x))
# selected_listings.isnull().sum()

### category variables

In [7]:
# host_response_time, room_type, bathrooms_text
category_columns = ['host_response_time', 'room_type', 'bathrooms_text']
# selected_listings = pd.get_dummies(selected_listings, columns=category_columns)

label_encoder = LabelEncoder()
for column in category_columns:
    selected_listings.loc[:,column] = label_encoder.fit_transform(selected_listings.loc[:,column])

### continuous variables

In [8]:
# deal with x% (transform string type to float)
def percentage_to_float(value):
    if pd.isna(value):
        return None
    else:
        return float(value) / 100

# Convert percentage strings to float
selected_listings.loc[:,'host_response_rate'] = selected_listings.loc[:,'host_response_rate'].str.replace('%', '')
selected_listings.loc[:,'host_acceptance_rate'] = selected_listings.loc[:,'host_acceptance_rate'].str.replace('%', '')
selected_listings.loc[:,'price'] = selected_listings.loc[:,'price'].str.replace('$', '').str.replace(',', '').astype(float)

numeric_features = ['host_response_rate', 'host_acceptance_rate', 'host_total_listings_count', 'price',
                    'latitude', 'longitude', 'accommodates', 'bathrooms', 'bedrooms', 'beds','number_of_reviews']
selected_listings.dropna(subset=numeric_features, inplace=True)

scaler = StandardScaler()
selected_listings.loc[:,numeric_features] = scaler.fit_transform(selected_listings.loc[:,numeric_features])

rate_columns = ['review_scores_rating', 'review_scores_accuracy', 'review_scores_cleanliness', 
                'review_scores_checkin', 'review_scores_communication','review_scores_location', 
                'review_scores_value']
selected_listings.dropna(subset=rate_columns, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_listings.dropna(subset=numeric_features, inplace=True)
 -0.42803414]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  selected_listings.loc[:,numeric_features] = scaler.fit_transform(selected_listings.loc[:,numeric_features])
 -0.81333409]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  selected_listings.loc[:,numeric_features] = scaler.fit_transform(selected_listings.loc[:,numeric_features])
  0.83582241]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  selected_listings.loc[:,numeric_features] = scaler.fit_transform(selected_listings.loc[:,numeric_features])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the do

### Boolean variables

In [9]:
boolean_features = ['instant_bookable','host_is_superhost']
selected_listings.loc[:, 'instant_bookable'] = selected_listings['instant_bookable'].apply(lambda x: 1 if x == 't' else 0)
selected_listings.loc[:,'host_is_superhost'] = selected_listings['host_is_superhost'].apply(lambda x: 1 if x == 't' else 0)

### train Word2Vec model for text features

In [10]:
# text features: name, description, neighborhood_overview, property_type, neighbourhood, amenities
corpus = selected_listings['name'].fillna('')+ selected_listings['description'].fillna('') + selected_listings['neighborhood_overview'].fillna('') + ' ' + selected_listings['property_type'].fillna('') + ' ' + selected_listings['neighbourhood'].fillna('') + ' ' + selected_listings['amenities'].fillna('')
tokenized_corpus = corpus.apply(lambda x: word_tokenize(x.lower()))
# train word2vec model
model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)

In [11]:
def document_vector(doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in model.wv.index_to_key]
    return np.mean(model.wv[doc], axis=0) if doc else np.zeros(model.vector_size)

doc_vectors = []

# Manually iterate over the tokenized corpus with a progress bar
for doc in tqdm(tokenized_corpus, total=len(tokenized_corpus)):
    vector = document_vector(doc)
    doc_vectors.append(vector)

# Convert list to DataFrame or Series, depending on your requirement
doc_vectors = pd.Series(doc_vectors)

100%|██████████| 10313/10313 [01:30<00:00, 114.36it/s]


### Aggregate Items Features

In [12]:
features = selected_listings[category_columns+numeric_features+rate_columns+boolean_features]
combined_features = np.hstack([features, doc_vectors.tolist()])

In [13]:
combined_features.shape

(10313, 123)

### Aggregate User Features

In [15]:
reviews = pd.read_csv('./data/reviews-Sydney.csv')
reviews.isnull().sum()
reviews.dropna(inplace=True)
reviews['comments'] = reviews['comments'].str.replace('<br/>', '').apply(lambda x: re.sub(r'\s+', ' ', x))
reviews_column = ['id','reviewer_id','comments']
reviews = reviews[reviews_column]
reviews.shape

(499522, 3)

In [16]:
reviews_grouped = reviews.groupby('reviewer_id')['comments'].agg(lambda x: ' '.join(x)).reset_index()
reviews_grouped.head()

Unnamed: 0,reviewer_id,comments
0,19,"Cat is a such a caring and pleasant host, and ..."
1,46,You cannot beat the location of this place. An...
2,795,We really enjoyed Linda’s place. We felt at ho...
3,1008,StudioKB was just right for what I needed - ve...
4,1187,"The location and view are unparalleled, checki..."


In [17]:
nltk.download('punkt')
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))  

def preprocess(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    return words

reviews_grouped['processed_comments'] = reviews_grouped['comments'].apply(preprocess)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\fyr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fyr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
model = Word2Vec(sentences=reviews_grouped['processed_comments'], vector_size=100, window=5, min_count=2, workers=4)
model.save("word2vec_reviewer_comments.model")

In [19]:
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    nwords = 0.

    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])

    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

vocabulary = set(model.wv.index_to_key)
reviews_grouped['feature_vector'] = reviews_grouped['processed_comments'].apply(
    lambda x: average_word_vectors(x, model, vocabulary, 100))

In [20]:
user_features = np.array(reviews_grouped['feature_vector'].tolist())
num_users = user_features.shape[0]
additional_features = np.zeros((num_users, 23))

# combine with function hstack
user_features = np.hstack([user_features, additional_features])

In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import random
from IPython.display import display, HTML
import ipywidgets as widgets

id_input = widgets.IntText(
    value=0,
    description='Please input a user ID:',
    disabled=False,
    style={'description_width': 'initial'}
)

submit_button = widgets.Button(description="Search", button_style='success') 

output = widgets.Output()

def on_button_clicked(b):
    with output:
        output.clear_output()
        
        user_id = id_input.value
        if 0 <= user_id <= 412495:
            try:
                single_user_feature = user_features[user_id].reshape(1, -1)
                similarities = cosine_similarity(single_user_feature, combined_features)[0]
                indexed_similarities = list(zip(selected_listings['id'], similarities))
                sorted_listings = sorted(indexed_similarities, key=lambda x: x[1], reverse=True)

                top_n = 5
                top_recommendations = sorted_listings[:top_n]
                recommended_ids = [item[0] for item in top_recommendations]
                recommend_listings = listings[listings['id'].isin(recommended_ids)]
                
                data = {'Room ID': recommended_ids,
                        'Room Link': [f'<a href="{link}">{link}</a>' for link in recommend_listings['listing_url']]}
                df = pd.DataFrame(data)
                
                display(HTML(df.to_html(escape=False)))
            except Exception as e:
                print(f"ERROR: {e}")
        else:
            print("Please input a valid user ID (from 0 to 412495).")

submit_button.on_click(on_button_clicked)

form = widgets.VBox([widgets.HBox([id_input, submit_button]), output])
display(form)

VBox(children=(HBox(children=(IntText(value=0, description='Please input a user ID:', style=DescriptionStyle(d…