# Airbnb - Reviews & Price NLP

In [57]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


[nltk_data] Downloading package stopwords to /Users/jf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [55]:
la_rev = pd.read_csv('data/la_reviews.csv')
la_list = pd.read_csv('data/la_listings.csv')

[nltk_data] Downloading package stopwords to /Users/jf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 'Reviews' DF cleaning

In [3]:
la_rev

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,8941071,68391055,2016-04-04,10164333,Smruti,"Danielle was a great host, she was extremely r..."
1,8941071,153719836,2017-05-21,97944097,Rob,The apartment was great for us to spend the we...
2,8941071,147589354,2017-04-27,4123723,Widya,"Danielle is a great host, very concerned with..."
3,8941071,145742425,2017-04-19,1459499,Darian,Great location and spacious. Danielle's place ...
4,8941071,144400833,2017-04-15,98494277,Charlie,"Danielle's place was as expected, really good ..."
...,...,...,...,...,...,...
1532920,837764720715019063,882048452895718293,2023-05-01,61709926,Xana,Alexis and David were incredible hosts - frien...
1532921,837764720715019063,866817204898142401,2023-04-10,20783675,Priscila,Such an amazing and beautiful place! David and...
1532922,837764720715019063,864618654800651449,2023-04-07,407565907,Petra,The owners were very kind and helpful. Beautif...
1532923,837764720715019063,850814630932374093,2023-03-19,129150527,Mike,"Beautiful space and view, very friendly hosts,..."


In [17]:
la_rev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1532925 entries, 0 to 1532924
Data columns (total 6 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   listing_id     1532925 non-null  int64 
 1   id             1532925 non-null  int64 
 2   date           1532925 non-null  object
 3   reviewer_id    1532925 non-null  int64 
 4   reviewer_name  1532925 non-null  object
 5   comments       1532639 non-null  object
dtypes: int64(3), object(3)
memory usage: 70.2+ MB


In [23]:
la_rev['comments'].isna().sum()

286

In [24]:
la_rev[la_rev['comments'].isna()]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
11041,12232198,597570153,2020-01-25,109869097,Larry,
36262,18771571,499592244,2019-07-31,200662007,Lilyan,
38689,19966810,428734333,2019-03-25,248109683,Danielle,
39068,20012997,500736816306141481,2021-11-21,78355861,Jill,
40846,19434972,486204261023070433,2021-11-01,110160469,Farnaz,
...,...,...,...,...,...,...
1511887,16661893,621560353,2020-04-01,342670536,Al,
1514989,46444583,868264023745043602,2023-04-12,23149537,Flannery,
1517560,19332411,222064868,2017-12-28,113707624,Betsy,
1524215,30460122,561530122,2019-11-09,207881679,Bianca,


In [25]:
la_rev.dropna(subset=['comments'], inplace=True)

In [26]:
la_rev.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1532639 entries, 0 to 1532924
Data columns (total 6 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   listing_id     1532639 non-null  int64 
 1   id             1532639 non-null  int64 
 2   date           1532639 non-null  object
 3   reviewer_id    1532639 non-null  int64 
 4   reviewer_name  1532639 non-null  object
 5   comments       1532639 non-null  object
dtypes: int64(3), object(3)
memory usage: 81.9+ MB


In [40]:
la_rev[la_rev['listing_id'] == 109]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
497553,109,74506539,2016-05-15,22509885,Jenn,Me and two friends stayed for four and a half ...
497554,109,449036,2011-08-15,927861,Edwin,The host canceled my reservation the day befor...


In [27]:
la_rev_con = la_rev.groupby(['listing_id'], as_index=False).agg({'comments': " ".join})

In [28]:
la_rev_con

Unnamed: 0,listing_id,comments
0,109,Me and two friends stayed for four and a half ...
1,2708,Charles is the man!! Just wrapped up an amazin...
2,2732,"Unfortunately, I was really disappointed with ..."
3,6033,Sarah was a great host. She was always quick t...
4,6931,The best host and best stay I've ever had with...
...,...,...
32956,968513441909611726,BEAUTIFUL HOME. GREAT LOCATION. AWESOME SUPER ...
32957,969535403681694277,We had a perfect time at Sean’s cottage. It wa...
32958,969626715256159808,Kelly was communicative and super responsive. ...
32959,970252209631292696,Such a cute spot in a nice neighborhood… check...


In [37]:
la_rev_con['comments'][0]

"Me and two friends stayed for four and a half months. It was a great place to stay! The apartment was very comfortable and I really enjoyed having the park with running path across the street. The only downside was it wasn't within walking distance to restaurants, bars, or coffee shops. But they are a short drive away. Overall, great stay! The host canceled my reservation the day before arrival."

In [39]:
la_rev_con[la_rev_con['listing_id'] == 8941071]['comments']

3003    Danielle was a great host, she was extremely r...
Name: comments, dtype: object

## 'Listings' DF cleaning

In [5]:
la_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44594 entries, 0 to 44593
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            44594 non-null  int64  
 1   listing_url                                   44594 non-null  object 
 2   scrape_id                                     44594 non-null  int64  
 3   last_scraped                                  44594 non-null  object 
 4   source                                        44594 non-null  object 
 5   name                                          44594 non-null  object 
 6   description                                   43937 non-null  object 
 7   neighborhood_overview                         25053 non-null  object 
 8   picture_url                                   44594 non-null  object 
 9   host_id                                       44594 non-null 

In [8]:
la_list[la_list['id'] == 8941071]

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
4276,8941071,https://www.airbnb.com/rooms/8941071,20230903194229,2023-09-03,previous scrape,Rental unit in West Hollywood · ★4.81 · 1 bedr...,"Perfect heart of the city feeling, short walk ...",,https://a0.muscache.com/pictures/ba9498f2-815d...,23592617,...,5.0,4.95,4.84,,f,1,1,0,0,0.24


In [9]:
la_list['review_scores_rating'].value_counts()

5.00    10388
4.50      838
4.00      778
4.67      755
4.88      728
        ...  
3.56        1
2.75        1
3.14        1
3.96        1
4.03        1
Name: review_scores_rating, Length: 157, dtype: int64

In [10]:
la_small = la_list[['id', 'price', 'bedrooms', 'review_scores_rating', 'neighbourhood_cleansed']]

In [45]:
la_small['bedrooms'].value_counts()

1.0     13872
2.0      8313
3.0      5008
4.0      2494
5.0      1008
6.0       334
7.0       124
8.0        40
9.0        20
10.0       12
11.0        6
14.0        4
12.0        4
13.0        3
18.0        2
20.0        1
23.0        1
16.0        1
32.0        1
24.0        1
19.0        1
15.0        1
Name: bedrooms, dtype: int64

In [46]:
df_la_1bd = la_small[la_small['bedrooms'] == 1.0]

In [47]:
df_la_1bd['price'] = df_la_1bd['price'].str.split('.').str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_la_1bd['price'] = df_la_1bd['price'].str.split('.').str[0]


In [48]:
df_la_1bd['price'] = df_la_1bd['price'].str.replace(",", "")
df_la_1bd['price']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_la_1bd['price'] = df_la_1bd['price'].str.replace(",", "")


2         $69
3        $120
8        $201
11        $88
15        $60
         ... 
44586    $175
44587    $194
44588    $180
44590    $168
44593    $480
Name: price, Length: 13872, dtype: object

In [49]:
df_la_1bd['price'] = df_la_1bd['price'].str.replace('$', '')
df_la_1bd

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_la_1bd['price'] = df_la_1bd['price'].str.replace('$', '')


Unnamed: 0,id,price,bedrooms,review_scores_rating,neighbourhood_cleansed
2,942342470598573002,69,1.0,,Rowland Heights
3,41240375,120,1.0,5.00,Playa del Rey
8,15239926,201,1.0,4.99,Santa Clarita
11,14821183,88,1.0,3.00,Diamond Bar
15,18976122,60,1.0,,Fairfax
...,...,...,...,...,...
44586,720164781296601135,175,1.0,4.75,West Hollywood
44587,960073745720655216,194,1.0,,Downtown
44588,680301812424175952,180,1.0,4.50,Downtown
44590,674665451875208878,168,1.0,,Downtown


## Merge 'Reviews' and 'Listings'

In [50]:
df_merge = df_la_1bd.merge(la_rev_con, how='left', left_on='id', right_on='listing_id')

In [52]:
df_merge

Unnamed: 0,id,price,bedrooms,review_scores_rating,neighbourhood_cleansed,listing_id,comments
0,942342470598573002,69,1.0,,Rowland Heights,,
1,41240375,120,1.0,5.00,Playa del Rey,4.124038e+07,Paola is the best host I have ever had. She ha...
2,15239926,201,1.0,4.99,Santa Clarita,1.523993e+07,Fantastic super hosts and space . What a beaut...
3,14821183,88,1.0,3.00,Diamond Bar,1.482118e+07,Quiet house. Bedroom is enough for sleeping. u...
4,18976122,60,1.0,,Fairfax,,
...,...,...,...,...,...,...,...
13867,720164781296601135,175,1.0,4.75,West Hollywood,7.201648e+17,"Great location, great host, beautiful apartmen..."
13868,960073745720655216,194,1.0,,Downtown,,
13869,680301812424175952,180,1.0,4.50,Downtown,6.803018e+17,Very convenient and comfortable. It had everyt...
13870,674665451875208878,168,1.0,,Downtown,,


In [51]:
df_merge['comments'].isna().sum()

4003

In [53]:
df_merge.dropna(subset=['comments'], inplace=True)

In [54]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9869 entries, 1 to 13869
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      9869 non-null   int64  
 1   price                   9869 non-null   object 
 2   bedrooms                9869 non-null   float64
 3   review_scores_rating    9869 non-null   float64
 4   neighbourhood_cleansed  9869 non-null   object 
 5   listing_id              9869 non-null   float64
 6   comments                9869 non-null   object 
dtypes: float64(3), int64(1), object(3)
memory usage: 616.8+ KB


## Pre-Processing Text

In [68]:
# Create an intance of the RegexpTokenizer with the variable name `tokenizer`
# The regex pattern should select all words with three or more characters

token_pattern = r"(?u)\w{3,}"
tokenizer = RegexpTokenizer(token_pattern)

# Create a list of stopwords in English
stopwords_list = stopwords.words('english')

In [71]:
def preprocess_text(text, tokenizer, stopwords_list):
    # Standardize case (lowercase the text)
    text_std = text.lower()
    # Tokenize
    token_list = tokenizer.tokenize(text_std)
    # Remove stopwords
    stopwords_removed = [token for token in token_list if token not in stopwords_list]
    return stopwords_removed
   

In [72]:
reviews_proc = df_merge.comments.apply(lambda x: preprocess_text(x, tokenizer, stopwords_list))
reviews_proc

1        [paola, best, host, ever, taken, care, every, ...
2        [fantastic, super, hosts, space, beautiful, pl...
3        [quiet, house, bedroom, enough, sleeping, uncl...
6        [amazing, experience, house, also, recently, u...
7        [nice, neighborhood, hosts, great, location, h...
                               ...                        
13862    [super, host, thank, apartment, perfect, work,...
13863    [great, location, overall, decent, stay, bigge...
13864    [since, sent, review, guess, reviewing, confir...
13867    [great, location, great, host, beautiful, apar...
13869    [convenient, comfortable, everything, could, n...
Name: comments, Length: 9869, dtype: object

### Tag & Lemmatize

In [73]:
#Create Lemmatizer
lemmatizer = WordNetLemmatizer()

In [74]:
#Map POS tag to first character for use in WordNetLemmatizer
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to NOUN

In [76]:
#POS Tagging
tagged_text = reviews_proc.apply(lambda x: pos_tag(x))

KeyboardInterrupt: 

In [77]:
tagged_text

1        [(paola, NN), (best, JJS), (host, NN), (ever, ...
2        [(fantastic, JJ), (super, JJ), (hosts, NNS), (...
3        [(quiet, JJ), (house, NN), (bedroom, NN), (eno...
6        [(amazing, VBG), (experience, NN), (house, NN)...
7        [(nice, JJ), (neighborhood, NN), (hosts, NNS),...
                               ...                        
13862    [(super, NN), (host, NN), (thank, NN), (apartm...
13863    [(great, JJ), (location, NN), (overall, JJ), (...
13864    [(since, IN), (sent, VBN), (review, NN), (gues...
13867    [(great, JJ), (location, NN), (great, JJ), (ho...
13869    [(convenient, NN), (comfortable, JJ), (everyth...
Name: comments, Length: 9869, dtype: object

In [80]:
#Lemmatize
proccessed_rev = tagged_text.apply(lambda x: [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in x])

In [83]:
processed_rev = proccessed_rev 

In [84]:
df_merge['processed_reviews'] = processed_rev
df_merge

Unnamed: 0,id,price,bedrooms,review_scores_rating,neighbourhood_cleansed,listing_id,comments,processed_reviews
1,41240375,120,1.0,5.00,Playa del Rey,4.124038e+07,Paola is the best host I have ever had. She ha...,"[paola, best, host, ever, take, care, every, d..."
2,15239926,201,1.0,4.99,Santa Clarita,1.523993e+07,Fantastic super hosts and space . What a beaut...,"[fantastic, super, host, space, beautiful, pla..."
3,14821183,88,1.0,3.00,Diamond Bar,1.482118e+07,Quiet house. Bedroom is enough for sleeping. u...,"[quiet, house, bedroom, enough, sleep, unclean]"
6,26296415,180,1.0,5.00,Torrance,2.629642e+07,Amazing experience. The house was also recentl...,"[amaze, experience, house, also, recently, upg..."
7,22746714,35,1.0,4.57,North El Monte,2.274671e+07,Nice neighborhood and hosts. Great location. T...,"[nice, neighborhood, host, great, location, ho..."
...,...,...,...,...,...,...,...,...
13862,575384126844892676,159,1.0,4.67,West Hollywood,5.753841e+17,"Super host, thank you! This apartment was perf...","[super, host, thank, apartment, perfect, work,..."
13863,16072625,177,1.0,4.14,East Hollywood,1.607262e+07,Great location. And overall a decent stay. My ...,"[great, location, overall, decent, stay, big, ..."
13864,924091269757225413,120,1.0,1.00,Beverly Hills,9.240913e+17,"Since they sent me a review, I guess I’m revie...","[since, send, review, guess, review, confirm, ..."
13867,720164781296601135,175,1.0,4.75,West Hollywood,7.201648e+17,"Great location, great host, beautiful apartmen...","[great, location, great, host, beautiful, apar..."


In [85]:
# Convert token lists to strings
df_merge['processed_reviews'] = df_merge['processed_reviews'].str.join(' ')

In [86]:
df_merge

Unnamed: 0,id,price,bedrooms,review_scores_rating,neighbourhood_cleansed,listing_id,comments,processed_reviews
1,41240375,120,1.0,5.00,Playa del Rey,4.124038e+07,Paola is the best host I have ever had. She ha...,paola best host ever take care every detail al...
2,15239926,201,1.0,4.99,Santa Clarita,1.523993e+07,Fantastic super hosts and space . What a beaut...,fantastic super host space beautiful place sta...
3,14821183,88,1.0,3.00,Diamond Bar,1.482118e+07,Quiet house. Bedroom is enough for sleeping. u...,quiet house bedroom enough sleep unclean
6,26296415,180,1.0,5.00,Torrance,2.629642e+07,Amazing experience. The house was also recentl...,amaze experience house also recently upgrade w...
7,22746714,35,1.0,4.57,North El Monte,2.274671e+07,Nice neighborhood and hosts. Great location. T...,nice neighborhood host great location host res...
...,...,...,...,...,...,...,...,...
13862,575384126844892676,159,1.0,4.67,West Hollywood,5.753841e+17,"Super host, thank you! This apartment was perf...",super host thank apartment perfect work stay s...
13863,16072625,177,1.0,4.14,East Hollywood,1.607262e+07,Great location. And overall a decent stay. My ...,great location overall decent stay big issue p...
13864,924091269757225413,120,1.0,1.00,Beverly Hills,9.240913e+17,"Since they sent me a review, I guess I’m revie...",since send review guess review confirm reserva...
13867,720164781296601135,175,1.0,4.75,West Hollywood,7.201648e+17,"Great location, great host, beautiful apartmen...",great location great host beautiful apartment ...
