# Airbnb - Reviews & Price NLP

In [2]:
import pandas as pd
import numpy as np
import nltk
nltk.download('stopwords')

from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tag import pos_tag
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsClassifier
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report


[nltk_data] Downloading package stopwords to /Users/jf/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
#load in rviews and listings data
la_rev = pd.read_csv('../data/la_reviews.csv')
la_list = pd.read_csv('../data/la_listings.csv')

## 'Reviews' DF cleaning

In [4]:
la_rev

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,8941071,68391055,2016-04-04,10164333,Smruti,"Danielle was a great host, she was extremely r..."
1,8941071,153719836,2017-05-21,97944097,Rob,The apartment was great for us to spend the we...
2,8941071,147589354,2017-04-27,4123723,Widya,"Danielle is a great host, very concerned with..."
3,8941071,145742425,2017-04-19,1459499,Darian,Great location and spacious. Danielle's place ...
4,8941071,144400833,2017-04-15,98494277,Charlie,"Danielle's place was as expected, really good ..."
...,...,...,...,...,...,...
1532920,837764720715019063,882048452895718293,2023-05-01,61709926,Xana,Alexis and David were incredible hosts - frien...
1532921,837764720715019063,866817204898142401,2023-04-10,20783675,Priscila,Such an amazing and beautiful place! David and...
1532922,837764720715019063,864618654800651449,2023-04-07,407565907,Petra,The owners were very kind and helpful. Beautif...
1532923,837764720715019063,850814630932374093,2023-03-19,129150527,Mike,"Beautiful space and view, very friendly hosts,..."


In [5]:
la_rev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1532925 entries, 0 to 1532924
Data columns (total 6 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   listing_id     1532925 non-null  int64 
 1   id             1532925 non-null  int64 
 2   date           1532925 non-null  object
 3   reviewer_id    1532925 non-null  int64 
 4   reviewer_name  1532925 non-null  object
 5   comments       1532639 non-null  object
dtypes: int64(3), object(3)
memory usage: 70.2+ MB


In [6]:
#checking for nulls
la_rev['comments'].isna().sum()

286

In [7]:
#visualizing nulls
la_rev[la_rev['comments'].isna()]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
11041,12232198,597570153,2020-01-25,109869097,Larry,
36262,18771571,499592244,2019-07-31,200662007,Lilyan,
38689,19966810,428734333,2019-03-25,248109683,Danielle,
39068,20012997,500736816306141481,2021-11-21,78355861,Jill,
40846,19434972,486204261023070433,2021-11-01,110160469,Farnaz,
...,...,...,...,...,...,...
1511887,16661893,621560353,2020-04-01,342670536,Al,
1514989,46444583,868264023745043602,2023-04-12,23149537,Flannery,
1517560,19332411,222064868,2017-12-28,113707624,Betsy,
1524215,30460122,561530122,2019-11-09,207881679,Bianca,


In [8]:
#dropping null values
la_rev.dropna(subset=['comments'], inplace=True)

In [9]:
la_rev.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1532639 entries, 0 to 1532924
Data columns (total 6 columns):
 #   Column         Non-Null Count    Dtype 
---  ------         --------------    ----- 
 0   listing_id     1532639 non-null  int64 
 1   id             1532639 non-null  int64 
 2   date           1532639 non-null  object
 3   reviewer_id    1532639 non-null  int64 
 4   reviewer_name  1532639 non-null  object
 5   comments       1532639 non-null  object
dtypes: int64(3), object(3)
memory usage: 81.9+ MB


In [10]:
#checking separate rewviews for s specific listing_id BEFORE grouping all review by listing_id
la_rev[la_rev['listing_id'] == 109]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
497553,109,74506539,2016-05-15,22509885,Jenn,Me and two friends stayed for four and a half ...
497554,109,449036,2011-08-15,927861,Edwin,The host canceled my reservation the day befor...


In [11]:
#grouping all reviews text by 'listing_id'
la_rev_con = la_rev.groupby(['listing_id'], as_index=False).agg({'comments': " ".join})

In [12]:
la_rev_con

Unnamed: 0,listing_id,comments
0,109,Me and two friends stayed for four and a half ...
1,2708,Charles is the man!! Just wrapped up an amazin...
2,2732,"Unfortunately, I was really disappointed with ..."
3,6033,Sarah was a great host. She was always quick t...
4,6931,The best host and best stay I've ever had with...
...,...,...
32956,968513441909611726,BEAUTIFUL HOME. GREAT LOCATION. AWESOME SUPER ...
32957,969535403681694277,We had a perfect time at Sean’s cottage. It wa...
32958,969626715256159808,Kelly was communicative and super responsive. ...
32959,970252209631292696,Such a cute spot in a nice neighborhood… check...


In [13]:
#sanity check: (cross-referencing with listing_id 109 above
la_rev_con['comments'][0]

"Me and two friends stayed for four and a half months. It was a great place to stay! The apartment was very comfortable and I really enjoyed having the park with running path across the street. The only downside was it wasn't within walking distance to restaurants, bars, or coffee shops. But they are a short drive away. Overall, great stay! The host canceled my reservation the day before arrival."

## 'Listings' DF cleaning

In [138]:
la_list.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44594 entries, 0 to 44593
Data columns (total 75 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            44594 non-null  int64  
 1   listing_url                                   44594 non-null  object 
 2   scrape_id                                     44594 non-null  int64  
 3   last_scraped                                  44594 non-null  object 
 4   source                                        44594 non-null  object 
 5   name                                          44594 non-null  object 
 6   description                                   43937 non-null  object 
 7   neighborhood_overview                         25053 non-null  object 
 8   picture_url                                   44594 non-null  object 
 9   host_id                                       44594 non-null 

In [139]:
la_list[la_list['id'] == 8941071]

Unnamed: 0,id,listing_url,scrape_id,last_scraped,source,name,description,neighborhood_overview,picture_url,host_id,...,review_scores_communication,review_scores_location,review_scores_value,license,instant_bookable,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
4276,8941071,https://www.airbnb.com/rooms/8941071,20230903194229,2023-09-03,previous scrape,Rental unit in West Hollywood · ★4.81 · 1 bedr...,"Perfect heart of the city feeling, short walk ...",,https://a0.muscache.com/pictures/ba9498f2-815d...,23592617,...,5.0,4.95,4.84,,f,1,1,0,0,0.24


In [140]:
la_list['review_scores_rating'].value_counts()

5.00    10388
4.50      838
4.00      778
4.67      755
4.88      728
        ...  
3.56        1
2.75        1
3.14        1
3.96        1
4.03        1
Name: review_scores_rating, Length: 157, dtype: int64

In [141]:
#condensing df to potential features of interest for NLP
la_small = la_list[['id', 'price', 'bedrooms', 'review_scores_rating', 'neighbourhood_cleansed']]

In [142]:
la_small['bedrooms'].value_counts()

1.0     13872
2.0      8313
3.0      5008
4.0      2494
5.0      1008
6.0       334
7.0       124
8.0        40
9.0        20
10.0       12
11.0        6
14.0        4
12.0        4
13.0        3
18.0        2
20.0        1
23.0        1
16.0        1
32.0        1
24.0        1
19.0        1
15.0        1
Name: bedrooms, dtype: int64

In [143]:
# Filtering for 1 bedroom properties - Analysis only concerned with 1 bedroom Airbnb listings
df_la_1bd = la_small[la_small['bedrooms'] == 1.0]

In [144]:
# removing decimals from price points
df_la_1bd['price'] = df_la_1bd['price'].str.split('.').str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_la_1bd['price'] = df_la_1bd['price'].str.split('.').str[0]


In [145]:
#removing commas from price points
df_la_1bd['price'] = df_la_1bd['price'].str.replace(",", "")
df_la_1bd['price']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_la_1bd['price'] = df_la_1bd['price'].str.replace(",", "")


2         $69
3        $120
8        $201
11        $88
15        $60
         ... 
44586    $175
44587    $194
44588    $180
44590    $168
44593    $480
Name: price, Length: 13872, dtype: object

In [146]:
#removing $ signs from price points
df_la_1bd['price'] = df_la_1bd['price'].str.replace('$', '')
df_la_1bd

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_la_1bd['price'] = df_la_1bd['price'].str.replace('$', '')


Unnamed: 0,id,price,bedrooms,review_scores_rating,neighbourhood_cleansed
2,942342470598573002,69,1.0,,Rowland Heights
3,41240375,120,1.0,5.00,Playa del Rey
8,15239926,201,1.0,4.99,Santa Clarita
11,14821183,88,1.0,3.00,Diamond Bar
15,18976122,60,1.0,,Fairfax
...,...,...,...,...,...
44586,720164781296601135,175,1.0,4.75,West Hollywood
44587,960073745720655216,194,1.0,,Downtown
44588,680301812424175952,180,1.0,4.50,Downtown
44590,674665451875208878,168,1.0,,Downtown


## Merge 'Reviews' and 'Listings'

In [147]:
# merge 'reviews' df with filtered 'listings' df
df_merge = df_la_1bd.merge(la_rev_con, how='left', left_on='id', right_on='listing_id')

In [148]:
df_merge

Unnamed: 0,id,price,bedrooms,review_scores_rating,neighbourhood_cleansed,listing_id,comments
0,942342470598573002,69,1.0,,Rowland Heights,,
1,41240375,120,1.0,5.00,Playa del Rey,4.124038e+07,Paola is the best host I have ever had. She ha...
2,15239926,201,1.0,4.99,Santa Clarita,1.523993e+07,Fantastic super hosts and space . What a beaut...
3,14821183,88,1.0,3.00,Diamond Bar,1.482118e+07,Quiet house. Bedroom is enough for sleeping. u...
4,18976122,60,1.0,,Fairfax,,
...,...,...,...,...,...,...,...
13867,720164781296601135,175,1.0,4.75,West Hollywood,7.201648e+17,"Great location, great host, beautiful apartmen..."
13868,960073745720655216,194,1.0,,Downtown,,
13869,680301812424175952,180,1.0,4.50,Downtown,6.803018e+17,Very convenient and comfortable. It had everyt...
13870,674665451875208878,168,1.0,,Downtown,,


In [None]:
#check for numerical values in text data
df_merge[df_merge['comments'].str.isnumeric() == True]

In [149]:
#checking for nulls in reviews
df_merge['comments'].isna().sum()

4003

In [150]:
#dropping nulls from reviews
df_merge.dropna(subset=['comments'], inplace=True)

In [151]:
df_merge.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9869 entries, 1 to 13869
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      9869 non-null   int64  
 1   price                   9869 non-null   object 
 2   bedrooms                9869 non-null   float64
 3   review_scores_rating    9869 non-null   float64
 4   neighbourhood_cleansed  9869 non-null   object 
 5   listing_id              9869 non-null   float64
 6   comments                9869 non-null   object 
dtypes: float64(3), int64(1), object(3)
memory usage: 616.8+ KB


## Pre-Processing Text

In [152]:
#instantiate tokenizer
#regex pattern returns words or 3 or more characters and drops all non-english characters
token_pattern = r"(?u)\w{3,}|/[^\x00-\x7F]+/"
tokenizer = RegexpTokenizer(token_pattern)

#create a list of stopwords
stopwords_list = stopwords.words('english')

In [153]:
#create function to pre-process text
def preprocess_text(text, tokenizer, stopwords_list):
    # Standardize case (lowercase the text)
    text_std = text.lower()
    # Tokenize
    token_list = tokenizer.tokenize(text_std)
    # Remove stopwords
    stopwords_removed = [token for token in token_list if token not in stopwords_list]
    return stopwords_removed
   

In [154]:
#pre-process the text data with 'preprocess_text' function
reviews_proc = df_merge.comments.apply(lambda x: preprocess_text(x, tokenizer, stopwords_list))
reviews_proc

1        [paola, best, host, ever, taken, care, every, ...
2        [fantastic, super, hosts, space, beautiful, pl...
3        [quiet, house, bedroom, enough, sleeping, uncl...
6        [amazing, experience, house, also, recently, u...
7        [nice, neighborhood, hosts, great, location, h...
                               ...                        
13862    [super, host, thank, apartment, perfect, work,...
13863    [great, location, overall, decent, stay, bigge...
13864    [since, sent, review, guess, reviewing, confir...
13867    [great, location, great, host, beautiful, apar...
13869    [convenient, comfortable, everything, could, n...
Name: comments, Length: 9869, dtype: object

In [168]:
reviews_proc[1]

['paola',
 'best',
 'host',
 'ever',
 'taken',
 'care',
 'every',
 'detail',
 'always',
 'looking',
 'comfort',
 'thee',
 'condo',
 'great',
 'garage',
 'parking',
 'laundry',
 'room',
 'site',
 'works',
 'perfectly',
 'cheap',
 'easy',
 'use',
 'app',
 'appliances',
 'new',
 'kitchenware',
 'complete',
 'ever',
 'seen',
 'arbnb',
 'condo',
 'pretty',
 'good',
 'size',
 'comes',
 'working',
 'corner',
 'screen',
 'multifunction',
 'printer',
 'balcony',
 'really',
 'cute',
 'good',
 'size',
 'perfect',
 'place',
 'read',
 'simply',
 'rest',
 'armchairs',
 'furniture',
 'good',
 'quality',
 'everything',
 'makes',
 'place',
 'cozy',
 'neighborhood',
 'fantastic',
 'safe',
 'truly',
 'minutes',
 'walk',
 'beach',
 'minutes',
 'walk',
 'nearest',
 'food',
 'store',
 'howe',
 'handy',
 'best',
 'paola',
 'landlady',
 'always',
 'take',
 'care',
 'make',
 'stay',
 'comfortable',
 'unforgettable',
 'highly',
 'recommend',
 'condo',
 'paola']

### Tag & Lemmatize

In [161]:
#Create Lemmatizer
lemmatizer = WordNetLemmatizer()

In [162]:
#Map POS tag to first character for use in WordNetLemmatizer
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to NOUN

In [163]:
#POS Tagging
tagged_text = reviews_proc.apply(lambda x: pos_tag(x))

In [164]:
tagged_text

1        [(paola, NN), (best, JJS), (host, NN), (ever, ...
2        [(fantastic, JJ), (super, JJ), (hosts, NNS), (...
3        [(quiet, JJ), (house, NN), (bedroom, NN), (eno...
6        [(amazing, VBG), (experience, NN), (house, NN)...
7        [(nice, JJ), (neighborhood, NN), (hosts, NNS),...
                               ...                        
13862    [(super, NN), (host, NN), (thank, NN), (apartm...
13863    [(great, JJ), (location, NN), (overall, JJ), (...
13864    [(since, IN), (sent, VBN), (review, NN), (gues...
13867    [(great, JJ), (location, NN), (great, JJ), (ho...
13869    [(convenient, NN), (comfortable, JJ), (everyth...
Name: comments, Length: 9869, dtype: object

In [171]:
#Lemmatize the processed text
processed_rev = tagged_text.apply(lambda x: [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in x])

In [172]:
#create new column in df with the processed reviews
df_merge['processed_reviews'] = processed_rev
df_merge

Unnamed: 0,id,price,bedrooms,review_scores_rating,neighbourhood_cleansed,listing_id,comments,processed_reviews
1,41240375,120,1.0,5.00,Playa del Rey,4.124038e+07,Paola is the best host I have ever had. She ha...,"[paola, best, host, ever, take, care, every, d..."
2,15239926,201,1.0,4.99,Santa Clarita,1.523993e+07,Fantastic super hosts and space . What a beaut...,"[fantastic, super, host, space, beautiful, pla..."
3,14821183,88,1.0,3.00,Diamond Bar,1.482118e+07,Quiet house. Bedroom is enough for sleeping. u...,"[quiet, house, bedroom, enough, sleep, unclean]"
6,26296415,180,1.0,5.00,Torrance,2.629642e+07,Amazing experience. The house was also recentl...,"[amaze, experience, house, also, recently, upg..."
7,22746714,35,1.0,4.57,North El Monte,2.274671e+07,Nice neighborhood and hosts. Great location. T...,"[nice, neighborhood, host, great, location, ho..."
...,...,...,...,...,...,...,...,...
13862,575384126844892676,159,1.0,4.67,West Hollywood,5.753841e+17,"Super host, thank you! This apartment was perf...","[super, host, thank, apartment, perfect, work,..."
13863,16072625,177,1.0,4.14,East Hollywood,1.607262e+07,Great location. And overall a decent stay. My ...,"[great, location, overall, decent, stay, big, ..."
13864,924091269757225413,120,1.0,1.00,Beverly Hills,9.240913e+17,"Since they sent me a review, I guess I’m revie...","[since, send, review, guess, review, confirm, ..."
13867,720164781296601135,175,1.0,4.75,West Hollywood,7.201648e+17,"Great location, great host, beautiful apartmen...","[great, location, great, host, beautiful, apar..."


In [173]:
#convert token lists to strings
df_merge['processed_reviews'] = df_merge['processed_reviews'].str.join(' ')

In [174]:
df_merge

Unnamed: 0,id,price,bedrooms,review_scores_rating,neighbourhood_cleansed,listing_id,comments,processed_reviews
1,41240375,120,1.0,5.00,Playa del Rey,4.124038e+07,Paola is the best host I have ever had. She ha...,paola best host ever take care every detail al...
2,15239926,201,1.0,4.99,Santa Clarita,1.523993e+07,Fantastic super hosts and space . What a beaut...,fantastic super host space beautiful place sta...
3,14821183,88,1.0,3.00,Diamond Bar,1.482118e+07,Quiet house. Bedroom is enough for sleeping. u...,quiet house bedroom enough sleep unclean
6,26296415,180,1.0,5.00,Torrance,2.629642e+07,Amazing experience. The house was also recentl...,amaze experience house also recently upgrade w...
7,22746714,35,1.0,4.57,North El Monte,2.274671e+07,Nice neighborhood and hosts. Great location. T...,nice neighborhood host great location host res...
...,...,...,...,...,...,...,...,...
13862,575384126844892676,159,1.0,4.67,West Hollywood,5.753841e+17,"Super host, thank you! This apartment was perf...",super host thank apartment perfect work stay s...
13863,16072625,177,1.0,4.14,East Hollywood,1.607262e+07,Great location. And overall a decent stay. My ...,great location overall decent stay big issue p...
13864,924091269757225413,120,1.0,1.00,Beverly Hills,9.240913e+17,"Since they sent me a review, I guess I’m revie...",since send review guess review confirm reserva...
13867,720164781296601135,175,1.0,4.75,West Hollywood,7.201648e+17,"Great location, great host, beautiful apartmen...",great location great host beautiful apartment ...


In [None]:
df_merge_processed = df_merge

In [None]:
#create csv of processed reviews df with initial regex pattern
df_merge_processed.to_csv('../data/processed_reviews.csv')

In [175]:
df_merge_processed2 = df_merge

In [176]:
#create csv of processed reviews df with NEW regex pattern
df_merge_processed2.to_csv('../data/processed_reviews2.csv')

## Modeling

In [76]:
df_processed = pd.read_csv('../data/processed_reviews.csv')
df_processed

Unnamed: 0.1,Unnamed: 0,id,price,bedrooms,review_scores_rating,neighbourhood_cleansed,listing_id,comments,processed_reviews
0,1,41240375,120.0,1.0,5.00,Playa del Rey,4.124038e+07,Paola is the best host I have ever had. She ha...,paola best host ever take care every detail al...
1,2,15239926,201.0,1.0,4.99,Santa Clarita,1.523993e+07,Fantastic super hosts and space . What a beaut...,fantastic super host space beautiful place sta...
2,3,14821183,88.0,1.0,3.00,Diamond Bar,1.482118e+07,Quiet house. Bedroom is enough for sleeping. u...,quiet house bedroom enough sleep unclean
3,6,26296415,180.0,1.0,5.00,Torrance,2.629642e+07,Amazing experience. The house was also recentl...,amaze experience house also recently upgrade w...
4,7,22746714,35.0,1.0,4.57,North El Monte,2.274671e+07,Nice neighborhood and hosts. Great location. T...,nice neighborhood host great location host res...
...,...,...,...,...,...,...,...,...,...
9866,13862,575384126844892676,159.0,1.0,4.67,West Hollywood,5.753841e+17,"Super host, thank you! This apartment was perf...",super host thank apartment perfect work stay s...
9867,13863,16072625,177.0,1.0,4.14,East Hollywood,1.607262e+07,Great location. And overall a decent stay. My ...,great location overall decent stay big issue p...
9868,13864,924091269757225413,120.0,1.0,1.00,Beverly Hills,9.240913e+17,"Since they sent me a review, I guess I’m revie...",since send review guess review confirm reserva...
9869,13867,720164781296601135,175.0,1.0,4.75,West Hollywood,7.201648e+17,"Great location, great host, beautiful apartmen...",great location great host beautiful apartment ...


In [78]:
#checking for nulls
df_processed.isna().sum()

Unnamed: 0                 0
id                         0
price                      2
bedrooms                   2
review_scores_rating       2
neighbourhood_cleansed     2
listing_id                 2
comments                   2
processed_reviews         11
dtype: int64

In [79]:
#dropping nulls
df_processed.dropna(inplace=True)
df_processed.isna().sum()

Unnamed: 0                0
id                        0
price                     0
bedrooms                  0
review_scores_rating      0
neighbourhood_cleansed    0
listing_id                0
comments                  0
processed_reviews         0
dtype: int64

In [80]:
#convert 'price' column to int
df_processed['price'] = df_processed['price'].astype(int)

In [81]:
#train test split 
X = df_processed["processed_reviews"]
y = df_processed["price"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

#sanity check shape
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7395,), (2465,), (7395,), (2465,))

## Linear Regression

In [64]:
#instantiate simple model 
pipe_lr = Pipeline([("vec", CountVectorizer()), ("lreg", LinearRegression())])

In [67]:
from sklearn.model_selection import cross_validate

In [68]:
scores = cross_validate(pipe_lr, X_train, y_train, cv=3, 
                        scoring=('r2', 'neg_mean_squared_error'), 
                        return_train_score=True)

In [71]:
scores

{'fit_time': array([878.05117702, 591.76208997, 739.64259696]),
 'score_time': array([2.25494194, 1.80523682, 2.06091213]),
 'test_r2': array([ -178.56458962, -2651.70339649, -5278.74265526]),
 'train_r2': array([0.99903127, 0.99930777, 0.99951327]),
 'test_neg_mean_squared_error': array([-7.30324878e+08, -9.05486962e+09, -7.63545058e+08]),
 'train_neg_mean_squared_error': array([-1723.67717788, -1457.95388423, -1820.54631292])}

## Additional Models & GridSearch

In [74]:
#Instantiate various pipelines
pipe_gbr = Pipeline([("vec", TfidfVectorizer()), ("gbr", GradientBoostingRegressor(random_state=42))])
pipe_ridge = Pipeline([("vec", TfidfVectorizer()), ("rdg", Ridge(random_state=42))])
pipe_dt = Pipeline([("vec", TfidfVectorizer()), ("dt", DecisionTreeClassifier(random_state=42))])
pipe_rf = Pipeline([("vec", TfidfVectorizer()), ("rf", RandomForestClassifier(random_state=42))])
pipe_knn = Pipeline([("vec", TfidfVectorizer()), ("knn", KNeighborsClassifier())])


In [75]:
grid_dt = {'dt__max_depth': ['None', 2, 5, 10],
       'dt__min_samples_split': [2, 5],
       'vec__ngram_range': [(1,1), (1,2)],
        'vec__max_df': [.8, .9, .99],
        'vec__min_df': [.01, .05]}
gs = GridSearchCV(estimator=pipe_dt, param_grid=grid_dt, verbose=2)
gs.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits




[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.8s
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.4s
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.5s
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.4s
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.3s
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  14.0s
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  14.2s
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8

[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.4s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.4s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.2s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  13.3s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  13.0s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  13.2s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  13.3s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8

[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   8.1s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  21.5s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  22.0s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  21.4s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  21.2s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  21.9s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   7.9s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec_

[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  22.0s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  21.6s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  21.6s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   8.3s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   8.4s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   8.3s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   8.2s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec_

[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=  10.6s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=  10.5s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=  10.5s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=  10.5s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=  10.5s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  24.2s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  24.2s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec_

[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=  10.7s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=  10.5s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  24.6s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  24.9s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  24.4s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  24.6s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  24.4s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec_

[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  35.3s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  34.9s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  35.1s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  34.4s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=  19.6s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=  20.5s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=  20.2s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.

[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  32.7s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=  18.2s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=  17.9s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=  18.0s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=  18.0s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=  17.8s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  44.5s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.

120 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jf/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jf/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/jf/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/jf/anaconda3/envs/learn-env/lib/python3.8/site

## Converting Tagret Variable ('Price') to Discrete

In [85]:
#create a column that sorts price based on categories of ranges of price
conditions = [
    (df_processed['price'] <= 100),
    (df_processed['price'] > 100) & (df_processed['price'] <= 150),
    (df_processed['price'] > 150) & (df_processed['price'] <= 200),
    (df_processed['price'] > 200) & (df_processed['price'] <= 250),
    (df_processed['price'] > 250) & (df_processed['price'] <= 300),
    (df_processed['price'] > 300)]

#create a list of the values we want to assign for each condition
values = [1, 2, 3, 4, 5, 6]

#create a new column and use np.select to assign values to it using our lists as arguments
df_processed['price_range'] = np.select(conditions, values)
df_processed

Unnamed: 0.1,Unnamed: 0,id,price,bedrooms,review_scores_rating,neighbourhood_cleansed,listing_id,comments,processed_reviews,price_range
0,1,41240375,120,1.0,5.00,Playa del Rey,4.124038e+07,Paola is the best host I have ever had. She ha...,paola best host ever take care every detail al...,2
1,2,15239926,201,1.0,4.99,Santa Clarita,1.523993e+07,Fantastic super hosts and space . What a beaut...,fantastic super host space beautiful place sta...,4
2,3,14821183,88,1.0,3.00,Diamond Bar,1.482118e+07,Quiet house. Bedroom is enough for sleeping. u...,quiet house bedroom enough sleep unclean,1
3,6,26296415,180,1.0,5.00,Torrance,2.629642e+07,Amazing experience. The house was also recentl...,amaze experience house also recently upgrade w...,3
4,7,22746714,35,1.0,4.57,North El Monte,2.274671e+07,Nice neighborhood and hosts. Great location. T...,nice neighborhood host great location host res...,1
...,...,...,...,...,...,...,...,...,...,...
9866,13862,575384126844892676,159,1.0,4.67,West Hollywood,5.753841e+17,"Super host, thank you! This apartment was perf...",super host thank apartment perfect work stay s...,3
9867,13863,16072625,177,1.0,4.14,East Hollywood,1.607262e+07,Great location. And overall a decent stay. My ...,great location overall decent stay big issue p...,3
9868,13864,924091269757225413,120,1.0,1.00,Beverly Hills,9.240913e+17,"Since they sent me a review, I guess I’m revie...",since send review guess review confirm reserva...,2
9869,13867,720164781296601135,175,1.0,4.75,West Hollywood,7.201648e+17,"Great location, great host, beautiful apartmen...",great location great host beautiful apartment ...,3


# SIMPLE MODEL

**train test split with 'price' as a discrete target**

In [107]:
#train test split
X = df_processed["processed_reviews"]
y = df_processed["price_range"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7395,), (2465,), (7395,), (2465,))

## Baseline Model

In [94]:
#instantiate pipeline for basic Lo
pipe_logr_bas = Pipeline([("vec", CountVectorizer()), ("lreg", LogisticRegression(max_iter=10000))])

In [95]:
#cross-validate & return scores
scores = cross_validate(pipe_logr_range, X_train, y_train, 
                        cv=3, return_train_score=True)
scores

{'fit_time': array([163.30727196, 173.14294314, 158.89530182]),
 'score_time': array([1.86575103, 2.10764313, 2.17601728]),
 'test_score': array([0.40040568, 0.40081136, 0.39553753]),
 'train_score': array([0.97667343, 0.97931034, 0.97565923])}

### Logistic Regression - GridSearch

In [100]:
pipe_logr = Pipeline([("vec", TfidfVectorizer()), ("lreg", LogisticRegression())])

In [102]:
#Grid search for hyperparameter tuning the Logistic Regression

#Define parameter grid for the vectorizer and logistic regression
log_param_grid = {
    'vec__min_df': [.01, .02, .03],
    'vec__max_df': [.88, .90, .92],
    'vec__ngram_range': [(1,1), (1,2), (1,3)],
    'lreg__penalty': [None, 'l2', 'l1'],
    'lreg__class_weight': [None, 'balanced'],
    'lreg__solver': ['lbfgs', 'liblinear'],
    'lreg__max_iter': [1000, 10000]
}


log_grid = GridSearchCV(pipe_logr, log_param_grid, cv=5, n_jobs=-2, verbose=1, scoring='accuracy')
log_grid.fit(X_train, y_train)

Fitting 5 folds for each of 648 candidates, totalling 3240 fits


1080 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
540 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jf/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jf/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/jf/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/jf/anaconda3/envs/learn-env/lib/python3.8/si

In [106]:
log_grid.best_params_

{'lreg__class_weight': None,
 'lreg__max_iter': 1000,
 'lreg__penalty': 'l2',
 'lreg__solver': 'lbfgs',
 'vec__max_df': 0.9,
 'vec__min_df': 0.03,
 'vec__ngram_range': (1, 3)}

In [105]:
log_grid.best_score_

0.45990534144692363

**5% improvement in tuned logistic regression model with Tf-Idf vectorizer**

## Decision Tree Classifier

In [113]:
pipe_dt = Pipeline([("vec", TfidfVectorizer()), ("dt", DecisionTreeClassifier(random_state=42))])

In [114]:
#set up parameter grid
grid_dt = {'dt__max_depth': ['None', 2, 5, 10],
       'dt__min_samples_split': [2, 5],
       'vec__ngram_range': [(1,1), (1,2)],
        'vec__max_df': [.8, .9, .99],
        'vec__min_df': [.01, .05]}
#instantiate Grid Search for decision tree and vectorizer
gs = GridSearchCV(estimator=pipe_dt, param_grid=grid_dt, verbose=2)

#fit it to training data
gs.fit(X_train, y_train)

Fitting 5 folds for each of 96 candidates, totalling 480 fits
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.7s
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.6s
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.5s
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.5s
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.3s
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  13.7s
[CV] END dt__max_depth=None, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  14.1s
[CV] EN

[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.4s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.6s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.4s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   4.3s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  13.5s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  13.4s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  13.6s
[CV] END dt__max_depth=None, dt__min_samples_split=5, vec__max_df=0.8

[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   6.0s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   5.9s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  17.5s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  17.6s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  17.6s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  17.1s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  17.4s
[CV] END dt__max_depth=2, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec_

[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  17.2s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  16.8s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  16.7s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  16.8s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   5.8s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   5.8s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   5.8s
[CV] END dt__max_depth=2, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec_

[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.01, vec__ngram_range=(1, 2); total time=  19.3s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   6.6s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   6.9s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   7.0s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   6.6s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   6.6s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  17.6s
[CV] END dt__max_depth=5, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec_

[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   6.5s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   6.4s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 1); total time=   6.4s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  17.6s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  17.4s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  17.9s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  18.0s
[CV] END dt__max_depth=5, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec_

[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  21.9s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  20.9s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  21.1s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  21.6s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  20.9s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   9.4s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   9.2s
[CV] END dt__max_depth=10, dt__min_samples_split=2, vec__max_df=0.9, vec__min_df=0.

[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  21.6s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.8, vec__min_df=0.05, vec__ngram_range=(1, 2); total time=  20.4s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   9.0s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   9.0s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   9.0s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   8.9s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.01, vec__ngram_range=(1, 1); total time=   9.1s
[CV] END dt__max_depth=10, dt__min_samples_split=5, vec__max_df=0.9, vec__min_df=0.

120 fits failed out of a total of 480.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
120 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/jf/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/jf/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/jf/anaconda3/envs/learn-env/lib/python3.8/site-packages/sklearn/pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/Users/jf/anaconda3/envs/learn-env/lib/python3.8/site

In [120]:
y_train.value_counts()

2    2576
1    2166
3    1407
4     610
6     341
5     295
Name: price_range, dtype: int64

In [121]:
X_train.value_counts()

host cancel reservation day arrival automate post                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

## Modeling - New Regex Pattern

In [177]:
df_processed2 = pd.read_csv('../data/processed_reviews2.csv')
df_processed2

Unnamed: 0.1,Unnamed: 0,id,price,bedrooms,review_scores_rating,neighbourhood_cleansed,listing_id,comments,processed_reviews
0,1,41240375,120.0,1.0,5.00,Playa del Rey,4.124038e+07,Paola is the best host I have ever had. She ha...,paola best host ever take care every detail al...
1,2,15239926,201.0,1.0,4.99,Santa Clarita,1.523993e+07,Fantastic super hosts and space . What a beaut...,fantastic super host space beautiful place sta...
2,3,14821183,88.0,1.0,3.00,Diamond Bar,1.482118e+07,Quiet house. Bedroom is enough for sleeping. u...,quiet house bedroom enough sleep unclean
3,6,26296415,180.0,1.0,5.00,Torrance,2.629642e+07,Amazing experience. The house was also recentl...,amaze experience house also recently upgrade w...
4,7,22746714,35.0,1.0,4.57,North El Monte,2.274671e+07,Nice neighborhood and hosts. Great location. T...,nice neighborhood host great location host res...
...,...,...,...,...,...,...,...,...,...
9866,13862,575384126844892676,159.0,1.0,4.67,West Hollywood,5.753841e+17,"Super host, thank you! This apartment was perf...",super host thank apartment perfect work stay s...
9867,13863,16072625,177.0,1.0,4.14,East Hollywood,1.607262e+07,Great location. And overall a decent stay. My ...,great location overall decent stay big issue p...
9868,13864,924091269757225413,120.0,1.0,1.00,Beverly Hills,9.240913e+17,"Since they sent me a review, I guess I’m revie...",since send review guess review confirm reserva...
9869,13867,720164781296601135,175.0,1.0,4.75,West Hollywood,7.201648e+17,"Great location, great host, beautiful apartmen...",great location great host beautiful apartment ...
