<p style="font-family:'Times New Roman'; font-size:250%; text-align:center; font-weight:bold;">Text Analytics on Airbnb Reviews</p>
<br>
<p style="font-family:'Times New Roman'; font-size: 150%; text-align:right; font-weight:bold;"></p>

Archive data of Asheville data taken from http://insideairbnb.com/get-the-data.html. Compiled Date = '28 September, 2019'

# Import Statements

In [1]:
import pandas as pd
import numpy as np
import gzip

import re
from nltk.tokenize import word_tokenize # tokenize a document into words
from nltk.tokenize import sent_tokenize # tokenize a document into sentences
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer("english")

import nltk

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from nltk import FreqDist
import spacy
import gensim
from gensim import corpora
from gensim.models import CoherenceModel

# libraries for Text Clustering visualization
import pyLDAvis
import pyLDAvis.gensim

from pprint import pprint
import os
import glob # finds all the pathnames matching a specified pattern according to the rules used by the Unix shell

import matplotlib.pyplot as plt
import seaborn as sns

import string
from datetime import datetime

import itertools # Useful for changing a 2D to 1D array - for creating a vocabulary of words from all the reviews
from sklearn.feature_extraction.text import CountVectorizer

# Seed for reproducibility
SEED = 1

# Reviews

In [2]:
reviews = pd.read_csv('.\\Asheville\\reviews.csv')

In [3]:
reviews.shape

(159010, 6)

## Dealing with missing values

In [4]:
total = reviews.isnull().sum().sort_values(ascending=False)
percent = (reviews.isnull().sum()/reviews.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
comments,41,0.000258
reviewer_name,1,6e-06
reviewer_id,0,0.0
date,0,0.0
id,0,0.0
listing_id,0,0.0


In [5]:
reviews.dropna(subset=['comments'], inplace=True)
reviews.shape

(158969, 6)

In [6]:
# to view full content of all cells in the dataframe
# pd.set_option('display.max_colwidth', -1)

In [7]:
reviews.head(1)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,38585,129120,2010-10-28,55877,Ritchie,Evelyne is an accommodating host who lives in ...


## Grouping reviews by listing_id

### Finding Number of reviews for each listing

In [8]:
reviews1 = reviews.groupby(["listing_id"], as_index=False).size().to_frame(name = 'review_count').reset_index()
reviews1.iloc[[0,-1]]

Unnamed: 0,listing_id,review_count
0,38585,134
2122,38616361,1


### Grouping review texts for each listing

In [9]:
reviews[reviews.listing_id == 38585].head(5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,38585,129120,2010-10-28,55877,Ritchie,Evelyne is an accommodating host who lives in ...
1,38585,147273,2010-11-30,279973,Cathy,Evelyne was very welcoming to her home; my fri...
2,38585,198797,2011-03-14,411638,,I really enjoyed Evelyne's welcoming and bubbl...
3,38585,201932,2011-03-17,441855,Bill,Very gracious host and was helpful in all aspe...
4,38585,341616,2011-06-28,657560,Joakim,Evelyn was very friendly and easy to comunicat...


In [10]:
reviews['review'] = reviews[['listing_id','comments']].groupby(['listing_id'])['comments'].transform(lambda x: ','.join(x))

In [11]:
reviews.listing_id.nunique()

2123

After the grouping the number of records will be reduced from 158969 to 2123.

In [12]:
reviews = reviews[['listing_id', 'review']].drop_duplicates()
reviews

Unnamed: 0,listing_id,review
0,38585,Evelyne is an accommodating host who lives in ...
134,80905,Celeste is an accommodating and gracious host....
233,108061,"Lisa is superb hostess, she will treat you lik..."
320,155305,We had a wonderful time! The cottage was very ...
539,156805,"Cool place. Nice folks. Good location. ,They w..."
599,156926,Arrived around 4PM and greeted very warmly. Re...
863,160594,My husband and I had a wonderful time with Eli...
921,189916,Amy and Ken were fabulous hosts. They had lots...
1064,209068,Absolutely enjoyed our stay w/ Kevin and Anne....
1117,213006,I had an awesome time at Susanne's place! Her ...


In [13]:
reviews.shape

(2123, 2)

In [14]:
reviews.iloc[0,1]

'Evelyne is an accommodating host who lives in a beautiful rural area just outside Asheville. I recommend staying with her if you are looking for some peace and quiet in a natural setting.,Evelyne was very welcoming to her home; my friend and I enjoyed her company as well as our trip to asheville.,I really enjoyed Evelyne\'s welcoming and bubbly personality and was more "wowed" by the beautiful house and scenery than I expected to be.   Even Zoe was good company.  Also enjoyed the organic theme and left feeling more vibrant and healthier.  I would definitely stay again.  Thanks Evelyne!!,Very gracious host and was helpful in all aspects of finding her home and information about Ashville. ,Evelyn was very friendly and easy to comunicate with. the hous was very clean and he made us breakfast with tea from her own garden wich tasted really good. Nice stay.,Evelyne was a gracious, inviting, friendly hostess, well-informed about places to eat and what to do in Asheville. We got great direct

In [15]:
reviews = reviews.reset_index(drop=True)

### Joining review count with reviews table

In [16]:
reviews = pd.merge(reviews, reviews1, how='left', on=('listing_id'))

In [17]:
reviews.iloc[[0,-1]]

Unnamed: 0,listing_id,review,review_count
0,38585,Evelyne is an accommodating host who lives in ...,134
2122,38616361,If you have to stay in the Asheville area and ...,1


## Tokenizing the Review Text & Deriving New Columns

In [18]:
def text_transformation(data, column):
# Tokenizing the reviews into sentences
#     data['tokenized_sentenses'] = data.apply(lambda row: sent_tokenize(row[column]), axis=1)
#     Count the number of sentence in each Review and store it in a new column 'sentence_count'
    data['sentence_count'] = data.apply(lambda row: sent_tokenize(row[column]), axis=1).apply(lambda x: len(x))
    
#   List the Strong Words useful for cleaning later. Single letter words like 'I', 'A' will be ignored.
    data['strong_words'] = data.apply(lambda row: re.findall(r"\b[A-Z][A-Z]+\b",row[column]), axis=1)  
    
#   Number of words that were written in uppercase in the reviews (STRONG words)
    data['strong_count'] = data['strong_words'].apply(lambda x: len(x))
#     data['strong_count'] = data.apply(lambda row: len(map(str.isupper, row[column].split())),axis=1)

#   Count the number of '#', '!', '?' in each Review and store it in a new column 'symbol_count'
    data['symbol_count'] = data.apply(lambda row: re.findall(r"[\#\!\?]",row[column]), axis=1).apply(lambda x: len(x))
    
    return data

In [19]:
def review_words_transformation(data, column):
#     Word Tokenize the reviews
    data['temp'ply(lambda row: word_tokenize(row[column]), axis=1)
    
#     Convert the tokens into lowercase
    data['temp'] = data.apply(lambda row: [t.lower() for t in row['temp']], axis=1)
    
#     Retain only alphabetic_words] = data.ap
    data['temp'] = data.apply(lambda row: [t for t in row['temp'] if t.isalpha()], axis=1)    
    
#     Remove all stop words
    data['temp'] = data.apply(lambda row: [t for t in row['temp'] if t not in stopwords.words('english')], axis=1) 
    
#     Instantiate the WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
# In general, stemming stripping affixes such as ‘ed’, ‘ing’, ‘tion’ etc. Ex. stemming ‘walked’ returns walk. 
# But a more intelligent way of cleaning text data is by lemmatization. 
# In lemmatization words are converted to its root words using a dictionary. Ex. lemmatizing ‘is’ returns ‘be’. 
# Hence we have used lemmatization in our cleaning process.

#     Lemmatize all tokens 
    data['temp'] = data.apply(lambda row: [wordnet_lemmatizer.lemmatize(t) for t in row['temp']], axis=1)
    
#     Stored Counter of tokenized words
    data['word_count'] = data.apply(lambda row: Counter(row['temp']), axis=1)
      
#     Number of records for which 'review_words_transformation' function is to be run
    return data

## Running the functions for Tokenization and Creating New Columns on the Subset dataframe

Creating subset dataframe so that we DO NOT run the functions that create the word token on the entire dataset but in batches.

Subset-1 : First 100 records.<br>
Subset-2 : 101-500 records.<br>
Subset-3 : 501-1000 records.<br>
Subset-4 : 1001-2000 records.<br>
Subset-5 : 2001-2123 records.<br>

In [20]:
# Creating a subset dataframe to work on
subset = reviews[2000:]
subset.shape

(123, 3)

In [21]:
# Adding New columns to the subset table
new_columns=reviews.columns.tolist()+['sentence_count','strong_words','strong_count','symbol_count','temp','word_count']
subset = subset.reindex(columns = new_columns)                

In [22]:
# First and Last Record of the current subset.
subset.iloc[[0, -1]]

Unnamed: 0,listing_id,review,review_count,sentence_count,strong_words,strong_count,symbol_count,temp,word_count
2000,35774025,This was a fabulous townhome - incredibly func...,10,,,,,,
2122,38616361,If you have to stay in the Asheville area and ...,1,,,,,,


In [23]:
%%time
text_transformation(subset, 'review')

Wall time: 102 ms


Unnamed: 0,listing_id,review,review_count,sentence_count,strong_words,strong_count,symbol_count,temp,word_count
2000,35774025,This was a fabulous townhome - incredibly func...,10,18,[LOVED],1,9,,
2001,35802993,Great location! The place seemed to be profess...,13,45,[],0,17,,
2002,35843306,Mark was a very accommodating and helpful host...,11,33,"[ZEN, ZEN, ZEN]",3,16,,
2003,35877435,"Great place, super clean with everything neede...",6,24,[],0,4,,
2004,35910484,My friend and I stayed here for a night. We a...,16,63,"[BBQ, AMAZING]",2,25,,
2005,35963534,Jeanne’s place is an adorable little apartment...,1,4,[],0,0,,
2006,35968053,Mary's home was perfect for our family trip to...,10,25,[],0,2,,
2007,36002529,The photos don't do this place justice! Selina...,7,23,[BNB],1,5,,
2008,36066072,We rented the Atkinson suite with a couple of ...,1,8,[],0,0,,
2009,36068314,Nice secluded place in the woods. Rustic but r...,1,2,[],0,0,,


In [24]:
%%time
review_words_transformation(subset, 'review')

Wall time: 23.1 s


Unnamed: 0,listing_id,review,review_count,sentence_count,strong_words,strong_count,symbol_count,temp,word_count
2000,35774025,This was a fabulous townhome - incredibly func...,10,18,[LOVED],1,9,"[fabulous, townhome, incredibly, functional, b...","{'fabulous': 1, 'townhome': 1, 'incredibly': 2..."
2001,35802993,Great location! The place seemed to be profess...,13,45,[],0,17,"[great, location, place, seemed, professionall...","{'great': 9, 'location': 9, 'place': 9, 'seeme..."
2002,35843306,Mark was a very accommodating and helpful host...,11,33,"[ZEN, ZEN, ZEN]",3,16,"[mark, accommodating, helpful, host, always, q...","{'mark': 11, 'accommodating': 2, 'helpful': 1,..."
2003,35877435,"Great place, super clean with everything neede...",6,24,[],0,4,"[great, place, super, clean, everything, neede...","{'great': 5, 'place': 6, 'super': 2, 'clean': ..."
2004,35910484,My friend and I stayed here for a night. We a...,16,63,"[BBQ, AMAZING]",2,25,"[friend, stayed, night, absolutely, loved, stu...","{'friend': 1, 'stayed': 1, 'night': 1, 'absolu..."
2005,35963534,Jeanne’s place is an adorable little apartment...,1,4,[],0,0,"[jeanne, place, adorable, little, apartment, b...","{'jeanne': 2, 'place': 1, 'adorable': 1, 'litt..."
2006,35968053,Mary's home was perfect for our family trip to...,10,25,[],0,2,"[mary, home, perfect, family, trip, asheville,...","{'mary': 10, 'home': 7, 'perfect': 2, 'family'..."
2007,36002529,The photos don't do this place justice! Selina...,7,23,[BNB],1,5,"[photo, place, justice, selina, home, unique, ...","{'photo': 1, 'place': 7, 'justice': 1, 'selina..."
2008,36066072,We rented the Atkinson suite with a couple of ...,1,8,[],0,0,"[rented, atkinson, suite, couple, friend, enti...","{'rented': 1, 'atkinson': 1, 'suite': 2, 'coup..."
2009,36068314,Nice secluded place in the woods. Rustic but r...,1,2,[],0,0,"[nice, secluded, place, wood, rustic, reasonab...","{'nice': 1, 'secluded': 1, 'place': 1, 'wood':..."


In [25]:
# If 'review_words_transformation' function is successfully implemented in the subset, the value will be 0.
subset[subset["word_count"].isnull()].shape[0]

0

In [26]:
# Preparing the dataset to be saved
subset = subset[['listing_id', 'review', 'sentence_count', 'strong_words', 'strong_count', 'symbol_count', 'word_count']]

In [27]:
subset.shape

(123, 7)

In [28]:
subset.dtypes

listing_id         int64
review            object
sentence_count     int64
strong_words      object
strong_count       int64
symbol_count       int64
word_count        object
dtype: object

In [29]:
# Finding all the subset reviews csv files
files = glob.glob('processed_reviews*.csv')
len(files)

4

In [30]:
# Extract "reviews" from files[0]
try: 
    new_filename = re.search(r"[A-Za-z+_]+", files[0])[0]
except IndexError:
    new_filename = 'processed_reviews'

# Create new file name by adding len+1 to the current reviews.csv list
# i.e, if there are already 5 csv files starting with "reviews", 
# the filename of the new file to be created will be reviews6.csv
new_filename = str(new_filename) + str(len(files)+1) + '.csv'
new_filename

'processed_reviews5.csv'

In [31]:
# Saving the dataset
subset.to_csv(new_filename)

In [32]:
# Add newly created csv file to the list
files = files + [new_filename]
files

['processed_reviews1.csv',
 'processed_reviews2.csv',
 'processed_reviews3.csv',
 'processed_reviews4.csv',
 'processed_reviews5.csv']

## Combining all the subset csv files to create a single dataset to be used for the model

In [33]:
# Intialize df with first file : reviews1.csv
df = pd.read_csv(files[0])
del files[0] # Delete 'reviews.csv' from the list, so that we don't append it.

for f in files:
    df_temp = pd.read_csv(f)
    df = pd.concat([df,df_temp])

In [34]:
df.shape

(2123, 8)

In [35]:
df = df.reset_index(drop=True)

In [36]:
df.iloc[[0, -1]]

Unnamed: 0.1,Unnamed: 0,listing_id,review,sentence_count,strong_words,strong_count,symbol_count,word_count
0,0,38585,Evelyne is an accommodating host who lives in ...,559,"['DELICIOUS', 'AMAZING', 'UNC', 'US', 'BBQ', '...",11,142,"Counter({'evelyne': 161, 'breakfast': 100, 'ho..."
2122,2122,38616361,If you have to stay in the Asheville area and ...,2,[],0,1,"Counter({'asheville': 2, 'stay': 1, 'area': 1,..."


In [37]:
df = df[['listing_id', 'review', 'sentence_count', 'strong_words', 'strong_count', 'symbol_count', 'word_count']]

In [38]:
df.to_csv('reviews_intermediate_stage.csv', index=False)

## Creating CounterVectorizer and Creating New Columns from the words present in the review

In [39]:
df = pd.read_csv('reviews_intermediate_stage.csv')
df.iloc[[0, -1]]

Unnamed: 0,listing_id,review,sentence_count,strong_words,strong_count,symbol_count,word_count
0,38585,Evelyne is an accommodating host who lives in ...,559,"['DELICIOUS', 'AMAZING', 'UNC', 'US', 'BBQ', '...",11,142,"Counter({'evelyne': 161, 'breakfast': 100, 'ho..."
2122,38616361,If you have to stay in the Asheville area and ...,2,[],0,1,"Counter({'asheville': 2, 'stay': 1, 'area': 1,..."


In [40]:
df.shape

(2123, 7)

In [41]:
def transform_word_count(text): # If text= "Counter({'first_word': 2, 'second_word': 1"
    words = re.findall(r'\'(.+?)\'',text) # returns ['first_word','second_word']
    n = re.findall(r"\d+",text) #returns ['2','1']
    result = []
    for i in range(len(words)): # goal is to obtain ['first_word', 'first_word', 'second_word']
        for j in range(int(n[i])):
            result.append(words[i])
    return result

In [42]:
def find_no_of_words(text): # If text= "Counter({'first_word': 2, 'second_word': 1"
    n = re.findall(r"\d+",text) #returns ['2','1']
    # Code Reference : https://stackoverflow.com/questions/7368789/convert-all-strings-in-a-list-to-int
    n = list(map(int, n)) #returns [2,1]
    return sum(n)

In [43]:
# Change the column name from 'word_count' to 'word_counter'
df.rename(columns={'word_count':'word_counter'}, inplace=True) #https://cmdlinetips.com/2018/03/how-to-change-column-names-and-row-indexes-in-pandas/

In [44]:
# Creating a new column word_count that has the number of cleaned words in the review
df['word_count'] = df.apply(lambda row: find_no_of_words(row['word_counter']), axis=1)
df.head(n=1)

Unnamed: 0,listing_id,review,sentence_count,strong_words,strong_count,symbol_count,word_counter,word_count
0,38585,Evelyne is an accommodating host who lives in ...,559,"['DELICIOUS', 'AMAZING', 'UNC', 'US', 'BBQ', '...",11,142,"Counter({'evelyne': 161, 'breakfast': 100, 'ho...",4356


In [45]:
# Modifying the word_counter to the desired format of text and storing it as a new variable 'modified_review_text'
df['modified_review_text'] = df.apply(lambda row: transform_word_count(row['word_counter']), axis=1)
df.head(n=1)

Unnamed: 0,listing_id,review,sentence_count,strong_words,strong_count,symbol_count,word_counter,word_count,modified_review_text
0,38585,Evelyne is an accommodating host who lives in ...,559,"['DELICIOUS', 'AMAZING', 'UNC', 'US', 'BBQ', '...",11,142,"Counter({'evelyne': 161, 'breakfast': 100, 'ho...",4356,"[evelyne, evelyne, evelyne, evelyne, evelyne, ..."


In [46]:
df['modified_review_text'].head(n=1)

0    [evelyne, evelyne, evelyne, evelyne, evelyne, ...
Name: modified_review_text, dtype: object

In [47]:
# Reference : https://stackoverflow.com/questions/29244286/how-to-flatten-a-2d-list-to-1d-without-using-numpy
# Creating a vocabulary of words from all the reviews using the itertools library by joining all modified_review_text values
vocabulary = list(itertools.chain.from_iterable(df.modified_review_text.tolist()))
vocabulary

['evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'evelyne',
 'ev

In [48]:
vocab = Counter()
vocab.update(vocabulary)
print(vocab)






In [49]:
# keep top 5000 tokens
tokens = [k for k,c in vocab.most_common(5000)]
print((tokens))






In [50]:
# Function to save a text from this Notebook as a .txt file in the same directory. Saving text for later use.
def save_list(lines, filename):
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w', encoding='utf-8')
    # write text
    file.write(data)
    # close file
    file.close()

In [51]:
# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')

In [52]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf-8')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [53]:
# load the vocabulary
vocab = load_doc('vocab.txt')
vocab = vocab.split() # Split the content into a list of words like ['word1', 'word2']
# 'print(type(vocab))' returns '<class 'list'>'
print(vocab)

# vocab1 = set(vocab) #returns {'word1', 'word2'} but the order is changed
# 'print(type(vocab1))' returns '<class 'set'>'






In [54]:
vectorizer = CountVectorizer(max_features=5000, vocabulary=vocab)
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None,
        vocabulary=['great', 'place', 'stay', 'asheville', 'location', 'clean', 'downtown', 'would', 'host', 'comfortable', 'house', 'u', 'everything', 'home', 'perfect', 'space', 'definitely', 'nice', 'close', 'recommend', 'easy', 'room', 'time', 'quiet', 'wonderful', 'bed', 'beautiful', 'loved', 'back', '...ted', 'comparison', 'prepped', 'doorway', 'lifetime', 'dans', 'una', 'twenty', 'sticky', 'platform'])

In [55]:
df['modified_review_text'].head(n=1)

0    [evelyne, evelyne, evelyne, evelyne, evelyne, ...
Name: modified_review_text, dtype: object

In [56]:
# Transforming the column 'modified_review_text' to be used as input for vectorizer.fit_transform
# Removing the '[' present in the modified_review_text column of all the records
df['modified_review_text'] = df.apply(lambda row: re.sub(r'\[',"",(str(row['modified_review_text']))), axis=1)
# Removing the single quotes present in the modified_review_text column of all the records
df['modified_review_text'] = df.apply(lambda row: re.sub(r'\'',"",(str(row['modified_review_text']))), axis=1)
# Removing the ',' present in the modified_review_text column of all the records
df['modified_review_text'] = df.apply(lambda row: re.sub(r',',"",(str(row['modified_review_text']))), axis=1)
# Removing the ']' present in the modified_review_text column of all the records
df['modified_review_text'] = df.apply(lambda row: re.sub(r'\]',"",(str(row['modified_review_text']))), axis=1)

df['modified_review_text'].head(n=1)

0    evelyne evelyne evelyne evelyne evelyne evelyn...
Name: modified_review_text, dtype: object

In [57]:
train_data_features = vectorizer.fit_transform(df['modified_review_text'])

In [58]:
print(len(vectorizer.get_feature_names()))
vectorizer.get_feature_names()[:10]

5000


['great',
 'place',
 'stay',
 'asheville',
 'location',
 'clean',
 'downtown',
 'would',
 'host',
 'comfortable']

In [59]:
# Bag of Words - Data Frame
bow_df = pd.DataFrame(train_data_features.toarray(), columns=vectorizer.get_feature_names())

In [60]:
df.columns.tolist()

['listing_id',
 'review',
 'sentence_count',
 'strong_words',
 'strong_count',
 'symbol_count',
 'word_counter',
 'word_count',
 'modified_review_text']

In [61]:
columns_to_delete = ['review', 'strong_words', 'word_counter', 'modified_review_text']
df.drop(columns=columns_to_delete, inplace=True)
df.tail(n=1)

Unnamed: 0,listing_id,sentence_count,strong_count,symbol_count,word_count
2122,38616361,2,0,1,15


In [62]:
final_df = pd.concat([df, bow_df], axis=1)

In [63]:
print("Number of columns in our original df = " + str(df.shape[1]))
print("Number of columns in our Bag of Words df = " + str(bow_df.shape[1]))
print("Number of columns in our Merged df = " + str(final_df.shape[1]))

Number of columns in our original df = 5
Number of columns in our Bag of Words df = 5000
Number of columns in our Merged df = 5005


In [64]:
final_df.to_csv('reviews_to_columns.csv', index=False)

# Loading the final dataset before joining with lisitngs data

In [65]:
reviews = pd.read_csv('reviews_to_columns.csv')
reviews.shape

(2123, 5005)

In [66]:
reviews.head(2)

Unnamed: 0,listing_id,sentence_count,strong_count,symbol_count,word_count,great,place,stay,asheville,location,...,fitted,comparison,prepped,doorway,lifetime,dans,una,twenty,sticky,platform
0,38585,559,11,142,4356,65,48,80,62,9,...,0,0,0,0,0,0,0,0,0,0
1,80905,308,6,71,2163,72,45,41,44,47,...,0,0,0,0,0,0,0,0,0,0


# Listings

In [67]:
listings = pd.read_csv('.\\Asheville\\listings.csv')

In [68]:
# to view all columns (max 50) in the dataframe
pd.set_option('display.max_columns', 120)

In [69]:
listings.head(1)

Unnamed: 0,id,listing_url,scrape_id,last_scraped,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,house_rules,thumbnail_url,medium_url,picture_url,xl_picture_url,host_id,host_url,host_name,host_since,host_location,host_about,host_response_time,host_response_rate,host_acceptance_rate,host_is_superhost,host_thumbnail_url,host_picture_url,host_neighbourhood,host_listings_count,host_total_listings_count,host_verifications,host_has_profile_pic,host_identity_verified,street,neighbourhood,neighbourhood_cleansed,neighbourhood_group_cleansed,city,state,zipcode,market,smart_location,country_code,country,latitude,longitude,is_location_exact,property_type,room_type,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,square_feet,price,weekly_price,monthly_price,security_deposit,cleaning_fee,guests_included,extra_people,minimum_nights,maximum_nights,minimum_minimum_nights,maximum_minimum_nights,minimum_maximum_nights,maximum_maximum_nights,minimum_nights_avg_ntm,maximum_nights_avg_ntm,calendar_updated,has_availability,availability_30,availability_60,availability_90,availability_365,calendar_last_scraped,number_of_reviews,number_of_reviews_ltm,first_review,last_review,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,requires_license,license,jurisdiction_names,instant_bookable,is_business_travel_ready,cancellation_policy,require_guest_profile_picture,require_guest_phone_verification,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,38585,https://www.airbnb.com/rooms/38585,20190928054837,2019-09-28,Charming Victorian home - twin beds + breakfast,Per the City Council of Asheville. Due to the ...,"Charming room with 2 twin size beds, furnished...",Per the City Council of Asheville. Due to the ...,none,Our North Asheville neighborhood stretches alo...,I can lend you a yoga mat or you can go to our...,"You need a car, but UBER and Lyft are availabl...",Safe parking in our driveway. You also have fu...,"I love entertaining, enjoy chatting with guest...",This list is based on previous experiences: * ...,,,https://a0.muscache.com/im/pictures/216165/755...,,165529,https://www.airbnb.com/users/show/165529,Evelyne,2010-07-13,"Asheville, North Carolina, United States","(Hidden by Airbnb) Likes:\r\nDancing, Horse Ri...",within a few hours,100%,,t,https://a0.muscache.com/im/users/165529/profil...,https://a0.muscache.com/im/users/165529/profil...,,1,1,"['email', 'phone', 'facebook', 'reviews', 'kba']",t,f,"Asheville, NC, United States",,28804,,Asheville,NC,28804.0,North Carolina Mountains,"Asheville, NC",US,United States,35.65146,-82.62792,t,House,Private room,2,1.0,1,2,Real Bed,"{Internet,Wifi,""Air conditioning"",Kitchen,""Fre...",,$60.00,,,$150.00,$0.00,1,$20.00,2,3,2,2,3,3,2.0,3.0,yesterday,t,19,46,75,75,2019-09-28,134,19,2010-10-28,2019-09-02,96.0,10.0,10.0,10.0,10.0,10.0,10.0,f,,"{""NORTH CAROLINA"","" BUNCOMBE""}",f,f,moderate,t,t,1,0,1,0,1.23


In [70]:
listings.shape

(2342, 106)

In [71]:
listings.id.nunique()

2342

There are no reviews for 219 listings. We are going to ignore these 219 listings.

## Choosing Lisitng columns to be kept in the Machine Learning model

In [72]:
listings_continous = listings[['id','host_since','host_response_rate','host_listings_count','latitude','longitude',
                               'accommodates','bathrooms','bedrooms','beds','price','security_deposit','cleaning_fee',
                               'guests_included','extra_people','minimum_nights','maximum_nights','availability_30',
                               'availability_60','availability_90','availability_365','number_of_reviews',
                               'number_of_reviews_ltm','reviews_per_month','first_review','last_review',
                               'review_scores_rating','review_scores_accuracy','review_scores_cleanliness',
                               'review_scores_checkin','review_scores_communication','review_scores_location',
                               'review_scores_value','cancellation_policy','calculated_host_listings_count',
                               'calculated_host_listings_count_entire_homes','calculated_host_listings_count_private_rooms',
                               'calculated_host_listings_count_shared_rooms']]

In [73]:
listings_categorical = listings[['id','host_response_time','host_is_superhost','host_identity_verified',
                                 'host_verifications','neighbourhood_cleansed','is_location_exact',
                                 'property_type','room_type','amenities','instant_bookable']]
df.rename(columns={'neighbourhood_cleansed':'zipcode'}, inplace=True)

In [74]:
listings_text = listings[['id','name','summary','space','description','neighborhood_overview','notes',
                          'transit','access','interaction','house_rules','host_about']]

In [75]:
listings_categorical.head()

Unnamed: 0,id,host_response_time,host_is_superhost,host_identity_verified,host_verifications,neighbourhood_cleansed,is_location_exact,property_type,room_type,amenities,instant_bookable
0,38585,within a few hours,t,f,"['email', 'phone', 'facebook', 'reviews', 'kba']",28804,t,House,Private room,"{Internet,Wifi,""Air conditioning"",Kitchen,""Fre...",f
1,80905,within an hour,t,f,"['email', 'phone', 'facebook', 'reviews', 'jum...",28801,t,Loft,Entire home/apt,"{TV,Internet,Wifi,""Air conditioning"",Kitchen,""...",t
2,108061,within a day,f,t,"['email', 'phone', 'facebook', 'reviews', 'off...",28801,t,Apartment,Entire home/apt,"{Wifi,""Air conditioning"",Kitchen,""Free parking...",f
3,155305,within an hour,t,f,"['email', 'phone', 'facebook', 'reviews', 'off...",28806,t,Guesthouse,Entire home/apt,"{Internet,Wifi,""Air conditioning"",Kitchen,""Fre...",t
4,156805,within an hour,t,f,"['email', 'phone', 'facebook', 'reviews', 'off...",28806,t,House,Private room,"{Internet,Wifi,Kitchen,""Free parking on premis...",t


In [79]:
# Creating Dummay variables
categorical = listings_categorical.columns.tolist()
categorical.remove('id')
categorical = [e for e in categorical if e not in ('host_verifications', 'amenities')]
for i in categorical:
    listings_categorical = pd.get_dummies(listings_categorical, prefix=[i], columns=[i], drop_first=True)

In [81]:
listings = pd.merge(listings_continous, listings_categorical, how='left', on=('id'))

# Comibing Listings and Reviews

In [84]:
fianl_dataset = pd.merge(listings, reviews, how='inner', left_on='id', right_on='listing_id')

In [85]:
fianl_dataset.to_csv('+++Final_Airbnb_Dataset.csv', index=False)