In [None]:
# import necessary packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

In [None]:
# Load the raw listings and reviews CSV exports for each city.
# low_memory=False has pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
austin_listings, austin_reviews = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('./data/austin_listings.csv', './data/austin_reviews.csv')
)
sf_listings, sf_reviews = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('./data/sanfrancisco_listings.csv', './data/sanfrancisco_reviews.csv')
)

<h2> Preprocessing </h2>

In [None]:
# check Austin listings dataframe shape

austin_listings.shape

In [None]:
# check San Francisco listings dataframe shape

sf_listings.shape

In [None]:
# Stack the Austin and San Francisco listings into one dataframe.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0. Each frame keeps its own row labels, so index values
# may repeat across the two cities.
result_listings = pd.concat([austin_listings, sf_listings])
result_listings

In [None]:
# check Austin reviews dataframe shape
austin_reviews.shape

In [None]:
# check San Francisco reviews dataframe shape

sf_reviews.shape

In [None]:
# Stack the Austin and San Francisco reviews into one dataframe.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
result_reviews = pd.concat([austin_reviews, sf_reviews])

# Drop the reviews' own 'id' column; 'listing_id' is renamed to 'id' in the
# next cell and becomes the join key with the listings.
result_reviews = result_reviews.drop('id', axis=1)
result_reviews

In [None]:
# Rename 'listing_id' to 'id' so the reviews share a key column name with the listings.
result_reviews = result_reviews.rename(columns={'listing_id': 'id'})
print(result_reviews.columns.values)

In [None]:
result_reviews.head(n=5)

In [None]:
result_listings.head(n=5)

In [None]:
# Inner-join the reviews to the listings on the shared 'id' key, then keep
# only a subset of the merged columns.
review_columns = [
    'id', 'date', 'reviewer_name', 'reviewer_id', 'comments',
    'review_scores_rating', 'review_scores_accuracy',
]
comb_reviews = pd.merge(result_reviews, result_listings, how="inner", on='id')
comb_reviews = comb_reviews[review_columns]
comb_reviews

In [None]:
def check_null_columns(df):
    '''Summarise the columns of a dataframe that contain missing values.

    Input:
        df: Dataframe

    Returns:
        Dataframe: indexed by column name, with the number of missing values
        ('Number of Nulls') and the percentage of rows that are missing
        ('% of Nulls'), sorted by ascending number of missing values.
        Columns without any missing value are left out, so a fully populated
        dataframe yields an empty result.
    '''
    null_counts = df.isnull().sum()
    # Strictly greater than zero: a '>= 0' test is always true and would list
    # every column, including the complete ones.
    null_counts = null_counts[null_counts > 0].sort_values()

    df_null = null_counts.to_frame(name='Number of Nulls')
    # Share of rows that are null, as a percentage; aligned on column name.
    df_null['% of Nulls'] = null_counts / len(df) * 100

    return df_null

In [None]:
check_null_columns(comb_reviews)

In [None]:
# Keep only the reviews that have a value in every selected column.
comb_reviews = comb_reviews.dropna()
comb_reviews.shape

In [None]:
# check for null columns again

check_null_columns(comb_reviews)

<h1>What are property renters saying about their experiences in reviews?</h1>

In [None]:
# Convert the review score rating to an integer dtype.
# Rows with missing values were dropped earlier, so no NaN is left to block
# the cast.

comb_reviews["review_scores_rating"] = comb_reviews["review_scores_rating"].astype(int)
comb_reviews["review_scores_rating"]

In [None]:
def sentiments(col):
    '''
    Map a single rating score to a sentiment label.

    Args:
        col: one rating value taken from the column being classified

    Returns:
        string: "Positive" above 50, "Negative" below 50, "Neutral" at
        exactly 50; None when none of the comparisons holds (e.g. NaN)
    '''
    if col == 50:
        return "Neutral"
    if col < 50:
        return "Negative"
    if col > 50:
        return "Positive"
    return None
    
# Label the data sets with sentiments

comb_reviews["class"] = comb_reviews["review_scores_rating"].apply(sentiments)
comb_reviews

In [None]:
# Count the reviews in each sentiment class and express each count as a
# percentage of all reviews.
class_sizes = comb_reviews.groupby('class').size()
sample_reviews = class_sizes.reset_index(name="count")
sample_reviews['%count'] = sample_reviews['count'] / sample_reviews['count'].sum() * 100

sample_reviews

In [None]:
# visualize proportion of sentiments

sns.set(style="whitegrid")
# rcParams only apply to figures created afterwards, so the size has to be
# set before barplot() creates the figure.
plt.rcParams['figure.figsize']=(13,13)
ax = sns.barplot(x="class", y="%count", data=sample_reviews)
ax.set_title('Airbnb Sentiment representation on reviews')
# NOTE(review): savefig does not create './plots'; the directory must already exist.
plt.savefig('./plots/rating_status.png', bbox_inches='tight')


In [None]:
comb_reviews.head(n=2)

In [None]:
# select a comment
comb_reviews.iloc[0,4]


<h2>Sentiment Analysis</h2>

We will extract some features to determine a comment's sentiment using the Bag of words strategy

<h3>Text preprocessing:</h3>
<ul>
<li>Tokenization (extracting words and sentences from input)</li>
<li>Apply regex to remove punctuation and whitespace from words</li>
<li>Remove stop words using the NLTK stop-word corpus</li>
<li>Apply stemming. For example, “Flying” is a word whose suffix is “ing”; if we remove “ing” from “Flying” we get the base (root) word “Fly”. Such suffixes are used to create new words from the original stem word.</li>
<li>Apply lemmatization, which maps the different inflected forms of a word to one base form (its lemma).</li>
<li>Create bag of words with redundant words. </li>
<li>Remove irrelevant words (filtering out "this" and words of three letters or fewer).</li>
<li>Counting occurrences (builds a dictionary of features with popular words for each review in the list of documents)</li>
</ul>

In [None]:
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize import RegexpTokenizer

def tokenize_words_Sents(sent):
    '''Tokenize a piece of text at both word level and sentence level.

    Input:
        sent: string

    Returns:
        Tuple: (word tokens from word_tokenize, sentences from sent_tokenize)
    '''
    word_tokens = word_tokenize(sent)
    sentence_tokens = sent_tokenize(sent)
    return word_tokens, sentence_tokens
 
def regexp_tokenizer(sent):
    r'''Split text into word tokens with a regular expression.

    Every maximal run of word characters (the pattern \w+) becomes one token,
    so punctuation and whitespace between the runs are discarded.

    Input:
        sent: string

    Returns:
        List: the matched tokens, in order of appearance
    '''
    word_pattern = r'\w+'
    return RegexpTokenizer(word_pattern).tokenize(sent)

words,sents = tokenize_words_Sents(comb_reviews.iloc[0,4])
print("Words: ",words)
print("Sents: ",sents)


In [None]:
from nltk.corpus import stopwords

def remove_stop_words(sent):
    '''Remove English stop words (NLTK corpus) from a list of tokens.

    The comparison is case-insensitive: the NLTK stop-word list is lowercase,
    so tokens such as "The" or "I" are matched through their lowercase form.
    Kept tokens are returned with their original casing.

    Input:
        sent: list of tokenized words

    Returns:
        Tuple: 1. the set of NLTK English stop words
               2. the tokens of `sent` that are not stop words, in order
    '''
    stop_words = set(stopwords.words("english"))
    filtered_words = [w for w in sent if w.lower() not in stop_words]
    return stop_words,filtered_words

def remove_stop_word(sent):
    '''Remove English stop words (NLTK corpus) from a list of tokens.

    The comparison is case-insensitive: the NLTK stop-word list is lowercase,
    so tokens such as "The" or "I" are matched through their lowercase form.
    Kept tokens are returned with their original casing.

    Input:
        sent: list of tokenized words

    Returns:
        List: the tokens of `sent` that are not stop words, in order
    '''
    stop_words = set(stopwords.words("english"))
    return [w for w in sent if w.lower() not in stop_words]

stop_words,filtered_words = remove_stop_words(words)
print(stop_words)
print(".................................Words.....................................")
print(words)
print(".................................Words filtered............................")
print(filtered_words)

In [None]:
from nltk.stem import PorterStemmer

def stemming_words(words):
    '''Reduce each word to its stem with the Porter stemmer.

    Input:
        words: list of tokenized words

    Returns:
        List: the stem of every input word, in the same order
    '''
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in words]
stemmed_words = stemming_words(filtered_words)
print(stemmed_words)

In [None]:
from nltk.stem import WordNetLemmatizer

def lemmatizing_words(words):
    '''Lemmatize each word with the WordNet lemmatizer.

    Input:
        words: list of filtered words

    Returns:
        List: the lemma of every input word, in the same order
    '''
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in words]

lemmatized_words = lemmatizing_words(filtered_words)
print(lemmatized_words)

In [None]:
# Tokenize every comment; the regex tokenizer also strips punctuation.
list_words = list(map(regexp_tokenizer, comb_reviews['comments']))
print(list_words[1], 'done tokenizing')

# Remove stop words from each tokenized comment.
list_words = list(map(remove_stop_word, list_words))
print(list_words[1], 'done removing stop words')

# Lemmatize the remaining words (the lemmatizer is applied here, not the stemmer).
list_words = list(map(lemmatizing_words, list_words))
print(list_words[1], 'done lemmatizing')

In [None]:
from nltk import FreqDist
def bag_of_words(list_words):
    '''Count how often each word occurs across all tokenized reviews.

    Words are lowercased before counting, so the counts are case-insensitive.

    Input:
        list_words: list of word lists (one list per review)

    Returns:
        FreqDist: mapping of lowercased word to its number of occurrences
    '''
    return FreqDist(
        word.lower()
        for review_words in list_words
        for word in review_words
    )

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
from wordcloud import WordCloud

all_words = bag_of_words(list_words)
ax = plt.figure(figsize=(15,10))  # note: this is the Figure, despite the name

# Build the cloud from the word counts so that font size reflects how often a
# word occurs. Joining the unique keys into one string gives every word the
# same weight. generate_from_frequencies uses the counts as given and does not
# apply WordCloud's own stop-word filtering.
wordcloud = WordCloud(background_color='white',max_font_size=40).generate_from_frequencies(all_words)

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
print("Famous words:",len(all_words))

In [None]:
import seaborn as sns
# visualize popular words
from sklearn.manifold import TSNE
all_words = bag_of_words(list_words)
# Split the ten most common (word, count) pairs into parallel lists.
top_words = all_words.most_common(10)
words = [word for word, _ in top_words]
count = [occurrences for _, occurrences in top_words]
sns.set_style("darkgrid")
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional argument is `data`).
sns.barplot(x=words, y=count)

In [None]:

#ListWords To lower case
def remove_irrelevent_words(list_words):
    '''Lowercase every word and drop the ones that carry little meaning.

    A word is dropped when it has three characters or fewer, or when it is
    the word 'this'. The input lists are not modified.

    Input:
        list_words: list of word lists (one list per review)

    Returns:
        List: new list of word lists, lowercased and filtered
    '''
    cleaned = []
    for review_words in list_words:
        lowered = (item.lower() for item in review_words)
        # Filter into a new list instead of calling remove() while iterating:
        # removing during iteration shifts the remaining items, so the loop
        # skips the element that follows each removal.
        cleaned.append([w for w in lowered if len(w) > 3 and w != 'this'])
    return cleaned

In [None]:
# Filter out short/irrelevant words, rebuild the word counts, and plot the
# 25 most frequent words.
# NOTE: this cell reassigns list_words to its filtered version, so re-running
# the cell feeds the already-filtered lists through the filter again.
list_words = remove_irrelevent_words(list_words)
all_words = bag_of_words(list_words)
print("All Words length ",len(all_words))
# Render figures inline and apply seaborn's default theme before the figure is created.
%matplotlib inline
sns.set()
ax = plt.figure(figsize=(15,10))# `ax` holds the Figure returned by plt.figure
# Copy the counts into a new FreqDist and plot its 25 most frequent samples.
freqdist1 = FreqDist(all_words)
freqdist1.plot(25)

In [None]:
all_words = bag_of_words(list_words)
# Split the ten most common (word, count) pairs into parallel lists.
top_words = all_words.most_common(10)
words = [word for word, _ in top_words]
count = [occurrences for _, occurrences in top_words]
sns.set_style("darkgrid")
plt.rcParams['figure.figsize']=(12,12)
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally (the first positional argument is `data`).
sns.barplot(x=words, y=count)

In [None]:
import random
def create_document(comb_reviews,list_words,seed=None):
    '''Pair each review's word list with its sentiment class and shuffle the pairs.

    Input:
        comb_reviews: Dataframe whose 'class' column holds the target class
        list_words: list of preprocessed word lists, in the same row order
                    as comb_reviews
        seed: optional seed for a reproducible shuffle; with the default None
              the module-level random generator is used

    Returns:
        List: (word list, class) tuples in shuffled order

    Raises:
        IndexError: if list_words has more entries than there are classes
    '''
    list_class = list(comb_reviews['class'])
    # zip() would silently truncate, so keep the failure explicit when a word
    # list has no matching class.
    if len(list_words) > len(list_class):
        raise IndexError(
            "list_words has %d entries but only %d classes are available"
            % (len(list_words), len(list_class))
        )
    documents = list(zip(list_words, list_class))
    #shuffle
    if seed is None:
        random.shuffle(documents)
    else:
        # A private generator keeps the seeded shuffle from touching global random state.
        random.Random(seed).shuffle(documents)
    return documents
#Review docs
documents = create_document(comb_reviews,list_words)
print(documents[1])

In [None]:
def find_features(document,all_words,num_of_words):
    '''Build a dictionary of word-presence features for one review.

       The keys are the num_of_words most common words in all_words. The value
       of each key is True or False depending on whether that word appears in
       the review. Matching is case-insensitive.
    Input:
       document: string, a sample review
       all_words: bag of words (FreqDist) holding the word counts
       num_of_words: how many of the most common words to use as features

    Returns:
        Dictionary: feature word as key, its presence in the review as boolean
    '''
    word_features = [word for word, _ in all_words.most_common(num_of_words)]

    # Lowercase the review tokens: a vocabulary built by bag_of_words is
    # lowercase, so a capitalised token such as "Love" would otherwise never
    # match. A set makes each membership test constant-time.
    tokens = {token.lower() for token in regexp_tokenizer(document)}

    return {w: (w.lower() in tokens) for w in word_features}
find_features("I really love the balcony",all_words,150)

<h4>We have been able to create features for each review. The system is able to identify features and match them against unseen data. The next step would be to create a model that classifies these features into positive, negative and neutral reviews. However, this is beyond the scope of this project.</h4>

We can see that Airbnb reviews contain a higher proportion of positive reviews than of any other type (negative, neutral). Some of the words most frequently used by customers in comments are also visualized above.