# Import libraries

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import os
from zipfile import ZipFile

In [2]:
%config Completer.use_jedi = False # Solve error in autocomplete

# Data understanding

## Load data
Extract dataframes from zipfile

In [6]:
# Zip archives live in a 'data' folder next to the notebook's working directory
DATA_PATH = os.path.join(os.getcwd(), 'data')
FILE_PATH_SEATTLE, FILE_PATH_BOSTON = (
    os.path.join(DATA_PATH, archive) for archive in ('seattle.zip', 'boston.zip')
)

In [7]:
def extract_df_from_airbnb_zipfile(PATH_ZIPFILE) :
    '''
    Read every csv file inside a zip archive into a pandas dataframe.

    The name of each csv member is also printed so the caller can see
    which keys are available in the returned dictionary.

    INPUT  : PATH_ZIPFILE - file path to the zipfile to open
    OUTPUT : a dictionary mapping the name of each csv member inside the
    archive to the dataframe loaded from it (non-csv members are skipped)
    '''

    dfs = {}
    # Context managers close the archive and every member handle once they
    # have been read, even when pd.read_csv raises.
    with ZipFile(PATH_ZIPFILE) as zf :
        for text_file in zf.infolist() :
            if not text_file.filename.endswith('.csv') :
                continue
            with zf.open(text_file) as csv_file :
                dfs[text_file.filename] = pd.read_csv(csv_file)

    print('Printing a dictionary with filenames as keys')
    for filename in dfs :
        print(f'Filename (keys): {filename}')

    return dfs

## Exploring Seattle data first

In [8]:
# Load every csv in the Seattle archive into a dict of dataframes keyed by filename
dfs_seattle = extract_df_from_airbnb_zipfile(FILE_PATH_SEATTLE)

Printing a dictionary with filenames as keys
Filename (keys): calendar.csv
Filename (keys): listings.csv
Filename (keys): reviews.csv


In [9]:
# Pick the reviews table out of the extracted dataframes and preview its first rows
reviews_seattle = dfs_seattle['reviews.csv']
reviews_seattle.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,7202016,38917982,2015-07-19,28943674,Bianca,Cute and cozy place. Perfect location to every...
1,7202016,39087409,2015-07-20,32440555,Frank,Kelly has a great room in a very central locat...
2,7202016,39820030,2015-07-26,37722850,Ian,"Very spacious apartment, and in a great neighb..."
3,7202016,40813543,2015-08-02,33671805,George,Close to Seattle Center and all it has to offe...
4,7202016,41986501,2015-08-10,34959538,Ming,Kelly was a great host and very accommodating ...


In [10]:
# Summarize column dtypes and non-null counts to spot missing values
reviews_seattle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84849 entries, 0 to 84848
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   listing_id     84849 non-null  int64 
 1   id             84849 non-null  int64 
 2   date           84849 non-null  object
 3   reviewer_id    84849 non-null  int64 
 4   reviewer_name  84849 non-null  object
 5   comments       84831 non-null  object
dtypes: int64(3), object(3)
memory usage: 3.9+ MB


There are 18 missing comments, and the `date` column is stored as plain strings (`object`) rather than as datetime values.
As `comments` is the key feature for the sentiment analysis, any records with missing comments will be dropped.

# Data preparation

### Dropping null values in 'comments' column

In [11]:
# Work on a copy so the cleaning steps leave reviews_seattle untouched
df_copy = reviews_seattle.copy()

In [12]:
# Show the reviews whose comment is missing
missing_comment_mask = df_copy['comments'].isna()
df_copy[missing_comment_mask]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
11034,461567,11614247,2014-04-09,12120141,Abbey,
15383,9460,10563024,2014-02-24,12498029,Debra,
15914,2906694,48629316,2015-09-27,44218980,Anush,
16097,910784,9950520,2014-01-21,179481,Enrico,
27280,10695,52378,2010-06-13,105412,Wei,
29467,1018204,10024984,2014-01-26,10571694,Jordan,
30619,6079216,34824019,2015-06-12,31556342,Mack,
31414,3354614,18103248,2014-08-21,12426758,Jeff,
35210,3554558,24863045,2015-01-03,24488791,Eleanor,
37971,1790020,15640556,2014-07-13,16884291,Michael,


In [13]:
# Keep only the reviews that actually contain a comment
df_copy = df_copy.dropna(subset=['comments'])
df_copy.shape # 84849 rows before, 18 removed

(84831, 6)

## Remove blank records
Sometimes there are blank reviews (empty or whitespace-only text) that are not recognized as NaN values. The code below finds the index of such blank entries.

In [14]:
# Collect the index labels of comments that are empty or whitespace-only.
# Series.items() yields the dataframe's own index labels; a positional
# counter from enumerate() would not line up with them, because the rows
# with missing comments were dropped without resetting the index.
blanks = [
    label
    for label, comment in df_copy['comments'].items()
    # `not comment.strip()` also catches the empty string '', which
    # str.isspace() reports as False
    if isinstance(comment, str) and not comment.strip()
]

print(len(blanks))
print(blanks)

0
[]


No blank records found!

### Converting 'date' column into datetime object

In [15]:
# Parse the YYYY-MM-DD date strings into pandas datetime values
df_copy['date'] = pd.to_datetime(df_copy['date'], format='%Y-%m-%d')

### Breaking the date into year, month & day

In [16]:
# Derive calendar components from the parsed 'date' column
review_dates = df_copy['date'].dt
df_copy = df_copy.assign(
    year=review_dates.year,
    month=review_dates.month,
    day=review_dates.day,
)

In [17]:
# Bind the cleaned frame to its final name (same object as df_copy, not a new copy)
reviews_seattle_clean = df_copy

In [19]:
# Preview the first three cleaned reviews, including the new year/month/day columns
reviews_seattle_clean.head(3)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,year,month,day
0,7202016,38917982,2015-07-19,28943674,Bianca,Cute and cozy place. Perfect location to every...,2015,7,19
1,7202016,39087409,2015-07-20,32440555,Frank,Kelly has a great room in a very central locat...,2015,7,20
2,7202016,39820030,2015-07-26,37722850,Ian,"Very spacious apartment, and in a great neighb...",2015,7,26


In [21]:
# Store the clean data
# index=False leaves the dataframe index out of the csv file
reviews_seattle_clean.to_csv('data/review_seattle_cleaned.csv', index=False)

---
# Modeling
We can conduct sentiment analysis on comments to evaluate how positive or negative each comment is but the model cannot be quantitatively evaluated with test scores with no labels provided in this case.

Instead, **topic modeling** will be performed to categorize the reviews into 10 common topics. 

Two algorithms will be trialed :
1. Latent Dirichlet allocation, LDA
2. Non-negative Matrix Factorization

### Import libraries

In [None]:
import nltk
# NOTE(review): called without arguments, nltk.download() opens NLTK's
# interactive downloader and waits for user input, so this cell stalls an
# unattended "Restart & Run All". Consider naming the required resources
# explicitly, e.g. nltk.download('<resource>') - TODO confirm which are needed.
nltk.download()

import spacy
# Load spaCy's small English pipeline; the 'en_core_web_sm' package has to be
# installed beforehand.
nlp = spacy.load('en_core_web_sm')

In [82]:
# Libraries for topic modeling
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF

### Load the clean data

In [51]:
# Reload the cleaned reviews written earlier. A csv file stores dates as plain
# text, so parse_dates restores the datetime dtype of the 'date' column
# instead of letting it silently fall back to object.
df = pd.read_csv('data/review_seattle_cleaned.csv', parse_dates=['date'])
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,year,month,day
0,7202016,38917982,2015-07-19,28943674,Bianca,Cute and cozy place. Perfect location to every...,2015,7,19
1,7202016,39087409,2015-07-20,32440555,Frank,Kelly has a great room in a very central locat...,2015,7,20
2,7202016,39820030,2015-07-26,37722850,Ian,"Very spacious apartment, and in a great neighb...",2015,7,26
3,7202016,40813543,2015-08-02,33671805,George,Close to Seattle Center and all it has to offe...,2015,8,2
4,7202016,41986501,2015-08-10,34959538,Ming,Kelly was a great host and very accommodating ...,2015,8,10


In [105]:
# Row and column counts of the reloaded frame
df.shape

(84831, 9)

### MODEL1 | Topic Modeling with LDA

In [85]:
# Instantiate the count vectorizer
# - stop_words='english' : remove English stopwords
# - max_df=0.90 : ignore terms that appear in more than 90% of the documents
# - min_df=2    : ignore terms that appear in fewer than 2 documents
cv = CountVectorizer(stop_words='english', max_df=0.90, min_df=2)

In [86]:
# Learn the vocabulary and vectorize the comments into per-document word counts
dtm = cv.fit_transform(df['comments'])

In [87]:
# Inspect the shape and sparsity of the document-term matrix
dtm

<84831x20096 sparse matrix of type '<class 'numpy.int64'>'
	with 2450653 stored elements in Compressed Sparse Row format>

In [88]:
# Instantiate LDA model for 10 topics
# random_state is pinned because LDA starts from a random initialisation;
# without a seed every fit would yield different topics.
LDA_model = LatentDirichletAllocation(n_components=10, random_state=42)

In [91]:
# Fit the vectorized text to LDA model
# (left disabled on purpose: the fit was interrupted because it ran too long)
# LDA_model.fit(dtm)

The operation above was interrupted because fitting the LDA model took too long. Let's see whether an NMF model can be fit faster.

### MODEL2 | Topic Modeling with NMF

In [92]:
# Instantiate the tf-idf vectorizer
# - stop_words='english' : remove English stopwords
# - max_df=0.90 : ignore terms that appear in more than 90% of the documents
# - min_df=2    : ignore terms that appear in fewer than 2 documents
tfidf = TfidfVectorizer(stop_words='english', max_df=0.90, min_df=2)

In [93]:
# Vectorize the comments with tf-idf weights.
# NOTE: this rebinds `dtm`, replacing the count matrix built for the LDA model.
dtm = tfidf.fit_transform(df['comments'])

In [94]:
dtm # 84831 records (reviews) x 20096 vectorized words (vocabulary terms)

<84831x20096 sparse matrix of type '<class 'numpy.float64'>'
	with 2450653 stored elements in Compressed Sparse Row format>

In [96]:
# Instantiate the NMF model and fit it on the tf-idf document-term matrix
# max_iter set to 1000 to avoid convergence warning
# random_state pinned so the extracted topics can be reproduced on re-run
NMF_model = NMF(n_components=10, max_iter=1000, random_state=42)
NMF_model.fit(dtm)

NMF(max_iter=1000, n_components=10)

In [98]:
NMF_model.components_.shape # 10 topics x 20096 words: one coefficient per (topic, word) pair

(10, 20096)

In [107]:
# Look the vocabulary up once instead of rebuilding it for every word printed.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); fall back to the old name so the cell
# also runs on older installs.
if hasattr(tfidf, 'get_feature_names_out') :
    feature_names = tfidf.get_feature_names_out()
else :
    feature_names = tfidf.get_feature_names()

N_TOP_WORDS = 20

for i, topic in enumerate(NMF_model.components_) :
    print(f'Printing the most relevant words for Topic# {i}')

    # argsort is ascending, so the last N indices are the N largest
    # coefficients, listed from weakest to strongest
    print([feature_names[idx] for idx in topic.argsort()[-N_TOP_WORDS:]])
    print()

Printing the most relevant words for Topic# 0
['recommend', 'hosts', 'welcoming', 'thank', 'amazing', 'perfect', 'airbnb', 'host', 'welcome', 'staying', 'like', 'lovely', 'seattle', 'experience', 'felt', 'time', 'beautiful', 'feel', 'wonderful', 'home']

Printing the most relevant words for Topic# 1
['17', '11', '21', '13', '22', '10', '14', '12', '16', '18', '23', '38', 'day', 'host', 'days', 'arrival', 'reservation', 'posting', 'automated', 'canceled']

Printing the most relevant words for Topic# 2
['recommended', 'recommendations', 'price', 'check', 'overall', 'excellent', 'spot', 'accommodating', 'view', 'awesome', 'space', 'hosts', 'communication', 'easy', 'time', 'thanks', 'experience', 'host', 'location', 'great']

Printing the most relevant words for Topic# 3
['accommodating', 'fantastic', 'amazing', 'responsive', 'seattle', 'lauren', 'view', 'highly', 'needed', 'spacious', 'located', 'comfortable', 'building', 'check', 'helpful', 'recommend', 'perfect', 'location', 'clean', 'a