# Import libraries

In [1]:
import numpy as np
import pandas as pd
%matplotlib inline

import os
from zipfile import ZipFile

In [2]:
# Libraries for sentiment analysis
import nltk
# Fetch only the VADER lexicon that the sentiment analysis below relies on.
# A bare nltk.download() opens the interactive downloader, which blocks
# "Restart Kernel -> Run All" until someone responds to it.
# NOTE(review): if later cells use other NLTK resources, download them here too.
nltk.download('vader_lexicon')

import spacy
nlp = spacy.load('en_core_web_sm')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


In [3]:
# Turn off jedi for the IPython completer (workaround for a tab-completion error)
%config Completer.use_jedi = False # Solve error in autocomplete

# Data understanding

## Load data
Extract dataframes from zipfile

In [4]:
# Both city archives are expected in a `data` folder under the current working directory
DATA_PATH = os.path.join(os.getcwd(), 'data')
FILE_PATH_SEATTLE, FILE_PATH_BOSTON = (
    os.path.join(DATA_PATH, archive_name)
    for archive_name in ('seattle.zip', 'boston.zip')
)

In [5]:
def extract_df_from_airbnb_zipfile(PATH_ZIPFILE) :
    '''
    Read every csv file inside a zip archive into a pandas DataFrame.

    INPUT  : PATH_ZIPFILE - file path to the zipfile to open
    OUTPUT : a dictionary mapping each csv member's filename (as stored
    in the archive) to the DataFrame parsed from it; members whose name
    does not end with '.csv' are skipped

    Side effect : prints the filename of every csv that was loaded.
    '''

    dfs = {}
    # Context managers release the archive handle and each member stream,
    # even if pd.read_csv raises part-way through.
    with ZipFile(PATH_ZIPFILE) as zf :
        for member in zf.infolist() :
            if not member.filename.endswith('.csv') :
                continue
            with zf.open(member.filename) as csv_stream :
                dfs[member.filename] = pd.read_csv(csv_stream)

    print('Printing a dictionary with filenames as keys')
    for filename in dfs :
        print(f'Filename (keys): {filename}')

    return dfs

## Exploring Seattle data first

In [6]:
# Load every csv in the Seattle archive into a dict of dataframes keyed by filename
dfs_seattle = extract_df_from_airbnb_zipfile(FILE_PATH_SEATTLE)

Printing a dictionary with filenames as keys
Filename (keys): calendar.csv
Filename (keys): listings.csv
Filename (keys): reviews.csv


In [7]:
# Pull the reviews table out of the dictionary and preview its first rows
reviews_seattle = dfs_seattle['reviews.csv']
reviews_seattle.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,7202016,38917982,2015-07-19,28943674,Bianca,Cute and cozy place. Perfect location to every...
1,7202016,39087409,2015-07-20,32440555,Frank,Kelly has a great room in a very central locat...
2,7202016,39820030,2015-07-26,37722850,Ian,"Very spacious apartment, and in a great neighb..."
3,7202016,40813543,2015-08-02,33671805,George,Close to Seattle Center and all it has to offe...
4,7202016,41986501,2015-08-10,34959538,Ming,Kelly was a great host and very accommodating ...


In [8]:
# Check column dtypes and non-null counts to spot missing values
reviews_seattle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 84849 entries, 0 to 84848
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   listing_id     84849 non-null  int64 
 1   id             84849 non-null  int64 
 2   date           84849 non-null  object
 3   reviewer_id    84849 non-null  int64 
 4   reviewer_name  84849 non-null  object
 5   comments       84831 non-null  object
dtypes: int64(3), object(3)
memory usage: 3.9+ MB


There are 18 missing comments, and the `date` column is stored as strings rather than as datetime objects.
As `comments` is the key feature for the sentiment analysis, any records with missing comments will be dropped.

# Data preparation

### Dropping null values in 'comments' column

In [9]:
# Copy the dataframe for cleaning so that `reviews_seattle` itself is left unmodified
df_copy = reviews_seattle.copy()

In [10]:
# Inspecting missing comments: show the rows whose `comments` value is null
df_copy[df_copy['comments'].isnull()]

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
11034,461567,11614247,2014-04-09,12120141,Abbey,
15383,9460,10563024,2014-02-24,12498029,Debra,
15914,2906694,48629316,2015-09-27,44218980,Anush,
16097,910784,9950520,2014-01-21,179481,Enrico,
27280,10695,52378,2010-06-13,105412,Wei,
29467,1018204,10024984,2014-01-26,10571694,Jordan,
30619,6079216,34824019,2015-06-12,31556342,Mack,
31414,3354614,18103248,2014-08-21,12426758,Jeff,
35210,3554558,24863045,2015-01-03,24488791,Eleanor,
37971,1790020,15640556,2014-07-13,16884291,Michael,


In [11]:
# Keep only the reviews that actually contain a comment
df_copy = df_copy.dropna(subset=['comments'])
df_copy.shape # 18 records deleted from 84849

(84831, 6)

## Remove blank records
Sometimes there are blank reviews (strings containing only whitespace) that are not recognized as NaN values. The code below finds the index of any such blank entries.

In [12]:
# Collect the index labels (not positions) of comments that are empty or
# whitespace-only. Rows have already been dropped from df_copy, so a positional
# counter from enumerate() no longer lines up with the index labels that
# df_copy.loc / df_copy.drop expect.
blanks = [
    label
    for label, comment in df_copy['comments'].items()
    if isinstance(comment, str) and not comment.strip()
]

print(len(blanks))
print(blanks)

0
[]


No blank records found!

### Converting 'date' column into datetime object

In [13]:
# Parse the 'YYYY-MM-DD' strings in `date` into datetime values
df_copy = df_copy.assign(
    date=pd.to_datetime(df_copy['date'], format='%Y-%m-%d')
)

### Breaking the date into year, month & day

In [14]:
# Append the calendar components of `date` as separate columns (year, month, day in that order)
df_copy = df_copy.assign(
    year=df_copy['date'].dt.year,
    month=df_copy['date'].dt.month,
    day=df_copy['date'].dt.day,
)

In [15]:
# Expose the cleaned frame under a descriptive name (same object as df_copy, not a new copy)
reviews_seattle_clean = df_copy

# Evaluation of the results

## Unsupervised VADER Sentiment analysis
For this project, NLTK's VADER will be used to conduct polarity analysis (positive / negative) of the reviews.

No labels are provided to evaluate the results.

### Import modules
Prerequisite: `nltk.download('vader_lexicon')`

In [16]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [17]:
# Instantiate SentimentIntensityAnalyzer; used below to score review text
sent_analyzer = SentimentIntensityAnalyzer()

In [18]:
# Score one sample review (index label 1) and print the text followed by its polarity scores
sent1 = reviews_seattle_clean.loc[1, 'comments']
print(sent1, '\n\n', sent_analyzer.polarity_scores(sent1))

Kelly has a great room in a very central location. 
Beautiful building , architecture and a style that we really like. 
We felt guite at home here and wish we had spent more time.
Went for a walk and found Seattle Center with a major food festival in progress. What a treat.
Visited the Space Needle and the Chihuly Glass exhibit. Then Pikes Place Market. WOW.  Thanks for a great stay. 

 {'neg': 0.0, 'neu': 0.609, 'pos': 0.391, 'compound': 0.9872}


According to the NLTK source code, the compound score is normalized to lie between -1 and 1 using an alpha that approximates the maximum expected value. The source can be found on GitHub [here](https://github.com/nltk/nltk/blob/develop/nltk/sentiment/vader.py).

The review is generally positive, given sentences like 'What a treat', 'WOW' and 'Thanks for a great stay', and the analyzer reflects this well with a compound score of 0.9872.