## Data Cleaning - Reviews

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text

from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
df = pd.read_csv('./data/clean_review_and_scores2.csv')

In [3]:
df.head(3)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,language,word_count
0,2595,19760.0,2009-12-10,38960.0,Anita,I ve stayed with my friend at the Midtown Cast...,4.7,4.72,4.62,4.76,4.79,4.86,4.41,en,93.0
1,2595,34320.0,2010-04-09,71130.0,Kai-Uwe,"We ve been staying here for about 9 nights, en...",4.7,4.72,4.62,4.76,4.79,4.86,4.41,en,67.0
2,2595,46312.0,2010-05-25,117113.0,Alicia,We had a wonderful stay at Jennifer s charming...,4.7,4.72,4.62,4.76,4.79,4.86,4.41,en,25.0


In [4]:
#remove puncutation from comments

df['comments'] = df['comments'].str.replace(r'[^\w\s]', '', regex=True)

In [5]:
#change comments to lowercase

df['comments'] = df['comments'].str.lower()

In [6]:
df.head(3)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,language,word_count
0,2595,19760.0,2009-12-10,38960.0,Anita,i ve stayed with my friend at the midtown cast...,4.7,4.72,4.62,4.76,4.79,4.86,4.41,en,93.0
1,2595,34320.0,2010-04-09,71130.0,Kai-Uwe,we ve been staying here for about 9 nights enj...,4.7,4.72,4.62,4.76,4.79,4.86,4.41,en,67.0
2,2595,46312.0,2010-05-25,117113.0,Alicia,we had a wonderful stay at jennifer s charming...,4.7,4.72,4.62,4.76,4.79,4.86,4.41,en,25.0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 602394 entries, 0 to 602393
Data columns (total 15 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   listing_id                   602394 non-null  int64  
 1   id                           602394 non-null  float64
 2   date                         602394 non-null  object 
 3   reviewer_id                  602394 non-null  float64
 4   reviewer_name                602394 non-null  object 
 5   comments                     602394 non-null  object 
 6   review_scores_rating         602394 non-null  float64
 7   review_scores_accuracy       602394 non-null  float64
 8   review_scores_cleanliness    602394 non-null  float64
 9   review_scores_checkin        602394 non-null  float64
 10  review_scores_communication  602394 non-null  float64
 11  review_scores_location       602394 non-null  float64
 12  review_scores_value          602394 non-null  float64
 13 

In [8]:
df.isna().sum()

listing_id                     0
id                             0
date                           0
reviewer_id                    0
reviewer_name                  0
comments                       0
review_scores_rating           0
review_scores_accuracy         0
review_scores_cleanliness      0
review_scores_checkin          0
review_scores_communication    0
review_scores_location         0
review_scores_value            0
language                       0
word_count                     0
dtype: int64

In [9]:
#split data into separate categories

zero = df[(df['review_scores_rating'] >= 0) & (df['review_scores_rating'] <1)]
one = df[(df['review_scores_rating'] >= 1) & (df['review_scores_rating'] <2)]
two = df[(df['review_scores_rating'] >= 2) & (df['review_scores_rating'] <3)]
three = df[(df['review_scores_rating'] >= 3) & (df['review_scores_rating'] <4)]
four = df[(df['review_scores_rating'] >= 4) & (df['review_scores_rating'] <5)]
five = df[df['review_scores_rating'] == 5]

In [10]:
#check counts of reviews 

print(len(zero))
print(len(one))
print(len(two))
print(len(three))
print(len(four))
print(len(five))

4
153
181
2391
568756
30909


In [35]:
#randomly sampled 10% of original dataset
zerosampled = zero.sample(frac=0.02)
onesampled = one.sample(frac=0.02)
twosampled = two.sample(frac=0.02)
threesampled = three.sample(frac=0.02)
foursampled = four.sample(frac=0.02)
fivesampled = five.sample(frac=0.02)
zerototwo = df[(df['review_scores_rating'] >= 0) & (df['review_scores_rating'] <3)].sample(frac=0.02)

In [36]:
print(len(zerosampled))
print(len(onesampled))
print(len(threesampled))
print(len(foursampled))
print(len(fivesampled))
print(len(zerototwo))

0
3
48
11375
618
7


* for ratings that are between 0 and less than 4, there are significantly less observations
* attempt to combine for vectorizing to analyze in groups. 3 will be middle, 0-2 will be its own category

In [28]:
#export to csv to run in colab -- kernel dies when attempting to run

zerototwo = df[(df['review_scores_rating'] >= 0) & (df['review_scores_rating'] <3)].sample(frac=0.1)

df.to_csv('./data/all_reviews_for_vectorizing.csv', index=False)
zerototwo.to_csv('./data/0to2stars_for_vectorizing.csv', index=False)
threesampled.to_csv('./data/3stars_for_vectorizing.csv', index=False)
foursampled.to_csv('./data/4stars_for_vectorizing.csv', index=False)
fivesampled.to_csv('./data/5stars_for_vectorizing.csv', index=False)

In [14]:
#colab link: https://colab.research.google.com/drive/1MDlUv_tvVwNCOQc3SGnPu6k_22pq98k6?usp=sharing