### Data Cleaning - Raw Data for Reviews and Ratings

In [1]:
import numpy as np
import pandas as pd
from langdetect import DetectorFactory
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException

# Raise pandas' display limits (rows, columns, output width) for this notebook.
for option_name, option_value in (
    ('display.max_rows', 1000),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [2]:
# Google Drive share link for the raw listings csv; the file id is the
# second-to-last path segment and is turned into a direct-download link.
url = 'https://drive.google.com/file/d/1UTYaZWaIPRl_MNdyzPOEilQf33nUQm4_/view?usp=sharing'
path = 'https://drive.google.com/uc?export=download&id='+url.split('/')[-2]

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning this
# cell appears to have emitted (NOTE(review): inferred from the leftover
# warning output below the cell -- confirm on re-run).
listings = pd.read_csv(path, low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Google Drive share link for the raw reviews csv.
url2 = 'https://drive.google.com/file/d/1icjWWfkJppQxtkhwYlwlWeWGOaKYe8QQ/view?usp=sharing'

# The Drive file id is the second-to-last path segment of the share link;
# append it to the direct-download endpoint.
reviews_file_id = url2.split('/')[-2]
path2 = 'https://drive.google.com/uc?export=download&id=' + reviews_file_id

reviews = pd.read_csv(path2)

In [4]:
#preview the first three rows of the raw reviews data
reviews.head(3)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,2595,17857,2009-11-21,50679,Jean,Notre séjour de trois nuits.\r<br/>Nous avons ...
1,2595,19176,2009-12-05,53267,Cate,Great experience.
2,2595,19760,2009-12-10,38960,Anita,I've stayed with my friend at the Midtown Cast...


In [5]:
#keep only the listing id and the review score columns

score_columns = [
    'id',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
]

# Rename with an explicit mapping: a substring replace of 'id' over all column
# names would also rewrite any other column that happens to contain "id".
# rename() returns a new frame, so later edits do not touch a slice of the
# raw listings data.
listings = listings[score_columns].rename(columns={'id': 'listing_id'})

In [6]:
#drop nulls -- dataset is large enough

# Count the nulls per column BEFORE dropping, and leave the counts as the
# cell's last expression so they are actually displayed.
listings_null_counts = listings.isna().sum()

# Reassign instead of inplace=True: no mutation of a frame that was sliced
# from another one, and the cell stays safe to re-run.
listings = listings.dropna()

listings_null_counts

In [7]:
#drop nulls -- dataset is large enough

# Count the nulls per column BEFORE dropping, and leave the counts as the
# cell's last expression so they are actually displayed.
reviews_null_counts = reviews.isna().sum()

# Drops every row with a null in any column; reassigned rather than mutated
# in place so the cell stays safe to re-run.
reviews = reviews.dropna()

reviews_null_counts

In [8]:
#merge listings and review data together

# Inner join on listing_id: listings with missing scores were dropped above,
# so a left join would bring their reviews back with NaN in every score
# column. An inner join keeps only reviews whose listing still has scores.
df = reviews.merge(listings, how='inner', on='listing_id')
df.head(3)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value
0,2595,17857,2009-11-21,50679,Jean,Notre séjour de trois nuits.\r<br/>Nous avons ...,4.7,4.72,4.62,4.76,4.79,4.86,4.41
1,2595,19176,2009-12-05,53267,Cate,Great experience.,4.7,4.72,4.62,4.76,4.79,4.86,4.41
2,2595,19760,2009-12-10,38960,Anita,I've stayed with my friend at the Midtown Cast...,4.7,4.72,4.62,4.76,4.79,4.86,4.41


In [9]:
#inspect one raw comment to see why the text looks odd: it contains '\r<br/>' html line breaks

df['comments'].iloc[12]

"Jennifer's place is cozy and a short walking distance from Time Square. The place was big enough for two. \r<br/>Jennifer left detailed instructions and was also helpful in terms of recommending great places in NY.\r<br/>\r<br/>Everything was great:)"

In [10]:
#remove html line breaks and apostrophes within comments

# Match every spelling of the break tag (<br>, <br/>, <br />, any case), with
# or without a carriage return in front, not only the exact '\r<br/>' sequence.
df['comments'] = df['comments'].replace(r'(?i)\r?<br\s*/?>', ' ', regex=True)

# Apostrophes are replaced by a space as well ("Jennifer's" -> "Jennifer s").
df['comments'] = df['comments'].replace('\'', ' ', regex=True)

In [11]:
#check the replacement worked on the same comment that was inspected before cleaning

df['comments'].iloc[12]

'Jennifer s place is cozy and a short walking distance from Time Square. The place was big enough for two.  Jennifer left detailed instructions and was also helpful in terms of recommending great places in NY.  Everything was great:)'

In [12]:
#check the first comment as well (a French review whose raw text contained html breaks)
df['comments'].iloc[0]

'Notre séjour de trois nuits. Nous avons apprécier L appartement qui est très bien situé. Agréable, propre et bien soigné. C est idéal pour une famille de 3 ou 4 personnes. Petits soucis en arrivant il y avait personne pour nous recevoir, et il manquait le savon pour la douche, le liquide vaisselle, nous les avons reçu de surlendemain. Il y a aussi le bruit du Métro de NY, donc une première nuit difficile si on est pas habitué. Jennifer est correcte le remboursement de la caution était très rapide.  A part ces petits détails notre court séjour c est bien passé.  Si j ai la possibilité de revenir sur NY pour les vacances, je reprendrai à "The Midtown Castle" Jean Possession - Ile de La Réunion '

In [13]:
#detect language of comments
#used stackoverflow code to fix errors when detecting languages:
#https://stackoverflow.com/questions/63573625/langdetectexception-how-to-fix-it-by-adding-all-rows-not-detected-as-other

# langdetect is non-deterministic by default; fixing the seed before the first
# detection makes a re-run label every comment the same way.
DetectorFactory.seed = 0


def detect_lang(x):
    """Return the language code langdetect assigns to ``x``, or 'Other'.

    Parameters
    ----------
    x : str
        Comment text to classify.

    Returns
    -------
    str
        The code returned by ``detect`` (e.g. 'en', 'fr'), or 'Other' when
        ``x`` is not a string or langdetect raises ``LangDetectException``
        for it.
    """
    # Non-text values (e.g. NaN) cannot be classified; keep the best-effort
    # fallback instead of letting detect() fail on them.
    if not isinstance(x, str):
        return 'Other'
    try:
        return detect(x)
    except LangDetectException:
        # Only langdetect's own failure is swallowed; a bare except would
        # also hide KeyboardInterrupt and genuine programming errors.
        return 'Other'


df['language'] = df['comments'].apply(detect_lang)

In [14]:
#preview the rows with the new language column
#NOTE: short comments can be misdetected -- in the output below 'Great experience.' comes back as 'ro'
df.head(3)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,language
0,2595,17857,2009-11-21,50679,Jean,Notre séjour de trois nuits. Nous avons appréc...,4.7,4.72,4.62,4.76,4.79,4.86,4.41,fr
1,2595,19176,2009-12-05,53267,Cate,Great experience.,4.7,4.72,4.62,4.76,4.79,4.86,4.41,ro
2,2595,19760,2009-12-10,38960,Anita,I ve stayed with my friend at the Midtown Cast...,4.7,4.72,4.62,4.76,4.79,4.86,4.41,en


In [15]:
# Count whitespace-separated tokens. Counting spaces and adding one
# over-counts whenever a comment has double, leading or trailing spaces,
# which the html/apostrophe cleanup above leaves behind.
df['word_count'] = df['comments'].str.split().str.len()

In [16]:
#write the cleaned reviews + scores table to disk, without the index column

output_path = '../data/clean_review_and_scores.csv'
df.to_csv(output_path, index=False)