In [41]:
import numpy as np
import pandas as pd

from textblob import TextBlob, Blobber
from textblob.sentiments import NaiveBayesAnalyzer
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import re
import json

In [42]:
nltk.download('punkt')
nltk.download('movie_reviews')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\k.osadchenko\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\k.osadchenko\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


True

In [43]:
data_train = pd.read_csv('../input/sf-booking/hotels_train.csv')
data_train.head(3)

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,reviewer_score,tags,days_since_review,lat,lng
0,Stratton Street Mayfair Westminster Borough Lo...,581,2/19/2016,8.4,The May Fair Hotel,United Kingdom,Leaving,3,1994,Staff were amazing,4,7,10.0,"[' Leisure trip ', ' Couple ', ' Studio Suite ...",531 day,51.507894,-0.143671
1,130 134 Southampton Row Camden London WC1B 5AF...,299,1/12/2017,8.3,Mercure London Bloomsbury Hotel,United Kingdom,poor breakfast,3,1361,location,2,14,6.3,"[' Business trip ', ' Couple ', ' Standard Dou...",203 day,51.521009,-0.123097
2,151 bis Rue de Rennes 6th arr 75006 Paris France,32,10/18/2016,8.9,Legend Saint Germain by Elegancia,China,No kettle in room,6,406,No Positive,0,14,7.5,"[' Leisure trip ', ' Solo traveler ', ' Modern...",289 day,48.845377,2.325643


In [44]:
data_test = pd.read_csv('../input/sf-booking/hotels_test.csv')
data_test.head(3)

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,review_total_positive_word_counts,total_number_of_reviews_reviewer_has_given,tags,days_since_review,lat,lng
0,Via Senigallia 6 20161 Milan Italy,904,7/21/2017,8.1,Hotel Da Vinci,United Kingdom,Would have appreciated a shop in the hotel th...,52,16670,Hotel was great clean friendly staff free bre...,62,1,"[' Leisure trip ', ' Couple ', ' Double Room '...",13 days,45.533137,9.171102
1,Arlandaweg 10 Westpoort 1043 EW Amsterdam Neth...,612,12/12/2016,8.6,Urban Lodge Hotel,Belgium,No tissue paper box was present at the room,10,5018,No Positive,0,7,"[' Leisure trip ', ' Group ', ' Triple Room ',...",234 day,52.385649,4.834443
2,Mallorca 251 Eixample 08008 Barcelona Spain,46,11/26/2015,8.3,Alexandra Barcelona A DoubleTree by Hilton,Sweden,Pillows,3,351,Nice welcoming and service,5,15,"[' Business trip ', ' Solo traveler ', ' Twin ...",616 day,41.393192,2.16152


In [45]:
df_train = data_train.copy()
df_test = data_test.copy()

In [46]:
df = pd.concat([df_train, df_test], axis=0)

In [47]:
tb_positive_review_dict = {}
tb_negative_review_dict = {}

In [48]:
tb = Blobber(analyzer=NaiveBayesAnalyzer())

In [49]:
def pos_review_preprocess(msg):
    global tb_positive_review_dict
    
    msg_cleared = msg.strip()
    if msg_cleared:
        result = {
            'polarity': 0.0,
            'subjectivity': 0.0,
            'classification': 0.0,
            'p_pos': 0.5,
            'p_neg': 0.5
        }
        blob = TextBlob(msg_cleared)
        result['polarity'] = blob.sentiment_assessments.polarity
        result['subjectivity'] = blob.sentiment_assessments.subjectivity
        
        blobber_sentiment = tb(msg_cleared).sentiment
        result['classification'] = -1 if blobber_sentiment.classification == 'neg' else 1 if blobber_sentiment.classification == 'pos' else 0
        result['p_pos'] = blobber_sentiment.p_pos
        result['p_neg'] = blobber_sentiment.p_neg
        
        tb_positive_review_dict[msg] = result
        
def neg_review_preprocess(msg):
    global tb_negative_review_dict
    
    msg_cleared = msg.strip()
    if msg_cleared:
        result = {
            'polarity': 0.0,
            'subjectivity': 0.0,
            'classification': 0.0,
            'p_pos': 0.5,
            'p_neg': 0.5
        }
        blob = TextBlob(msg_cleared)
        result['polarity'] = blob.sentiment_assessments.polarity
        result['subjectivity'] = blob.sentiment_assessments.subjectivity
        
        blobber_sentiment = tb(msg_cleared).sentiment
        result['classification'] = -1 if blobber_sentiment.classification == 'neg' else 1 if blobber_sentiment.classification == 'pos' else 0
        result['p_pos'] = blobber_sentiment.p_pos
        result['p_neg'] = blobber_sentiment.p_neg

        tb_negative_review_dict[msg] = result

In [50]:
df['positive_review'].apply(pos_review_preprocess)
df['negative_review'].apply(neg_review_preprocess)

0         None
1         None
2         None
3         None
4         None
          ... 
128930    None
128931    None
128932    None
128933    None
128934    None
Name: negative_review, Length: 515738, dtype: object

In [53]:
json_object = json.dumps(tb_negative_review_dict, indent=4)
 
with open("../input/sfbookingtbsentimentcache/textblob_negative_review_dict.json", "w") as outfile:
    outfile.write(json_object)

json_object = json.dumps(tb_positive_review_dict, indent=4)
 
with open("../input/sfbookingtbsentimentcache/textblob_positive_review_dict.json", "w") as outfile:
    outfile.write(json_object)

In [55]:
with open('../input/sfbookingtbsentimentcache/textblob_negative_review_dict.json', 'r') as in_file:
    tb_negative_review_dict = json.load(in_file)
    
with open('../input/sfbookingtbsentimentcache/textblob_positive_review_dict.json', 'r') as in_file:
    tb_positive_review_dict = json.load(in_file)

In [56]:
df['positive_review_polarity'] = 0.0
df['positive_review_subjectivity'] = 0.0
df['positive_review_classification'] = 0.0
df['positive_review_p_pos'] = 0.5
df['positive_review_p_neg'] = 0.5

df['negative_review_polarity'] = 0.0
df['negative_review_subjectivity'] = 0.0
df['negative_review_classification'] = 0.0
df['negative_review_p_pos'] = 0.5
df['negative_review_p_neg'] = 0.5

In [57]:
df['positive_review_polarity'] = df['positive_review'].apply(lambda x: 0.0 if (len(x.strip()) == 0) or (tb_positive_review_dict[x] is None) else tb_positive_review_dict[x]['polarity'])
df['positive_review_subjectivity'] = df['positive_review'].apply(lambda x: 0.0 if (len(x.strip()) == 0) or (tb_positive_review_dict[x] is None) else tb_positive_review_dict[x]['subjectivity'])
df['positive_review_classification'] = df['positive_review'].apply(lambda x: 0.0 if (len(x.strip()) == 0) or (tb_positive_review_dict[x] is None) else tb_positive_review_dict[x]['classification'])
df['positive_review_p_pos'] = df['positive_review'].apply(lambda x: 0.5 if (len(x.strip()) == 0) or (tb_positive_review_dict[x] is None) else tb_positive_review_dict[x]['p_pos'])
df['positive_review_p_neg'] = df['positive_review'].apply(lambda x: 0.5 if (len(x.strip()) == 0) or (tb_positive_review_dict[x] is None) else tb_positive_review_dict[x]['p_neg'])

df['negative_review_polarity'] = df['negative_review'].apply(lambda x: 0.0 if (len(x.strip()) == 0) or (tb_negative_review_dict[x] is None) else tb_negative_review_dict[x]['polarity'])
df['negative_review_subjectivity'] = df['negative_review'].apply(lambda x: 0.0 if (len(x.strip()) == 0) or (tb_negative_review_dict[x] is None) else tb_negative_review_dict[x]['subjectivity'])
df['negative_review_classification'] = df['negative_review'].apply(lambda x: 0.0 if (len(x.strip()) == 0) or (tb_negative_review_dict[x] is None) else tb_negative_review_dict[x]['classification'])
df['negative_review_p_pos'] = df['negative_review'].apply(lambda x: 0.5 if (len(x.strip()) == 0) or (tb_negative_review_dict[x] is None) else tb_negative_review_dict[x]['p_pos'])
df['negative_review_p_neg'] = df['negative_review'].apply(lambda x: 0.5 if (len(x.strip()) == 0) or (tb_negative_review_dict[x] is None) else tb_negative_review_dict[x]['p_neg'])

In [58]:
df.head()

Unnamed: 0,hotel_address,additional_number_of_scoring,review_date,average_score,hotel_name,reviewer_nationality,negative_review,review_total_negative_word_counts,total_number_of_reviews,positive_review,...,positive_review_polarity,positive_review_subjectivity,positive_review_classification,positive_review_p_pos,positive_review_p_neg,negative_review_polarity,negative_review_subjectivity,negative_review_classification,negative_review_p_pos,negative_review_p_neg
0,Stratton Street Mayfair Westminster Borough Lo...,581,2/19/2016,8.4,The May Fair Hotel,United Kingdom,Leaving,3,1994,Staff were amazing,...,0.6,0.9,1.0,0.747973,0.252027,0.0,0.0,1.0,0.566667,0.433333
1,130 134 Southampton Row Camden London WC1B 5AF...,299,1/12/2017,8.3,Mercure London Bloomsbury Hotel,United Kingdom,poor breakfast,3,1361,location,...,0.0,0.0,-1.0,0.427083,0.572917,-0.4,0.6,-1.0,0.295764,0.704236
2,151 bis Rue de Rennes 6th arr 75006 Paris France,32,10/18/2016,8.9,Legend Saint Germain by Elegancia,China,No kettle in room,6,406,No Positive,...,-0.113636,0.545455,-1.0,0.452381,0.547619,0.0,0.0,1.0,0.760417,0.239583
3,216 Avenue Jean Jaures 19th arr 75019 Paris Fr...,34,9/22/2015,7.5,Mercure Paris 19 Philharmonie La Villette,United Kingdom,No Negative,0,607,Friendly staff quiet comfortable room spotles...,...,0.428333,0.666667,1.0,0.954507,0.045493,0.15,0.4,1.0,0.6125,0.3875
4,Molenwerf 1 1014 AG Amsterdam Netherlands,914,3/5/2016,8.5,Golden Tulip Amsterdam West,Poland,Torn sheets,4,7586,The staff was very friendly and helpful Break...,...,0.2775,0.596667,1.0,0.995193,0.004807,0.0,0.0,-1.0,0.457006,0.542994
