In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("London.csv", encoding= 'unicode_escape')
df.head()

Unnamed: 0,Property Name,Review Rating,Review Title,Review Text,Location Of The Reviewer,Date Of Review
0,Apex London Wall Hotel,5.0,Ottima qualità prezzo,Siamo stati a Londra per un week end ed abbiam...,"Casale Monferrato, Italy",10/20/2012
1,Corinthia Hotel London,5.0,"By far, my best hotel in the world",I had a pleasure of staying in this hotel for ...,"Savannah, Georgia",3/23/2016
2,The Savoy,5.0,First visit to the American Bar at the Savoy,A very lovely first visit to this iconic hotel...,London,7/30/2013
3,Rhodes Hotel,4.0,Nice stay,3 of us stayed at the Rhodes Hotel for 4 night...,"Maui, Hawaii",06/02/2012
4,The Savoy,5.0,Perfection,Form the moment we arrived until we left we ex...,"London, United Kingdom",11/24/2017


In [4]:
df.shape

(27331, 6)

In [5]:
# Firtly check for nan nalues
df.isnull().values.any()

True

In [63]:
# After drop them
df = df.dropna(how='any')
df.shape
df.to_csv("New_London")

# I lost many values but I think it will be a good sample to make predictions

In [7]:
df.columns

Index(['Property Name', 'Review Rating', 'Review Title', 'Review Text',
       'Location Of The Reviewer', 'Date Of Review'],
      dtype='object')

In [23]:
class Sentiment:
    positive = "Positive"
    negative = "Negative"
    neural = "Neural"
    


class Review:
    def __init__(self, text, rating):
        self.text = text
        self.rating = rating
        self.sentiment = self.get_rating()
        
    def get_rating(self):
        if self.rating > 4:
            return Sentiment.positive
        elif self.rating >= 3 and self.rating <= 4 :
            return Sentiment.neural
        else:
            return Sentiment.negative
        

In [24]:
reviews = []
for index,item in df.iterrows():
    reviews.append(Review(item["Review Text"], item["Review Rating"]))
    
# Chech some data
reviews[0].rating
#reviews[0].text      
    

5.0

In [25]:
# Split the data to train and test the model
from sklearn.model_selection import train_test_split
training, test = train_test_split(reviews, test_size=0.33, random_state=42)

In [26]:
len(training)

15662

In [27]:
len(test)

7715

In [28]:
# We want to pass text and predict if the text is positive, negative or neural

train_x = [x.text for x in training]
train_y = [x.sentiment for x in training]

test_x = [x.text for x in test]
test_y = [x.sentiment for x in test]

In [32]:
# I want to conver the text to numeric type
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# fit and transform 

train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

print(train_x_vectors[0].toarray())

[[0 0 0 ... 0 0 0]]


In [33]:
# check for the best model 
## Classification using SVM

from sklearn import svm

clf_svm = svm.SVC(kernel="linear")

clf_svm.fit(train_x_vectors, train_y)

test_x[0]
#text_x_vectors[0]

# we can predict if this text is positive or negative

clf_svm.predict(test_x_vectors[0])

array(['Neural'], dtype='<U8')

In [17]:
## desicion tree

from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)
clf_dec.predict(test_x_vectors[0])


array(['Positive'], dtype='<U8')

In [18]:
## Logistic regression

from sklearn.linear_model import LogisticRegression

clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)
clf_log.predict(test_x_vectors[0])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


array(['Positive'], dtype='<U8')

In [19]:
# Evaluation
## Mean Accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.901101749837978
0.861049902786779
0.9113415424497732


In [20]:
# F1 Scores to compare the classifiers
from sklearn.metrics import f1_score

In [34]:
print(f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=(Sentiment.positive, Sentiment.neural, Sentiment.negative)))
print(f1_score(test_y, clf_dec.predict(test_x_vectors), average=None, labels=(Sentiment.positive, Sentiment.neural, Sentiment.negative)))
print(f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=(Sentiment.positive, Sentiment.neural, Sentiment.negative)))


[0.83611459 0.56583794 0.56467316]
[0.8077807  0.12324493 0.29912023]
[0.83412322 0.17250886 0.56355283]


In [54]:
# The amount of the data that examined in the tree categories 
print(test_y.count(Sentiment.positive))
print(test_y.count(Sentiment.neural))
print(test_y.count(Sentiment.negative))

5138
2220
357


In [62]:
# Quality analysis
 
test_set = ["I really enjoy this hotel", "Bad unhappy the worst hotel room", "It was a very bad hotel", "I don not recommend this place"]
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)

array(['Positive', 'Negative', 'Positive', 'Positive'], dtype='<U8')

In [None]:
# I have to change my model 

In [61]:
test_set = ["I really enjoy this hotel", "It the worst hotel room", "It was a very bad hotel", "I don not recommend this place"]
new_test = vectorizer.transform(test_set)

clf_dec.predict(new_test)

array(['Positive', 'Positive', 'Positive', 'Positive'], dtype='<U8')