Data Source: https://www.trustpilot.com/review/www.worldremit.com

### Import necessary packages

In [184]:
import json
import numpy as np
import random
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
import ast #to convert string into dictionary
from IPython.display import clear_output
from sklearn import svm
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

### Preprocessing - Loading and cleaning the data

In [167]:

class Sentiment:
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    POSITIVE = "POSITIVE"

class Review:
    def __init__(self, text, score):
        self.text = text
        self.score = score
        self.sentiment = self.get_sentiment()
        
    def get_sentiment(self):
        if self.score <= 2:
            return Sentiment.NEGATIVE
        elif self.score == 3:
            return Sentiment.NEUTRAL
        else: #Score of 4 or 5
            return Sentiment.POSITIVE
#     def __str__(self):
#         return  
class ReviewContainer:
    def __init__(self, reviews):
        self.reviews = reviews
        
    def get_text(self):
        return [x.text for x in self.reviews]
    
    def get_sentiment(self):
        return [x.sentiment for x in self.reviews]
        
    def evenly_distribute(self):
        negative = list(filter(lambda x: x.sentiment == Sentiment.NEGATIVE, self.reviews))
        positive = list(filter(lambda x: x.sentiment == Sentiment.POSITIVE, self.reviews))
        positive_shrunk = positive[:len(negative)]
        self.reviews = negative + positive_shrunk
        random.shuffle(self.reviews)
        
        

In [168]:
#Most of the cleaning was done during the data web scraping

with open("./data/Worldremit/reviews.txt") as fp:
    for index,line in enumerate(fp):

        review = ast.literal_eval(line)
        print(review)
        break


{'socialShareUrl': 'https://www.trustpilot.com/reviews/5ed0251025e5d20a88a2057d', 'businessUnitId': '5090eace00006400051ded85', 'businessUnitDisplayName': 'WorldRemit', 'consumerId': '5ed0250fdfdf8632f9ee7ab6', 'consumerName': 'May', 'reviewId': '5ed0251025e5d20a88a2057d', 'reviewHeader': 'Wow - Great Service', 'reviewBody': 'Wow. Great Service with no issues.  Money was available same day in no time.', 'stars': 5}


In [169]:
#Most of the cleaning was done during the data web scraping

reviews = []
with open("./data/Worldremit/reviews.txt") as fp:
    for index,line in enumerate(fp):
        #print(line)
        review = ast.literal_eval(line)
        reviews.append(Review(review['reviewBody'], review['stars']))
        #review = json.loads(line)

In [170]:
len(reviews)

36456

In [171]:
from sklearn.model_selection import train_test_split

training, test = train_test_split(reviews, test_size=0.33, random_state=42)


In [172]:
train_container = ReviewContainer(training)

test_container = ReviewContainer(test)

In [173]:
train_container.evenly_distribute()
train_x = train_container.get_text()
train_y = train_container.get_sentiment()

test_container.evenly_distribute()
test_x = test_container.get_text()
test_y = test_container.get_sentiment()

print(train_y.count(Sentiment.POSITIVE))
print(train_y.count(Sentiment.NEGATIVE))
print(train_y.count(Sentiment.NEUTRAL))


1005
1005
0


## Bag of words Vectorization

Links:
https://scikit-learn.org/stable/modules/feature_extraction.html

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

Consider this two sentenses 

1) Excellent Services by the world remit team.Recomend.

2) Bad Services. Transaction delayed for three days.Don't recommend.

From above two sentenses can be writen as a bag of words as follows (no word repetition):

**Excellent Services by the world remit team recommend bad transaction delayed for three days don't**

We now tokenize this bag of words as shown below.

----
| 0 | Excellent | Sevices| by | the | world | remit | team | recommend | bad | services | transaction | delayed | for | three | days | don't |
| --- | --- | ---| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 1 | 1 | 1| 1 | 1 | 1 | 1 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 1| 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |

In [174]:
# Vectorizaation on sklearn - simple example
corpus = [
    'This is the first document.',
    'This document is the second document.',
     'And this is the third one.',
    'Is this the first document?',
 ]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(X) #this is just a matrix with position as tuple and token in that position
print(vectorizer.get_feature_names()) #all unique words in all 
print(X.toarray())#clearnly a matrix with each row as the tokenized values of each sentense

  (0, 8)	1
  (0, 3)	1
  (0, 6)	1
  (0, 2)	1
  (0, 1)	1
  (1, 8)	1
  (1, 3)	1
  (1, 6)	1
  (1, 1)	2
  (1, 5)	1
  (2, 8)	1
  (2, 3)	1
  (2, 6)	1
  (2, 0)	1
  (2, 7)	1
  (2, 4)	1
  (3, 8)	1
  (3, 3)	1
  (3, 6)	1
  (3, 2)	1
  (3, 1)	1
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [175]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# This book is great !
# This book was so bad

vectorizer = TfidfVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)

test_x_vectors = vectorizer.transform(test_x)

print(train_x[10])
print(train_x_vectors[10])
print(train_x_vectors[10].toarray())

i live to work with world remit
  (0, 2885)	0.650162112849576
  (0, 4024)	0.3395877355590502
  (0, 5402)	0.3318739534148053
  (0, 5394)	0.4957091453277229
  (0, 5362)	0.2765650412513953
  (0, 4882)	0.17208514495374555
[[0. 0. 0. ... 0. 0. 0.]]


### Fitting the different models within sklearn

In [176]:


#train SVM classifier
clf_svm = svm.SVC(kernel='linear')

clf_svm.fit(train_x_vectors, train_y)
#random prediction using SVM

i = np.random.randint(0,1000)
print(i)
print(test_x[i])
print("Actual:",test_y[i])
print("Prediction",clf_svm.predict(test_x_vectors[i]))



843
When signing up,  the birthdate  is a  hussle. One has to scroll alot.
Actual: POSITIVE
Prediction ['NEGATIVE']


In [177]:
from sklearn.tree import DecisionTreeClassifier

clf_dec = DecisionTreeClassifier()
clf_dec.fit(train_x_vectors, train_y)

i = np.random.randint(0,1000)
print(i)
print(test_x[i])
print("Actual:",test_y[i])
print("Prediction",clf_dec.predict(test_x_vectors[0]))



438
Money I sent was received in about 1 minute.  Very easy to send and very prompt service.
Actual: POSITIVE
Prediction ['POSITIVE']


In [178]:
from sklearn.naive_bayes import GaussianNB

clf_gnb = DecisionTreeClassifier()
clf_gnb.fit(train_x_vectors, train_y)

clf_gnb.predict(test_x_vectors[0])


i = np.random.randint(0,1000)
print(i)
print(test_x[i])
print("Actual:",test_y[i])
print("Prediction",clf_gnb.predict(test_x_vectors[0]))



76
Really bad service!! They charged an extra free one day later which was not included in the receipt. Do not use this company!
Actual: NEGATIVE
Prediction ['POSITIVE']


In [179]:
pred_svm = clf_gnb.predict(test_x_vectors)
confusion_matrix(test_y,pred_svm)

array([[419, 130],
       [103, 446]])

In [185]:


clf_log = LogisticRegression()
clf_log.fit(train_x_vectors, train_y)

clf_log.predict(test_x_vectors[0])





array(['POSITIVE'], dtype='<U8')

### Scoring all the models fitted

In [181]:
# Mean Accuracy
print(clf_svm.score(test_x_vectors, test_y))
print(clf_dec.score(test_x_vectors, test_y))
print(clf_gnb.score(test_x_vectors, test_y))
print(clf_log.score(test_x_vectors, test_y))

0.8897996357012751
0.773224043715847
0.7877959927140255
0.8852459016393442


In [182]:
# F1 Scores

f1_score(test_y, clf_svm.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEGATIVE])
#f1_score(test_y, clf_log.predict(test_x_vectors), average=None, labels=[Sentiment.POSITIVE, Sentiment.NEUTRAL, Sentiment.NEGATIVE])




array([0.89049774, 0.88909258])

### Tuning the model using   Grid Search

In [44]:
from sklearn.model_selection import GridSearchCV

parameters = {'kernel': ('linear', 'rbf'), 'C': (1,4,8,16,32)}

svc = svm.SVC()
clf = GridSearchCV(svc, parameters, cv=5)
clf.fit(train_x_vectors, train_y)



GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': (1, 4, 8, 16, 32), 'kernel': ('linear', 'rbf')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [46]:
print(clf.score(test_x_vectors, test_y))

0.8897996357012751


### Saving the trained models as .json file

In [47]:
import pickle

with open('./models/review_classifier.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [48]:
with open('./models/review_classifier.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [51]:
print(test_x[120])

loaded_clf.predict(test_x_vectors[120])

Fast and reliable


array(['POSITIVE'], dtype='<U8')

### Test your texts

In [194]:
test_set = ['very fun', "Really bad service!!", 'Fast']
new_test = vectorizer.transform(test_set)

clf_svm.predict(new_test)


array(['POSITIVE', 'NEGATIVE', 'POSITIVE'], dtype='<U8')