In [2]:
import sys
import nltk
import sklearn
import pandas
import numpy

import pandas as pd
import numpy as np

# load the dataset of Reviews
df = pd.read_csv('deceptive-opinion.csv',sep=",")

# check class distribution
classes = df.loc[:,"deceptive"]

from sklearn.preprocessing import LabelEncoder

# convert class labels to binary values 
encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

text_messages = df.loc[:,"text"]

# Replace email addresses with 'email'
processed = text_messages.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$',
                                 'emailaddress')

# Replace URLs with 'webaddress'
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

# Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
processed = processed.str.replace(r'£|\$', 'moneysymb')
    
# Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumbr')
    
# Replace numbers with 'numbr'
processed = processed.str.replace(r'\d+(\.\d+)?', 'numbr')

# Remove punctuation
processed = text_messages.str.replace(r'[^\w\d\s]', ' ')

# Replace whitespace between terms with a single space
processed = processed.str.replace(r'\s+', ' ')

# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '')

processed = processed.str.lower()

from nltk.corpus import stopwords

# remove stop words from review text

stop_words = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stop_words))

# Remove word stems using a Porter stemmer
ps = nltk.PorterStemmer()

processed = processed.apply(lambda x: ' '.join(
    ps.stem(term) for term in x.split()))


# create bag-of-words
from nltk.tokenize import word_tokenize

all_words = []
for message in processed:
    words = word_tokenize(message)
    for w in words:
        all_words.append(w)

all_words = nltk.FreqDist(all_words)
format(all_words.most_common(15))
word_features = list(all_words.keys())[:1500]

def find_features(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)

    return features

# Now lets do it for all the reviews
messages = list(zip(processed, Y))


# call find_features function for each review
featuresets = [(find_features(text), label) for (text, label) in messages]

# we can split the featuresets into training and testing datasets using sklearn
from sklearn import model_selection

# split the data into training and testing datasets
training, testing = model_selection.train_test_split(featuresets, test_size = 0.25, random_state=42)

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Define models to train
names = ["Naive Bayes","Logistic Regression","SVM Linear"
        ]

classifiers = [
        MultinomialNB(),LogisticRegression(),SVC(kernel = 'linear')
]

models = zip(names, classifiers)

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, testing)*100
    print("{} Accuracy: {}".format(name, accuracy))
    
# Ensemble methods - Voting classifier
from sklearn.ensemble import VotingClassifier

names = ["Naive Bayes","Logistic Regression", "SVM Linear"
        ]

classifiers = [ MultinomialNB(),LogisticRegression(),SVC(kernel = 'linear')
              ]

models = list(zip(names, classifiers))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators = models, voting = 'hard', n_jobs = -1))
nltk_ensemble.train(training)
accuracy = nltk.classify.accuracy(nltk_model, testing)*100
print("Voting Classifier: Accuracy: {}".format(accuracy))

Naive Bayes Accuracy: 89.0




Logistic Regression Accuracy: 87.0
SVM Linear Accuracy: 84.25
Voting Classifier: Accuracy: 84.25


In [3]:
# make class label prediction for testing set
txt_features, labels = zip(*testing)

prediction = nltk_ensemble.classify_many(txt_features)

# print a confusion matrix and a classification report
print(classification_report(labels, prediction))

pd.DataFrame(
    confusion_matrix(labels, prediction),
    index = [['actual', 'actual'], ['truthful', 'deceptive']],
    columns = [['predicted', 'predicted'], ['truthful', 'deceptive']])

              precision    recall  f1-score   support

           0       0.91      0.85      0.88       206
           1       0.85      0.91      0.88       194

   micro avg       0.88      0.88      0.88       400
   macro avg       0.88      0.88      0.88       400
weighted avg       0.88      0.88      0.88       400



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,truthful,deceptive
actual,truthful,175,31
actual,deceptive,18,176


In [8]:
print(text_messages[850])
print(processed[850])
test = find_features(processed[850])
prediction = nltk_ensemble.classify(test)
print(prediction)
print(Y[850])

If I didn't have to stay here, I wouldn't. Without exception, I swear I am not lying, every day of my stay brings a new problem. Over the past 3 months, I have stayed here 6 times for 3 nights at a time; problems every time. From the wrong food with room service, to room keys that dont work, unrefreshed bath towels, forgotten and dirty in-room coffee service, unclean ice bucket overnight, forgotten room cleaning, inability to connect to the Internet....I'm too tired to keep writing all the problems. For the price of this chi-chi downtown hotel, I expect more. Finally, their team is certainly not trained in customer service. The front desk teams have attitudes, and Aria's hostesses think they are something else. I'm stuck using this hotel since it is adjoined to Aon, but I would never go back if I had a choice. The only good thing is the food at Aria - outstanding.

stay without except swear lie everi day stay bring new problem past 3 month stay 6 time 3 night time problem everi time wr