In [1]:
import textacy
import spacy
nlp = spacy.load('en')

In [2]:
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from collections import Counter
from nltk.stem import PorterStemmer
from nltk import word_tokenize
import os
import random
import codecs

random.seed(42)

stemmer = PorterStemmer()

import numpy as np

np.random.seed(42)

def train(classifier, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

    classifier.fit(X_train, y_train)
    print("Accuracy: %s" % classifier.score(X_test, y_test))
    return classifier

### Download dataset from
http://myleott.com/op-spam.html

and uncompress it in some folder

In [3]:
base_path = "/Users/aliosha/Downloads/op_spam_v1.4"

In [4]:
def read_review(filename):
    with open(filename, "rb") as fp:
        return fp.read()

In [5]:
real_reviews = []
fake_reviews = []

for dir_name, subdir_list, file_list in os.walk(base_path):
    if "deceptive" in dir_name:
        for fname in file_list:
            if ".txt" in fname:
                fake_reviews.append(read_review(os.path.join(dir_name,fname)))
    if "truthful" in dir_name:
        for fname in file_list:
            if ".txt" in fname:
                real_reviews.append(read_review(os.path.join(dir_name,fname)))
          
print(f"{len(fake_reviews)} fake reviews, {len(real_reviews)} real reviews")

800 fake reviews, 800 real reviews


In [7]:
indexes = list(range(len(fake_reviews)+len(real_reviews)))
random.shuffle(indexes)
indexes[:10]

[856, 549, 970, 111, 736, 517, 586, 1326, 808, 491]

In [8]:
real_reviews[0]

b'My $200 Gucci sunglasses were stolen out of my bag on the 16th. I filed a report with the hotel security and am anxious to hear back from them. This was such a disappointment, as we liked the hotel and were having a great time in Chicago. Our room was really nice, with 2 bathrooms. We had 2 double beds and a comfortable hideaway bed. We had a great view of the lake and park. The hotel charged us $25 to check in early (10am).\n'

In [9]:
data = []
target = []
all_reviews = fake_reviews + real_reviews
for i in indexes:
    data.append(all_reviews[i])
    if i <= len(fake_reviews):
        target.append(0)
    else:
        target.append(1)
    

In [15]:
target[:10]

[1, 0, 1, 0, 0, 0, 0, 1, 1, 0]

In [26]:
%%time

from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
 
trial1 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', BernoulliNB()),
])
 
train(trial1, data, target)
# Accuracy: 0.8463497453310697


Accuracy: 0.8675
CPU times: user 321 ms, sys: 10.5 ms, total: 331 ms
Wall time: 353 ms


In [20]:
%%time

from sklearn.svm import SVC

C = 1.0  # SVM regularization parameter

trial_svc = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', SVC(kernel='linear', C=C)),
])
 
train(trial_svc, data, target)
# Accuracy: 0.8463497453310697


Accuracy: 0.86
CPU times: user 1.51 s, sys: 19 ms, total: 1.53 s
Wall time: 1.56 s


In [28]:
from nltk.corpus import stopwords


trial2 = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('classifier', MultinomialNB()),
])
 
train(trial2, data, target)
# Accuracy: 0.88


Accuracy: 0.88


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...      vocabulary=None)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [25]:
trial3 = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('classifier', BernoulliNB()),
])
 
train(trial3, data, target)
# Accuracy: 0.88


Accuracy: 0.865


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...lary=None)), ('classifier', BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))])

In [26]:
from nltk.corpus import stopwords


trial2 = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('classifier', MultinomialNB(alpha=0.05)),
])
 
train(trial2, data, target)
# Accuracy: 0.875


Accuracy: 0.875


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...     vocabulary=None)), ('classifier', MultinomialNB(alpha=0.05, class_prior=None, fit_prior=True))])

In [29]:
trial3 = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('classifier', MultinomialNB(alpha=2)),
])
 
train(trial3, data, target)


Accuracy: 0.8775


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...        vocabulary=None)), ('classifier', MultinomialNB(alpha=2, class_prior=None, fit_prior=True))])

In [31]:
cleaned_data = [textacy.preprocess_text(str(text), no_emails=True, no_punct=True, no_urls=True, lowercase=True) for text in data]

In [33]:
trial_c = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('classifier', MultinomialNB(alpha=1)),
])

train(trial_c, cleaned_data, target)

Accuracy: 0.875


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...        vocabulary=None)), ('classifier', MultinomialNB(alpha=1, class_prior=None, fit_prior=True))])

In [34]:
from sklearn.linear_model import SGDClassifier

trial_l = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words=stopwords.words('english'))),
    ('classifier', SGDClassifier()),
])

train(trial_c, data, target)

Accuracy: 0.88


Pipeline(memory=None,
     steps=[('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=Tr...        vocabulary=None)), ('classifier', MultinomialNB(alpha=1, class_prior=None, fit_prior=True))])

In [36]:
trial1.predict([data[0]])

array([1])

In [37]:
trial_c.predict([data[0]])

array([1])

In [35]:
data[0]

b'This was a refreshing change from the ordinary. I loved the location, the service and the amenities offered by this hotel. The room was charming with a window seat and a water view. The decor was unique but cheerful. Free wireless internet services were a plus here. The staff was helpful and attentive. We loved having goldfish share our room! I would definitely stay here again.\n'

In [38]:
target[0]

1

In [45]:
from sklearn.metrics import classification_report

In [41]:
%%time 

results_c = trial_c.predict(data)

CPU times: user 339 ms, sys: 6.73 ms, total: 345 ms
Wall time: 392 ms


In [49]:
print(classification_report(target, results_c))

             precision    recall  f1-score   support

          0       0.93      0.97      0.95       801
          1       0.97      0.92      0.95       799

avg / total       0.95      0.95      0.95      1600



In [57]:
for i in range(10):
    print(f"review {i}")
    print(data[i])

print("true or false?")
print("...")
for i in range(10):
    if target[i]:
        print(f"review {i}", True)
    else:
        print(f"review {i}", False)

review 0
b'This was a refreshing change from the ordinary. I loved the location, the service and the amenities offered by this hotel. The room was charming with a window seat and a water view. The decor was unique but cheerful. Free wireless internet services were a plus here. The staff was helpful and attentive. We loved having goldfish share our room! I would definitely stay here again.\n'
review 1
b"This hotel was an absolute dream to stay in. I couldn't have picked a better hotel to spend in such luxury. I enjoyed the set up of the rooms and the space they offered.The room decor was extremely elegant and the location was great as well. I honestly felt like I was at a spa! The price was very reasonable as well. I would definetly recommend this hotel to everyone I know. If you ever get a chance to spend a night here definetly choose the suite rooms!!!\n"
review 2
b"The Conrad has all the right stuff - great bedding, fancy decorations, beautiful lobby, etc. However, they are seriously

In [58]:
misclassified =  []
for i, item in enumerate(target):
    if item != results_c[i]:
        misclassified.append((data[i], item))
len(misclassified)

84

In [59]:
misclassified[:10]

[(b"A friend highly recommended this hotel and we couldn't have been happier! It was wonderful ~ my husband and kids were already planning our next trip there before we had even left the hotel. The kids loved the goldfish in the room and thought the window seats were the best. \n",
  1),
 (b"This hotel was full of drunks. The lobby was full of crowds at night and the bar was way too busy. There were people stumbling all over the hotel at all hours of the night, yelling in the hallways, making lots of noise. The staff had little to say. I wasn't aware of the noise level when I booked the room and when I threatened to make other reservations, the people in the lobby had nothing to say. We asked for a water view and got a city view. Again, the people at the front desk weren't accomodating at all. Overall, we had a horrible experience.\n",
  0),
 (b"I just got back from the Monaco in Chicago! I was very pleasantly surprised, as my husband booked this trip as a last minute getaway, and he u

In [68]:
%%time

import string 

def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]
 
trial_stem = Pipeline([
    ('vectorizer', TfidfVectorizer(tokenizer=stemming_tokenizer,
                             stop_words=stopwords.words('english') + list(string.punctuation))),
    ('classifier', MultinomialNB(alpha=1)),
])

train(trial_stem, data, target)


Accuracy: 0.8675
CPU times: user 10.6 s, sys: 103 ms, total: 10.7 s
Wall time: 12.1 s


In [15]:
def extract_words(text):
    lemmas = [word.lemma_ for word in nlp(text) if not word.is_stop or word.]
    return lemmas
    
fake_words = []

In [12]:
text= data[0]

In [13]:
text

b'This was a refreshing change from the ordinary. I loved the location, the service and the amenities offered by this hotel. The room was charming with a window seat and a water view. The decor was unique but cheerful. Free wireless internet services were a plus here. The staff was helpful and attentive. We loved having goldfish share our room! I would definitely stay here again.\n'

In [16]:
extract_words(str(text))

["b'this",
 'refreshing',
 'change',
 'ordinary',
 '.',
 '-PRON-',
 'love',
 'location',
 ',',
 'service',
 'amenity',
 'offer',
 'hotel',
 '.',
 'the',
 'room',
 'charming',
 'window',
 'seat',
 'water',
 'view',
 '.',
 'the',
 'decor',
 'unique',
 'cheerful',
 '.',
 'free',
 'wireless',
 'internet',
 'service',
 'plus',
 '.',
 'the',
 'staff',
 'helpful',
 'attentive',
 '.',
 '-PRON-',
 'love',
 'have',
 'goldfish',
 'share',
 'room',
 '!',
 '-PRON-',
 'definitely',
 'stay',
 'again.\\n',
 "'"]

In [42]:
cleaned_fake = [textacy.preprocess_text(str(text), no_emails=True, no_punct=True, no_urls=True, lowercase=True, no_numbers=True) for text in fake_reviews]


In [43]:
cleaned_real = [textacy.preprocess_text(str(text), no_emails=True, no_punct=True, no_urls=True, lowercase=True, no_numbers=True) for text in real_reviews]


In [44]:
extract_words(cleaned_fake[0])

['bwe',
 'stay',
 'schicago',
 'hilton',
 'number',
 'day',
 'number',
 'night',
 'conference',
 'normally',
 'easy',
 'go',
 'amenity',
 'cleanliness',
 'likehowever',
 'experience',
 'hilton',
 'awful',
 'take',
 'time',
 'actually',
 'write',
 'review',
 'truly',
 'stay',
 'hotel',
 'arrive',
 'room',
 'clear',
 'carpet',
 'not',
 'vacuum',
 'figuer',
 'okay',
 'carpet',
 'see',
 'bathroom',
 'bathroom',
 'superficial',
 'indicator',
 'housekeeping',
 'have',
 'recently',
 'clean',
 'ie',
 'paper',
 'band',
 'toilet',
 'paper',
 'cap',
 'drinking',
 'glass',
 'etc',
 'clear',
 'actual',
 'cleaning',
 'take',
 'place',
 'spot',
 'probably',
 'urine',
 'toilet',
 'seat',
 'kid',
 'remnant',
 'lipsmudge',
 'glass',
 'know',
 'people',
 'work',
 'year',
 'hotel',
 'industry',
 'warn',
 'lazy',
 'housekeeping',
 'thing',
 'appear',
 'clean',
 'fact',
 'effort',
 'thing',
 'sanitary',
 'hilton',
 'proof',
 'call',
 'downstairs',
 'complain',
 'send',
 'chambermaid',
 'hour',
 'later',
 'f

In [50]:
real_words = Counter([word for line in cleaned_real for word in extract_words(line)])

In [51]:
fake_words = Counter([word for line in cleaned_fake for word in extract_words(line)])


In [52]:
real_words.most_common(20)

[('room', 1709),
 ('hotel', 1618),
 ('number', 1128),
 ('stay', 1002),
 ('not', 612),
 ('great', 535),
 ('chicago', 472),
 ('good', 457),
 ('staff', 437),
 ('night', 432),
 ('n', 411),
 ('service', 396),
 ('bed', 375),
 ('location', 352),
 ('$', 312),
 ('nice', 300),
 ('time', 277),
 ('desk', 258),
 ('bathroom', 254),
 ('get', 248)]

In [59]:
fake_set = set([word for word, _ in fake_words.most_common(50)])

In [60]:
real_set = set([word for word, _ in real_words.most_common(50)])

In [61]:
fake_set - real_set

{'arrive',
 'business',
 'city',
 'definitely',
 'expect',
 'experience',
 'feel',
 'food',
 'hour',
 'husband',
 'need',
 'recommend',
 'reservation',
 'take',
 'visit',
 'wait',
 'want'}

In [62]:
real_set - fake_set

{'$',
 'bar',
 'bathroom',
 'book',
 'breakfast',
 'call',
 'comfortable',
 'door',
 'floor',
 'free',
 'helpful',
 'lobby',
 'n',
 'pay',
 'small',
 'tell',
 'walk'}

In [63]:
fake_set & real_set

{'area',
 'ask',
 'bed',
 'bi',
 'check',
 'chicago',
 'clean',
 'come',
 'day',
 'desk',
 'find',
 'friendly',
 'get',
 'go',
 'good',
 'great',
 'hotel',
 'like',
 'location',
 'look',
 'nice',
 'night',
 'not',
 'number',
 'place',
 'price',
 'restaurant',
 'room',
 'service',
 'staff',
 'stay',
 'time',
 'view'}