In [1]:
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from collections import Counter
from nltk.stem import PorterStemmer
from nltk import word_tokenize
import os
import random
import codecs

random.seed(42)

In [2]:
base_path = "/Users/aliosha/Downloads/op_spam_v1.4"

In [3]:
def read_review(filename):
    with open(filename, "rb") as fp:
        return fp.read()

In [4]:
real_reviews = []
fake_reviews = []

for dir_name, subdir_list, file_list in os.walk(base_path):
    if "deceptive" in dir_name:
        for fname in file_list:
            if ".txt" in fname:
                fake_reviews.append(read_review(os.path.join(dir_name,fname)))
    if "truthful" in dir_name:
        for fname in file_list:
            if ".txt" in fname:
                real_reviews.append(read_review(os.path.join(dir_name,fname)))
          
print(f"{len(fake_reviews)} fake reviews, {len(real_reviews)} real reviews")

800 fake reviews, 800 real reviews


In [5]:
from gensim.summarization import summarize

In Gensim, summarizing is based on ranks of text sentences using a variation of the TextRank algorithm 

https://arxiv.org/abs/1602.03606

In [6]:
text = real_reviews[0]
text

b'My $200 Gucci sunglasses were stolen out of my bag on the 16th. I filed a report with the hotel security and am anxious to hear back from them. This was such a disappointment, as we liked the hotel and were having a great time in Chicago. Our room was really nice, with 2 bathrooms. We had 2 double beds and a comfortable hideaway bed. We had a great view of the lake and park. The hotel charged us $25 to check in early (10am).\n'

In [7]:
print ('Summary:')
print (summarize(str(text)))

Summary:
This was such a disappointment, as we liked the hotel and were having a great time in Chicago.


In [8]:
for text in real_reviews[:10]:
    print(text)
    print("###")
    print("summary")
    print (summarize(str(text)))
    print("###")

b'My $200 Gucci sunglasses were stolen out of my bag on the 16th. I filed a report with the hotel security and am anxious to hear back from them. This was such a disappointment, as we liked the hotel and were having a great time in Chicago. Our room was really nice, with 2 bathrooms. We had 2 double beds and a comfortable hideaway bed. We had a great view of the lake and park. The hotel charged us $25 to check in early (10am).\n'
###
summary
This was such a disappointment, as we liked the hotel and were having a great time in Chicago.
###
b"This was a gorgeous hotel from the outside and in the lobby. However, when we reached the elevator things started to look quite dingy. Our bathrooms in our room had grout that was in need of a deep scrubbing and had an overall appearance of dirty. Wireless internet is available, but that's an additional fee. Are you kidding? Even the roach motel offers wireless for free! Also, the fitness room was available for only $15 per day. Wow, I thought the r

In [9]:
for text in real_reviews[:10]:
    print(text)
    print("###")
    print("summary")
    print (summarize(str(text), word_count=50))
    print("###")

b'My $200 Gucci sunglasses were stolen out of my bag on the 16th. I filed a report with the hotel security and am anxious to hear back from them. This was such a disappointment, as we liked the hotel and were having a great time in Chicago. Our room was really nice, with 2 bathrooms. We had 2 double beds and a comfortable hideaway bed. We had a great view of the lake and park. The hotel charged us $25 to check in early (10am).\n'
###
summary
I filed a report with the hotel security and am anxious to hear back from them.
This was such a disappointment, as we liked the hotel and were having a great time in Chicago.
We had a great view of the lake and park.
The hotel charged us $25 to check in early (10am).\n'
###
b"This was a gorgeous hotel from the outside and in the lobby. However, when we reached the elevator things started to look quite dingy. Our bathrooms in our room had grout that was in need of a deep scrubbing and had an overall appearance of dirty. Wireless internet is availabl

In [15]:
from gensim.summarization import keywords

for text in real_reviews[:10]:
    print ('Keywords:')
    print (keywords(text))    
    print("summary")
    print (summarize(str(text), word_count=50))
    print("###")


Keywords:
hotel
great
summary
I filed a report with the hotel security and am anxious to hear back from them.
This was such a disappointment, as we liked the hotel and were having a great time in Chicago.
We had a great view of the lake and park.
The hotel charged us $25 to check in early (10am).\n'
###
Keywords:
wireless
better
room
things
nearly
minutes
motel
summary
Wireless internet is available, but that's an additional fee.
Are you kidding?
Also, the fitness room was available for only $15 per day.
I called the next morning requesting more and was told that wasn't housekeeping's area but they would send some up.
Glad I wasn't in a big hurry.
###
Keywords:
looking
cool
summary
After being out seeing the sites of Chicago for several hours and being very hot we were looking forward to resting in a cool room before going to a show .
After determining that the unit needed a motor it was another hour so we did not get to rest in a cool room before going out.
###
Keywords:
bar
high
left

In [11]:
german_text = """Hatte ein Arrangement über Weihnachten gebucht, zwei Übernachtungen, Frühstück, ein Cocktail, Stadtrundfahrt und Stadtführer für 148,00€. Sehr gutes Preis- Leistungsverhältnis ! Das Zimmer groß, sehr sauber, Bett bequem und gratis Mineralwasser. Schönes Schwimmbad, Fitnessraum und Saunen. Frühstück war gut und Extrawünsche wurden erfüllt.Alle Sehenswürdigkeiten sind in der näheren Umgebung, oder gut zu erreichen. Das Personal durchweg sehr freundlich, ich habe mich willkommen gefühlt und komme gerne Wieder."""

In [14]:
print(summarize(german_text, word_count=30))

Hatte ein Arrangement über Weihnachten gebucht, zwei Übernachtungen, Frühstück, ein Cocktail, Stadtrundfahrt und Stadtführer für 148,00€.
Das Zimmer groß, sehr sauber, Bett bequem und gratis Mineralwasser.


In [17]:
print(keywords(german_text))

und
sehr
das
ein
fruhstuck
gut
sehenswurdigkeiten
naheren
gerne


In [19]:
help(summarize)

Help on function summarize in module gensim.summarization.summarizer:

summarize(text, ratio=0.2, word_count=None, split=False)
    Get a summarized version of the given text.
    
    The output summary will consist of the most representative sentences
    and will be returned as a string, divided by newlines.
    
    Note
    ----
    The input should be a string, and must be longer than :const:`~gensim.summarization.summarizer.INPUT_MIN_LENGTH`
    sentences for the summary to make sense.
    The text will be split into sentences using the split_sentences method in the :mod:`gensim.summarization.texcleaner`
    module. Note that newlines divide sentences.
    
    
    Parameters
    ----------
    text : str
        Given text.
    ratio : float, optional
        Number between 0 and 1 that determines the proportion of the number of
        sentences of the original text to be chosen for the summary.
    word_count : int or None, optional
        Determines how many words will the