In [5]:
import sklearn
import nltk
import re

The data was downloaded from the MonkeyLearn blog. It contains 223 hotel reviews in three columns: the review text, the review sentiment (e.g. Positive, Negative, or Neutral), and the review topic (e.g. Location, Comfort, Facilities, Cleanliness, etc.).

In [4]:
import csv

# Read every row of the CSV (header row included) into a list of lists.
# The context manager closes the file handle as soon as reading finishes;
# newline='' is the mode the csv module asks for so that line breaks inside
# quoted fields are parsed correctly.
with open('reviews.csv', newline='') as csv_file:
    reviews = list(csv.reader(csv_file))

Using NLTK, filter out stopwords, remove non-alphabetic characters, and stem each word to its root:

In [6]:
# Fetch the NLTK resources used below: 'punkt' backs the tokenizer and
# 'stopwords' provides the list of words to filter out
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

# Single shared stemmer instance, reused for every review
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/kavitasurasura/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kavitasurasura/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [7]:
def process_text(text):
    """Turn one review into a list of stemmed, stopword-free tokens.

    Args:
        text: The review text as a string.

    Returns:
        A list of strings: each remaining token of ``text`` after
        lowercasing, stripping non-alphabetic characters, dropping English
        stopwords and stemming with the module-level ``stemmer``.
    """
    # Make the string lowercase and replace every non-alphabetic character
    # with a space
    text = re.sub('[^A-Za-z]', ' ', text.lower())

    # Tokenize the text, i.e. split it into a list of words.
    # The text is treated as a single unit, so sent_tokenize is not needed.
    tokenized_text = word_tokenize(text)

    # Build the stopword collection once per call instead of once per token,
    # and use a set so each membership test is a hash lookup
    english_stopwords = set(stopwords.words('english'))

    # Remove the stopwords and stem each remaining word to its root.
    # The result is a list of words, not a string.
    return [
        stemmer.stem(word)
        for word in tokenized_text
        if word not in english_stopwords
    ]

In [8]:
# Drop the header row, which only holds the column labels.
# NOTE: this rebinds `reviews`, so re-running this cell without reloading
# the CSV removes one more row each time.
reviews = reviews[1:]

# Column 2 of each row is the review topic (the classification target)
topics = [row[2] for row in reviews]

# Column 0 is the review text. Clean it with process_text, then join the
# resulting word list back into a single string, which is the input format
# sklearn's vectorizer expects.
texts = [" ".join(process_text(row[0])) for row in reviews]

In [10]:
#texts

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoder with the vocabulary capped at 1000 features
matrix = CountVectorizer(max_features=1000)

# Fit the encoder on the cleaned reviews and encode them in the same call,
# then convert the result to a dense array
vectors = matrix.fit_transform(texts)
vectors = vectors.toarray()

In [12]:
from sklearn.model_selection import train_test_split

# Split the encoded reviews and their topics into training and test sets
# (using train_test_split's default split size). random_state pins the
# shuffle so the split, and every metric computed from it, is the same on
# each Restart & Run All.
vectors_train, vectors_test, topics_train, topics_test = train_test_split(
    vectors, topics, random_state=42
)

In [13]:
from sklearn.naive_bayes import GaussianNB
# Create a Gaussian Naive Bayes classifier with its default settings
classifier = GaussianNB()
# Train on the training vectors and their topic labels. The call is left as
# the cell's last expression, so its return value is echoed as the cell output.
classifier.fit(vectors_train, topics_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [14]:
from sklearn.metrics import classification_report

# Label the held-out test vectors with the fitted classifier
topics_pred = classifier.predict(vectors_test)

# Print per-topic precision, recall and F1 against the true test labels
print(classification_report(y_true=topics_test, y_pred=topics_pred))

              precision    recall  f1-score   support

 Cleanliness       0.33      0.40      0.36         5
     Comfort       0.54      0.57      0.55        23
  Facilities       0.50      0.46      0.48        24

    accuracy                           0.50        52
   macro avg       0.46      0.47      0.47        52
weighted avg       0.50      0.50      0.50        52

