# Topic Classification

In [9]:
import re
import nltk
import csv

In [2]:
review = [row for row in csv.reader(open('data/reviews.csv'))]

In [10]:
review[:10]

[['Text', 'Sentiment', 'Topic'],
 ['The rooms are extremely small, practically only a bed.',
  'negative',
  'Comfort'],
 ['Room safe did not work.', 'negative', 'Facilities'],
 ['Mattress very comfortable.', 'positive', 'Comfort'],
 ['Very uncomfortable, thin mattress, with plastic cover that rustles every time you move.',
  'negative',
  'Comfort'],
 ['No bathroom in room', 'negative', 'Facilities'],
 ['The bed was soooo comfy.', 'positive', 'Comfort'],
 ['someone must have been smoking in the room next door.',
  'negative',
  'Cleanliness'],
 ['The bed is very comfortable.', 'positive', 'Comfort'],
 ['Very spacious rooms, quiet and very comfortable.', 'positive', 'Comfort']]

In [11]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\verma\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\verma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def process_text(text):
    # Make all the strings lowercase and remove non alphabetic characters
    text = re.sub('[^A-Za-z]', ' ', text.lower())

    # Tokenize the text; this is, separate every sentence into a list of words
    tokenized_text = word_tokenize(text)

    # Remove stopwords and stem each word to its root
    clean_text = [
        stemmer.stem(word) for word in tokenized_text
        if word not in stopwords.words('english')
    ]

    # Remember, this final output is a list of words
    return clean_text

In [14]:
# labels ['Text', 'Sentiment', 'Topic'] removed
reviews = review[1:]
texts = [row[0] for row in reviews]    # list of texts
topics = [row[2] for row in reviews]   # list of topics

In [15]:
texts[:5]

['The rooms are extremely small, practically only a bed.',
 'Room safe did not work.',
 'Mattress very comfortable.',
 'Very uncomfortable, thin mattress, with plastic cover that rustles every time you move.',
 'No bathroom in room']

In [16]:
#processing data and creating a string again from list returned by process_text()
texts = [" ".join(process_text(text)) for text in texts]  

In [17]:
texts[:5]

['room extrem small practic bed',
 'room safe work',
 'mattress comfort',
 'uncomfort thin mattress plastic cover rustl everi time move',
 'bathroom room']

In [18]:
# texts must be vectorized 
from sklearn.feature_extraction.text import CountVectorizer
matrix = CountVectorizer(max_features=1000)
vectors = matrix.fit_transform(texts).toarray()

In [19]:
print(vectors.shape)
print(matrix.vocabulary_)

(207, 500)
{'room': 354, 'extrem': 151, 'small': 388, 'practic': 319, 'bed': 40, 'safe': 359, 'work': 494, 'mattress': 265, 'comfort': 80, 'uncomfort': 464, 'thin': 432, 'plastic': 312, 'cover': 97, 'rustl': 358, 'everi': 142, 'time': 441, 'move': 276, 'bathroom': 38, 'soooo': 397, 'comfi': 79, 'someon': 395, 'must': 278, 'smoke': 392, 'next': 286, 'door': 119, 'spaciou': 403, 'quiet': 335, 'peopl': 306, 'bedroom': 41, 'sofa': 394, 'bit': 45, 'unconfort': 465, 'light': 244, 'common': 81, 'dim': 115, 'air': 12, 'condit': 84, 'fine': 164, 'type': 460, 'like': 245, 'let': 242, 'water': 483, 'run': 357, 'get': 182, 'wet': 488, 'take': 422, 'minut': 272, 'figur': 161, 'make': 260, 'hot': 209, 'gon': 187, 'na': 279, 'window': 492, 'singl': 379, 'glaze': 185, 'heat': 203, 'could': 94, 'escap': 137, 'although': 16, 'fair': 153, 'outsid': 298, 'terribl': 430, 'cubbyhol': 100, 'market': 264, 'corridor': 90, 'filthi': 163, 'electr': 128, 'cabl': 55, 'whole': 489, 'build': 53, 'smelli': 391, 'show

In [20]:
from sklearn.model_selection import train_test_split
vectors_train, vectors_test, topics_train, topics_test = train_test_split(vectors, topics)

In [21]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(vectors_train, topics_train)

# Predict with the testing set
topics_pred = classifier.predict(vectors_test)

# measure the accuracy of the results
from sklearn.metrics import classification_report
print(classification_report(topics_test, topics_pred))

              precision    recall  f1-score   support

 Cleanliness       0.50      0.30      0.37        10
     Comfort       0.57      0.60      0.59        20
  Facilities       0.56      0.64      0.60        22

    accuracy                           0.56        52
   macro avg       0.54      0.51      0.52        52
weighted avg       0.55      0.56      0.55        52



Naive Bayes, explained by Andrew Ng : https://www.youtube.com/watch?v=z5UQyCESW64
Support Vector Machines, https://www.youtube.com/watch?v=N1vOgolbjSc
Deep Learning explained simply in four parts https://www.youtube.com/playlist?list=PLZHQObOWTQDNU6R1_67000Dx_ZCJB-3pi