In [1]:
import re
import math
import string
import operator
import pandas as pd
from collections import Counter
from operator import add
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.probability import FreqDist as nF
from nltk.corpus import stopwords
from nltk import bigrams
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from pyspark import SparkContext
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# Prepare the dataset to remove to remove stop words by intializing the system
def initializeSystem():
    print ("Preparing stop words.")
    stop = set(stopwords.words('english') + punctuation + ['rt', 'via', 'i\'m', 'us', 'it'])
    for x in stop:
        stopWords.append(stemmer.stem(lemmatiser.lemmatize(x, pos="v")))

# Tokenize will separate each word as tokens
def tokenize(s):
    return tokens_re.findall(s)

# This function removes non-alhanumeric characters, removes the emoticons if present and stems and lemmatize each word
def preprocess(s, lowercase=True):
    s = re.sub('[^a-zA-Z0-9\n\.]', ' ', s)
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else stemmer.stem(lemmatiser.lemmatize(token.lower(), pos="v")) for
                  token in tokens]
    return tokens

# Each row is broken down into space separated words or terms of list and sent for stemming and lemming. Here stop words are removed
def processString(string):
    # terms_all = [term for term in preprocess(string)]
    terms_stop = [term for term in preprocess(string) if term not in stopWords and len(str(term)) > 1]
    return ' '.join(terms_stop)

In [3]:
def euclideanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)

def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
#         print('res',response)
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]

def predic_rating(s):
    neighbors = train_features.map(lambda y: (y,euclideanDistance(y, count_vectorizer.transform([s]).toarray()[0], len(y)-2))).sortBy(lambda y: y[1]).map(lambda x:x[0]).take(3)
    result = getResponse(neighbors)
    print(result)

In [4]:
# String is used to send each row for preprocessing
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

# Regex for non alphanumeric characters
regex_str = [
    r'<[^>]+>',  # HTML tags
    r"(?:[a-z][a-z\-_]+[a-z])",  # words with - and '
    r'(?:[\w_]+)',  # other words
    r'(?:\S)'  # anything else
]

In [5]:
stopWords = []
decisionAttributes = []
tokens_re = re.compile(r'(' + '|'.join(regex_str) + ')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^' + emoticons_str + '$', re.VERBOSE | re.IGNORECASE)
punctuation = list(string.punctuation)
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()
initializeSystem()

Preparing stop words.


In [6]:
sc = SparkContext.getOrCreate()

In [7]:
df= pd.read_csv('Hotel_Reviews.csv')
df = df.sample(frac=1).reset_index(drop=True)
df=df[:10000]

In [8]:
df1=pd.DataFrame()
df1['review'] = df['Negative_Review']+df['Positive_Review']
df1['rating'] = df['Reviewer_Score']
df1.rating=df1.rating.astype(int)

In [9]:
lines = sc.parallelize(df1.values)
lines.first()

array([' Not clean old and sadNo Positive', 5], dtype=object)

In [10]:
del df
del df1

In [11]:
review = lines.map(lambda x: x[0])
rating = lines.map(lambda x: x[1])

In [12]:
review1=review.map(lambda a: processString(str(a).lower())).collect()

max_df : float in range [0.0, 1.0] or int, default=1.0
When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words). If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

min_df : float in range [0.0, 1.0] or int, default=1
When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold. This value is also called cut-off in the literature. If float, the parameter represents a proportion of documents, integer absolute counts. This parameter is ignored if vocabulary is not None.

max_features : int or None, default=None
If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.

This parameter is ignored if vocabulary is not None.

In [13]:
count_vectorizer = CountVectorizer(max_df = 0.7, max_features=500)
counts = count_vectorizer.fit_transform(review1)

In [14]:
features=pd.DataFrame(counts.toarray(), columns=count_vectorizer.get_feature_names())
features['label']=rating.collect()

In [15]:
features.shape

(10000, 501)

In [16]:
features.head()

Unnamed: 0,10,12,15,20,30,abl,absolut,access,accommod,across,...,within,without,wonder,work,worst,worth,would,year,yet,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,7
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9


In [17]:
feature_lines = sc.parallelize(features.values)

In [18]:
train_features, test_features = feature_lines.randomSplit([0.6, 0.4])

In [19]:
results=[]
correct=0
for x in test_features.collect():
    neighbors = train_features.map(lambda y: (y,euclideanDistance(y, x, len(y)-1))).sortBy(lambda y: y[1]).map(lambda x:x[0]).take(3)
    result = getResponse(neighbors)
    results.append(result)
#     print('> predicted=' + repr(result) + ', actual=' + repr(x[-1]))
    if x[-1] == result:
        correct += 1
accuracy=(correct/float(test_features.count())) * 100.0
print('Accuracy: ' + repr(accuracy) + '%')

Accuracy: 24.415713575335655%


In [21]:
predic_rating('The beds were very good, The staff were very supportive')

10


In [22]:
predic_rating('smoke small poor late limit heat noise bad late service dark wonder temperatur')

7


In [23]:
predic_rating('awesome rooms great view complimentarty dinner')

8


In [24]:
print(results[:400])

[10, 8, 7, 10, 7, 8, 8, 10, 9, 8, 7, 10, 8, 10, 8, 9, 9, 7, 7, 10, 9, 7, 9, 8, 8, 8, 9, 4, 10, 8, 9, 9, 9, 10, 7, 10, 7, 9, 10, 9, 7, 8, 7, 10, 7, 7, 10, 8, 10, 8, 9, 8, 6, 9, 9, 9, 9, 7, 10, 10, 9, 10, 10, 5, 8, 9, 7, 9, 9, 8, 9, 8, 10, 5, 8, 8, 8, 10, 8, 7, 7, 10, 10, 9, 7, 3, 8, 7, 10, 7, 10, 8, 5, 10, 7, 9, 9, 8, 9, 10, 9, 8, 9, 9, 7, 8, 7, 8, 8, 8, 10, 10, 7, 10, 8, 8, 10, 6, 10, 8, 10, 9, 9, 9, 9, 8, 10, 10, 10, 5, 8, 7, 9, 8, 8, 10, 10, 9, 10, 9, 9, 9, 8, 8, 10, 7, 9, 10, 8, 8, 10, 8, 10, 10, 8, 8, 9, 10, 10, 7, 8, 10, 9, 7, 10, 8, 8, 9, 9, 8, 9, 8, 8, 10, 7, 9, 8, 10, 9, 10, 8, 9, 7, 9, 8, 7, 8, 10, 10, 10, 7, 9, 8, 9, 9, 5, 8, 8, 9, 8, 10, 10, 8, 9, 8, 8, 8, 7, 7, 9, 9, 7, 10, 10, 7, 8, 8, 4, 8, 7, 7, 8, 10, 10, 9, 8, 8, 8, 10, 8, 10, 9, 8, 8, 10, 9, 9, 9, 10, 10, 8, 10, 9, 8, 9, 10, 8, 9, 9, 9, 8, 8, 9, 9, 8, 7, 10, 8, 9, 8, 8, 7, 8, 10, 10, 10, 9, 9, 8, 7, 10, 8, 10, 10, 8, 10, 7, 9, 9, 9, 9, 10, 8, 9, 10, 8, 10, 8, 2, 9, 7, 9, 3, 4, 8, 9, 8, 5, 8, 2, 10, 10, 9, 7, 8, 8, 10,

In [25]:
print(count_vectorizer.get_feature_names())

[u'10', u'12', u'15', u'20', u'30', u'abl', u'absolut', u'access', u'accommod', u'across', u'actual', u'air', u'airport', u'alreadi', u'also', u'although', u'alway', u'amaz', u'amen', u'amsterdam', u'anoth', u'anyth', u'apart', u'area', u'around', u'arriv', u'ask', u'atmospher', u'attent', u'attract', u'avail', u'away', u'back', u'bad', u'bag', u'balconi', u'bar', u'barcelona', u'basic', u'bath', u'bathroom', u'beauti', u'bed', u'bedroom', u'best', u'better', u'big', u'bigger', u'birthday', u'bite', u'block', u'book', u'bottl', u'break', u'breakfast', u'brilliant', u'bring', u'bu', u'buffet', u'build', u'busi', u'cafe', u'call', u'car', u'card', u'care', u'carpet', u'center', u'centr', u'central', u'chang', u'channel', u'charg', u'check', u'choic', u'choos', u'citi', u'clean', u'cleanli', u'close', u'cloth', u'coffe', u'cold', u'com', u'come', u'comfi', u'comfort', u'complain', u'complaint', u'complimentari', u'con', u'concierg', u'condit', u'connect', u'consid', u'control', u'conveni'

In [26]:
# import math
# import operator
# predictions=[]
# for x in range(len(test_features1)):
#     neighbors = getNeighbors(train_features1, test_features1[x], 3)
#     result = getResponse(neighbors)
#     predictions.append(result)
#     print('> predicted=' + repr(result) + ', actual=' + repr(test_features1[x][-1]))
# accuracy = getAccuracy(test_features1, predictions)
# print('Accuracy: ' + repr(accuracy) + '%')

In [27]:
# rdd = sc.parallelize([1,2,3,4, 5])
# combinations = rdd.cartesian(rdd)
# combinations.count()