In [3]:
import numpy as np
import pandas as pd

In [72]:
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t')
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [12]:
import re # Regular expression operations: Data Cleaning
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

Text Preprocessing:
1 Remove Punctuations and Numbers
2 Stemming

In [29]:
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i]) # column : "Review", row ith
    review = review.lower() # convert all cases to lower cases
    review = review.split() # split to array(default delimiter is " ")

    # creating PorterStemmer object to
    # take main stem of each word
    ps = PorterStemmer()

    # loop for stemming each word
    # in string array at ith row
    review = [ps.stem(word) for word in review
                if not word in set(stopwords.words('english'))]
    # rejoin all string array elements
    # to create back into a string
    review = ' '.join(review)
    # reviews after cleaning: corpus
    corpus.append(review)

Bag of Word Model

In [58]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

cv = CountVectorizer(max_features = 1500)

# X contains corpus (dependent variable)
X = cv.fit_transform(corpus).toarray()

# y contains answers whether review is positive or negative
y = dataset.iloc[:, 1].values

In [63]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [64]:
# Fitting Random Forest Classification to the Training set
from sklearn.ensemble import RandomForestClassifier

# n_estimators can be said as number of trees
model = RandomForestClassifier(n_estimators = 501, criterion = 'entropy')

model.fit(X_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=501)

In [68]:
from sklearn.metrics import accuracy_score, confusion_matrix
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
print(accuracy)
cm

0.74


array([[78, 10],
       [42, 70]], dtype=int64)

In [119]:
aptReview = pd.read_csv('aptReviews.csv')
aptReview

Unnamed: 0,id,rating,text
0,t7J5R1LVDmTUVnFAaT2smg,5,Haim hi was Awesome broker he help us with the...
1,t7J5R1LVDmTUVnFAaT2smg,1,wasted my time - was shown an apartment that i...
2,t7J5R1LVDmTUVnFAaT2smg,1,I didnt even want to select a star but Yelp to...
3,U3sf7qA8-a7E671H-IBhag,5,- Friendly doormen \n- Helpful management team...
4,U3sf7qA8-a7E671H-IBhag,5,I love my corner apartment with double exposur...
...,...,...,...
2632,fVml8IZPiZns9KOJ-Fwv4w,5,We were so lucky to find Emma and work with he...
2633,fVml8IZPiZns9KOJ-Fwv4w,5,Emma James is such a fantastic broker! She is...
2634,DvRfghf-pHOcu6NeSbGjOQ,5,Obsessive cleaning is simply fantastic. From G...
2635,DvRfghf-pHOcu6NeSbGjOQ,5,Tracey and Magine have cleaned my studio apart...


In [122]:
corpus = []
for i in range(0, 2637):
    review = re.sub('[^a-zA-Z]', ' ', aptReview['text'][i])
    review = review.lower() # convert all cases to lower cases
    review = review.split() # split to array(default delimiter is " ")

    ps = PorterStemmer()

    review = [ps.stem(word) for word in review
                if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    # reviews after cleaning: corpus
    corpus.append(review)
df = pd.DataFrame(aptReview)
df['corpus'] = corpus

In [123]:
aptReview

Unnamed: 0,id,rating,text,corpus
0,t7J5R1LVDmTUVnFAaT2smg,5,Haim hi was Awesome broker he help us with the...,haim hi awesom broker help us best way recomme...
1,t7J5R1LVDmTUVnFAaT2smg,1,wasted my time - was shown an apartment that i...,wast time shown apart love got everyth fill ev...
2,t7J5R1LVDmTUVnFAaT2smg,1,I didnt even want to select a star but Yelp to...,didnt even want select star yelp told inord le...
3,U3sf7qA8-a7E671H-IBhag,5,- Friendly doormen \n- Helpful management team...,friendli doormen help manag team quick respons...
4,U3sf7qA8-a7E671H-IBhag,5,I love my corner apartment with double exposur...,love corner apart doubl exposur avid tenant ma...
...,...,...,...,...
2632,fVml8IZPiZns9KOJ-Fwv4w,5,We were so lucky to find Emma and work with he...,lucki find emma work buy brooklyn apart search...
2633,fVml8IZPiZns9KOJ-Fwv4w,5,Emma James is such a fantastic broker! She is...,emma jame fantast broker kind patient knowledg...
2634,DvRfghf-pHOcu6NeSbGjOQ,5,Obsessive cleaning is simply fantastic. From G...,obsess clean simpli fantast gina book appoint ...
2635,DvRfghf-pHOcu6NeSbGjOQ,5,Tracey and Magine have cleaned my studio apart...,tracey magin clean studio apart differ time co...


In [80]:
X_apt = cv.fit_transform(corpus).toarray()
y_pred = model.predict(X_apt)
df['Liked'] = y_pred

In [81]:
aptReview

Unnamed: 0,id,rating,text,corpus,Liked
0,hyHdkkg-P_f6LQq6U3ZvtA,5,Haim hi was Awesome broker he help us with the...,haim hi awesom broker help us best way recomme...,0
1,Vx0eH8RgkZLOa4Q_kuULbA,1,wasted my time - was shown an apartment that i...,wast time shown apart love got everyth fill ev...,1
2,XnuZZ_EG6MxlbGHAnetCIQ,1,I didnt even want to select a star but Yelp to...,didnt even want select star yelp told inord le...,0
3,LTAcBBAgX6bfguXT2pw__A,5,- Friendly doormen \n- Helpful management team...,friendli doormen help manag team quick respons...,1
4,fm0l89DKMph7Pk_1NhcUAQ,5,I love my corner apartment with double exposur...,love corner apart doubl exposur avid tenant ma...,0
...,...,...,...,...,...
2631,woPA_krxXOH6PgpqaxROxg,5,We were so lucky to find Emma and work with he...,lucki find emma work buy brooklyn apart search...,1
2632,f4ztUkRK7gfgWr2Ic-KOCA,5,Emma James is such a fantastic broker! She is...,emma jame fantast broker kind patient knowledg...,1
2633,z8yUPOjHO8pvOryrllj35g,5,"I never knew a place like THS leasing corp, be...",never knew place like th leas corp alway paid ...,1
2634,yCzeTaveYnIYjcFy7cdY6Q,1,This place is a scam! I'd advise anyone not t...,place scam advis anyon desper time crisi peopl...,0


In [90]:
data1 = pd.read_csv('apartments.csv')
data2 = pd.read_csv('aptReviews.csv')
output = pd.merge(data1, data2, on='id', how='inner')
output

Unnamed: 0,name,id,latitude,longitude,zipcode,rating_x,rating_y,text
0,Urban Pads,t7J5R1LVDmTUVnFAaT2smg,40.675071,-73.952629,11216,4.0,5,Haim hi was Awesome broker he help us with the...
1,Urban Pads,t7J5R1LVDmTUVnFAaT2smg,40.675071,-73.952629,11216,4.0,1,wasted my time - was shown an apartment that i...
2,Urban Pads,t7J5R1LVDmTUVnFAaT2smg,40.675071,-73.952629,11216,4.0,1,I didnt even want to select a star but Yelp to...
3,21 Chelsea,U3sf7qA8-a7E671H-IBhag,40.741863,-73.994888,10011,4.5,5,- Friendly doormen \n- Helpful management team...
4,21 Chelsea,U3sf7qA8-a7E671H-IBhag,40.741863,-73.994888,10011,4.5,5,I love my corner apartment with double exposur...
...,...,...,...,...,...,...,...,...
2585,65 Bay Street,HqDAulNn4YvjVfIVqex74g,40.719790,-74.036190,7302,3.0,5,Stunning building. Great\n amenities. I can te...
2586,65 Bay Street,HqDAulNn4YvjVfIVqex74g,40.719790,-74.036190,7302,3.0,1,"Horrible Horrible windows...heavy drafts, cond..."
2587,Flatbush Patio,NO9FsyA-it49Ts2Gk9uZQQ,40.659442,-73.961219,11225,2.5,4,We have lived in Patio Gardens for four months...
2588,Flatbush Patio,NO9FsyA-it49Ts2Gk9uZQQ,40.659442,-73.961219,11225,2.5,2,I want to love this apartment... but we have h...


In [87]:
import math
from operator import itemgetter

In [88]:
def check_sent(word, sentences):
    final = [all([w in x for w in word]) for x in sentences]
    sent_len = [sentences[i] for i in range(0, len(final)) if final[i]]
    return int(len(sent_len))

def get_top_n(dict_elem, n):
    result = dict(sorted(dict_elem.items(), key = itemgetter(1), reverse = True)[:n])
    return result

for i in range(0, 5):
    tf_score = {}
    for each_word in corpus[i]:
        each_word = each_word.replace('.','')
        if each_word in tf_score:
            tf_score[each_word] += 1
        else:
            tf_score[each_word] = 1
    # Dividing by total_word_length for each dictionary element
    tf_score.update((x, y/int(len(corpus[i]))) for x, y in tf_score.items())
    idf_score = {}
    for each_word in corpus[i]:
        each_word = each_word.replace('.','')
        if each_word in idf_score:
            idf_score[each_word] = check_sent(each_word, corpus[i])
        else:
            idf_score[each_word] = 1
    # Dividing by total_word_length for each dictionary element
    idf_score.update((x, math.log(int(len(corpus[i]))/y)) for x, y in idf_score.items())
    tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
    print(tf_idf_score)
    print(get_top_n(tf_idf_score, 3))

{'h': 0.18206940410608577, 'a': 0.18206940410608577, 'i': 0.12995078937308352, 'm': 0.18206940410608577, ' ': 0.2949986664999194, 'w': 0.09759237601741147, 'e': 0.24049984340803612, 's': 0.12995078937308352, 'o': 0.1577173368694205, 'b': 0.09759237601741147, 'r': 0.1577173368694205, 'k': 0.09759237601741147, 'l': 0.05816304180005636, 'p': 0.09759237601741147, 'u': 0.09759237601741147, 't': 0.12995078937308352, 'y': 0.12995078937308352, 'c': 0.09759237601741147, 'n': 0.12995078937308352, 'd': 0.05816304180005636, 'g': 0.05816304180005636, 'v': 0.05816304180005636}
{' ': 0.2949986664999194, 'e': 0.24049984340803612, 'h': 0.18206940410608577}
{'w': 0.08459249977267377, 'a': 0.24413606414846883, 's': 0.11337324605540518, 't': 0.25690064858723155, ' ': 0.2986265782046758, 'i': 0.11337324605540518, 'm': 0.11337324605540518, 'e': 0.25690064858723155, 'h': 0.08459249977267377, 'o': 0.13837845818712774, 'n': 0.11337324605540518, 'p': 0.13837845818712774, 'r': 0.16057620877200912, 'l': 0.1133732

In [102]:
import nltk
nltk.download("punkt")
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tekhne\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [181]:
i = 300
doc = aptReview['text'][i] + aptReview['text'][i+1] + aptReview['text'][i+2]
df2 = pd.DataFrame(data2)
review_total = df2['text'].str.cat()
review_total



In [192]:
# total_words = doc.lower()
total_words = review_total.lower()
total_words = total_words.split()
total_word_length = len(total_words)
print(total_word_length)
# total_sentences = sent_tokenize(doc)
total_sentences = sent_tokenize(review_total)
total_sent_len = len(total_sentences)
print(total_sent_len)
tokenized_words = [word_tokenize(sent) for sent in total_sentences]

69500
3980


Extracting Information from Text
'Part of Speech' tagging: match the tokens with the corresponding tags (nouns, verbs, adjectives, adverbs, etc.)
sentence segmentation -> tokenization -> part of speech tagging (pos-tagged sentences) -> entity detection (chunked sentences) -> relation detection

In [193]:
pos_words = [nltk.pos_tag(word) for word in tokenized_words]
pos_words

[[('Haim', 'NNP'),
  ('hi', 'NN'),
  ('was', 'VBD'),
  ('Awesome', 'NNP'),
  ('broker', 'NN'),
  ('he', 'PRP'),
  ('help', 'VBZ'),
  ('us', 'PRP'),
  ('with', 'IN'),
  ('the', 'DT'),
  ('best', 'JJS'),
  ('way', 'NN'),
  ('!', '.'),
  ('!', '.'),
  ('!', '.')],
 [('I', 'PRP'),
  ('recommend', 'VBP'),
  ('this', 'DT'),
  ('company', 'NN'),
  ('and', 'CC'),
  ('this', 'DT'),
  ('guy', 'NN'),
  ('!', '.')],
 [('We', 'PRP'),
  ('thank', 'VBP'),
  ('you', 'PRP'),
  ('for', 'IN'),
  ('everything', 'NN'),
  ('!', '.')],
 [('!', '.'),
  ('wasted', 'VBN'),
  ('my', 'PRP$'),
  ('time', 'NN'),
  ('-', ':'),
  ('was', 'VBD'),
  ('shown', 'VBN'),
  ('an', 'DT'),
  ('apartment', 'NN'),
  ('that', 'IN'),
  ('i', 'NN'),
  ('loved', 'VBD'),
  (',', ','),
  ('got', 'VBD'),
  ('everything', 'NN'),
  ('filled', 'VBN'),
  ('out', 'RP'),
  ('even', 'RB'),
  ('made', 'VBD'),
  ('$', '$'),
  ('500', 'CD'),
  ('deposit', 'NN'),
  ('to', 'TO'),
  ('get', 'VB'),
  ('the', 'DT'),
  ('``', '``'),
  ('apartment', '

In [226]:
def extract_JJ(sent):
    grammar = r"""
    NP:
        {<JJ>}
    """
    chunker = nltk.RegexpParser(grammar)
    ne = set()
    chunk = chunker.parse(nltk.pos_tag(nltk.word_tokenize(sent)))
    for tree in chunk.subtrees(filter=lambda t: t.label() == 'NP'):
        ne.add(' '.join([child[0] for child in tree.leaves()]))
    return ne

In [227]:
keywords = extract_JJ(review_total)
keywords

{'Gorgeous',
 'solid',
 'north',
 'greedy',
 'w/o',
 'sorry',
 'electric',
 'ignored',
 'twelve',
 'like',
 'happy',
 'clean',
 'lasting',
 'rental',
 'quick',
 'motivated',
 'cafe',
 'trustworthy',
 'unexplained',
 'DISAPPOINTED',
 'competent',
 'ambitious',
 'available',
 'un-useful',
 'incredible',
 'classic',
 'rigid',
 'custodial',
 'family-owned',
 'filthy',
 'vacant',
 'sympathetic',
 'overhead',
 'slow',
 'thrilled',
 'cute',
 'agent',
 'additional',
 'communal',
 'uneven',
 'early',
 'lease',
 'shut-in',
 'sure',
 'spotless',
 'lucky',
 'GOOD',
 'double',
 'awesome',
 'dead',
 'useful',
 'eager',
 'low-pressure',
 'itemized',
 '1957-1963',
 'sewer',
 'generous',
 'petty',
 'downstairs',
 'thankful',
 'female',
 '24-hour',
 'extraordinary',
 'reflect',
 'skeptic',
 'holiday',
 'no-nonsense',
 'tight',
 'salary',
 'social',
 'rent-stabilized',
 'utter',
 'i',
 'neglected',
 'pleasurable',
 'electric/Somfy',
 'Modern',
 'responsible',
 'unfruitful',
 'Monthly',
 '6-8',
 'frequent

In [228]:
tf_score = {}
for each_word in keywords:
    each_word = each_word.replace('.','')
    if each_word not in stop_words:
        if each_word in tf_score:
            tf_score[each_word] += 1
        else:
            tf_score[each_word] = 1

# Dividing by total_word_length for each dictionary element
tf_score.update((x, y/int(total_word_length)) for x, y in tf_score.items())

In [229]:
idf_score = {}
for each_word in keywords:
    each_word = each_word.replace('.','')
    if each_word not in stop_words:
        if each_word in idf_score:
            idf_score[each_word] = check_sent(each_word, total_sentences)
        else:
            idf_score[each_word] = 1

# Performing a log and divide
idf_score.update((x, math.log(int(total_sent_len)/y)) for x, y in idf_score.items())


In [234]:
tf_idf_score = {key: tf_score[key] * idf_score.get(key, 0) for key in tf_score.keys()}
# print(tf_idf_score)
len(tf_idf_score)

1236

In [231]:
print(get_top_n(tf_idf_score, 50).keys())

dict_keys(['Gorgeous', 'solid', 'north', 'greedy', 'w/o', 'sorry', 'electric', 'ignored', 'twelve', 'like', 'happy', 'clean', 'lasting', 'rental', 'quick', 'motivated', 'cafe', 'trustworthy', 'unexplained', 'DISAPPOINTED', 'competent', 'ambitious', 'available', 'un-useful', 'incredible', 'classic', 'rigid', 'custodial', 'family-owned', 'filthy', 'vacant', 'sympathetic', 'overhead', 'slow', 'thrilled', 'cute', 'agent', 'additional', 'communal', 'uneven', 'early', 'lease', 'shut-in', 'sure', 'spotless', 'lucky', 'GOOD', 'double', 'awesome', 'dead'])


In [241]:
ordered_keywords = pd.DataFrame.from_dict(get_top_n(tf_idf_score, 1236), orient='index', columns=['score'])
ordered_keywords

Unnamed: 0,score
Gorgeous,0.000119
solid,0.000119
north,0.000119
greedy,0.000119
w/o,0.000119
...,...
well-kept,0.000119
serene,0.000119
5am-,0.000119
nyc,0.000015


In [236]:
ordered_keywords.to_csv('keywords.csv')