In [19]:
import pandas as pd

data =  pd.read_csv(r'yelp_review.csv',encoding = "utf8",keep_default_na=False,nrows=200000)


from sqlalchemy import create_engine
engine = create_engine('sqlite://', echo=False)
data.to_sql('yelp_reviews',con=engine)

data.head()

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0


In [20]:
data.shape

(200000, 9)

In [21]:
reviews = data[:45000]["text"]
reviews.head()

0    Super simple place but amazing nonetheless. It...
1    Small unassuming place that changes their menu...
2    Lester's is located in a beautiful neighborhoo...
3    Love coming here. Yes the place always needs t...
4    Had their chocolate almond croissant and it wa...
Name: text, dtype: object

In [22]:
import string
from nltk.corpus import stopwords

def tokenize_reviews(review):
    rev = [char for char in review if char not in string.punctuation]
    rev = ''.join(rev)
    
    return [word for word in rev.split() if word.lower() not in stopwords.words('english')]

print(tokenize_reviews(reviews[0]))

['Super', 'simple', 'place', 'amazing', 'nonetheless', 'around', 'since', '30s', 'still', 'serve', 'thing', 'started', 'bologna', 'salami', 'sandwich', 'mustard', 'Staff', 'helpful', 'friendly']


In [23]:
rev_tokens = []
dataset = []
for review in reviews:
    if len(review) > 10:
        rev_tokens.append(tokenize_reviews(review))
        dataset.append(review)

In [24]:
print(rev_tokens[44998])

['New', 'town', 'wasnt', 'sure', 'store', 'would', 'satisfyso', 'went', 'convenience', 'Smiths', 'small', 'shopping', 'centerstrip', 'mall', 'kitty', 'corner', 'hotel', 'Best', 'Western', 'Ive', 'twice', 'week', 'lucked', 'times', 'great', 'parking', 'Unfamiliar', 'layout', 'store', 'simply', 'started', 'one', 'end', 'worked', 'way', 'side', 'nice', 'wine', 'selection', 'conveniently', 'located', 'next', 'fancy', 'cheese', 'area', 'Smiths', 'pretty', 'much', 'standard', 'grocery', 'store', 'produce', 'deli', 'bakery', 'grocery', 'health', 'beauty', 'oh', 'pharmacy', 'somewhat', 'confused', 'random', 'Kroger', 'items', 'display', 'throughout', 'store', 'inquired', 'clearance', 'near', 'expired', 'items', 'Kroger', 'owns', 'Smiths', 'pleasantly', 'surprised', 'able', 'use', 'Kroger', 'plus', 'card', 'applicable', 'discounts', 'cashier', 'first', 'time', 'warm', 'welcoming', 'wished', 'best', 'new', 'home', 'breath', 'fresh', 'air', 'someone', 'friendly', 'wait', 'enjoy', 'farmers', 'mark

In [25]:
file = open(r'pos_words.txt')
positive  = file.read()
positive = positive.lower()
file = open(r'neg_words.txt')
negative  = file.read()

In [26]:
def assign_score(token):
    score = []
    for word in token:
        if word in positive:
            score.append(1)
        elif word in negative:
            score.append(-1)
        else:
            score.append(0)
    return score

In [27]:
rev_scores = []
for token in rev_tokens:
    rev_scores.append(assign_score(token))

In [164]:
from statistics import mean
labels = []
pos = neg = neut = 0         
for score in rev_scores:
    if score:
        val =  mean(score)
        if val > 0.29:
            labels.append(1)
            pos += 1
        elif 0.212 <= val <= 0.29:
            labels.append(0)
            neut += 1
        else:
            labels.append(-1)
            neg += 1
    else:
        labels.append(0)

print(pos,neg,neut)

14913 14465 15621


In [186]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(dataset,labels,test_size=0.25,random_state=42)

In [187]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer

nb = Pipeline([('vect', CountVectorizer(analyzer = 'word',lowercase = True,stop_words='english')),
               ('tfidf', TfidfTransformer(smooth_idf=True)),
               ('clf', LogisticRegression(penalty='l2',solver='newton-cg',multi_class='multinomial')),
              ])
nb.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        ...ty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False))])

In [122]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(nb, x_train,y_train , cv=5)
print(scores)

[0.67022222 0.66148148 0.66162963 0.65837037 0.65357831]


In [167]:
y_pred = nb.predict(x_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.6728

In [168]:
y_pred = []
y_pred = nb.predict(x_test)
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          -1       0.71      0.71      0.71      3615
           0       0.56      0.55      0.56      3835
           1       0.74      0.76      0.75      3800

   micro avg       0.67      0.67      0.67     11250
   macro avg       0.67      0.67      0.67     11250
weighted avg       0.67      0.67      0.67     11250



In [169]:
import heapq
counter = []
counter = engine.execute("SELECT COUNT(*) AS count,business_id FROM yelp_reviews GROUP BY business_id").fetchall()
rest = heapq.nlargest(10,counter)

In [173]:
results = []
correct = 0
for count,id in rest:
    data_base = []
    stars = []
    prediction = ""
    data_base = engine.execute("SELECT text from yelp_reviews where business_id = ?",(id)).fetchall()
    stars = engine.execute("SELECT stars as INTEGER from yelp_reviews where business_id = ?",(id)).fetchall()
    new_db = []
    stars_db = []
    for data in data_base:
        new_db.append(str(data))

    for star in stars:
        stars_db.append(star[0])
        
    ypred = nb.predict(new_db)
    y_pred_new = []
    for pred in ypred:
        if pred == 0:
            y_pred_new.append(3.5)
        elif pred == -1:
            y_pred_new.append(1.5)
        else:
            y_pred_new.append(5)
        
#         print(y_pred_new)
    predicted = mean(y_pred_new)
    actual = mean(stars_db)
    if abs(predicted - actual) < 0.7:
        prediction = "correct"
        correct += 1
    else:
        prediction = "wrong"
    results.append([predicted,actual,id,prediction])

In [174]:
from tabulate import tabulate
print(tabulate(results,headers=['Predicted', 'Actual', 'Business_id','Result'], tablefmt='orgtbl'))

|   Predicted |   Actual | Business_id            | Result   |
|-------------+----------+------------------------+----------|
|     3.31151 |  3.63095 | RESDUcs7fIiihp38-d6_6g | correct  |
|     3.46875 |  4.11667 | 4JNXUYY8wbaaDmk3BPzlWw | correct  |
|     3.369   |  3.71616 | K7lWdNUhCbcnEvI0NhGewg | correct  |
|     3.13636 |  3.9798  | cYwJA2A6I12KNkm2rtXd5g | wrong    |
|     2.97135 |  4.27083 | DkYS3arLOhA8si5uUEmHOw | wrong    |
|     3.01453 |  3.90116 | f4x1YBxkLrZg652xt2KR5g | wrong    |
|     2.73052 |  3.9026  | 5LNZ67Yw9RD6nf4_UhXOjw | wrong    |
|     3.02632 |  3.40132 | SMPbvZLSMMb7KU76YNYMGg | correct  |
|     3.21812 |  3.44295 | ujHiaprwCQ5ewziu0Vi9rw | correct  |
|     3.12847 |  3.61806 | 2weQS-RnoOBhb1KsHKyoSQ | correct  |
