In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score



df_businesses = pd.read_csv('philly_restaurants_categories.csv')



chunks = pd.read_json('yelp_academic_dataset_review.json', lines=True,chunksize=100000)
df_reviews = pd.DataFrame()

for chunk in chunks:
    temp = chunk[chunk['business_id'].isin(df_businesses['business_id'])]
    df_reviews = pd.concat([df_reviews,temp])




merged_reviews = df_reviews[['business_id','text']].groupby('business_id').agg({'text': ' '.join})
print(merged_reviews['text'])



print("finished")


business_id
-AbzJTLQdbdQrhRzQLgsKA    Kingyo has summer appetizer specials for $3.00...
-HxLFWKVgXSU8JlR21PBkg    "Carluccio's: Way Delish"\n\nEver find yoursel...
-LmhsdQproqCf5EQoD06rQ    Down in south Philly for a meeting at my daugh...
-MkngKKkTIVfnUbq2S1ucQ    Dee-frickin'-licious thick crust pizza!!! I or...
-PMXnNJ1D67NkAupRHNkpQ    I've always preferred domino's over pizza hut!...
                                                ...                        
zUJMvjK6aBeQtVCowZ85-w    Excelent! Food is wondeful, hot, and fresh! Th...
zeounyPVXFZEz1c9KtptLA    Pizza, wings, steaks are good.. chicken finger...
zgX8sYCRGVJ9M5LETpJ60A    I popped in there one afternoon to pick up som...
zqisPpgCURrgLf4TVnI8RQ    This review is for poke bowl. Very fresh salad...
zzyx5x0Z7xXWWvWnZFuxlQ    Maybe the pizza is good here... but I can real...
Name: text, Length: 951, dtype: object
finished


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold


merged_reviews["text"] = merged_reviews["text"].str.lower()
merged_reviews["text"] = merged_reviews["text"].str.replace("[^a-zA-Z]", " ")

kf = KFold(n_splits=5, shuffle=True, random_state=0)

log_reg = LogisticRegression()
svm = SVC()
knn = KNeighborsClassifier()

accuracies = []
precisions = []
recalls = []
f1_scores = []
conf_matrices = []

for train_index, test_index in kf.split(merged_reviews):
    
    train_data = merged_reviews.iloc[train_index]
    test_data = merged_reviews.iloc[test_index]
    

    vectorizer = TfidfVectorizer(stop_words='english',max_df=0.6,max_features=1000)
    x_train = vectorizer.fit_transform(train_data['text'])
    y_train = train_data["text"]

    x_test = vectorizer.transform(test_data["text"])
    y_test = test_data["text"]



    #Logistic Regression
    log_reg.fit(x_train, y_train)
    y_pred = log_reg.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred,average='micro')
    rec = recall_score(y_test, y_pred,average='micro')
    f1 = f1_score(y_test, y_pred,average='micro')
    conf = confusion_matrix(y_test,y_pred)
    
    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1_scores.append(f1)
    conf_matrices.append(conf)


    #SVM
    svm.fit(x_train, y_train)
    y_pred = svm.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred,average='micro')
    rec = recall_score(y_test, y_pred,average='micro')
    f1 = f1_score(y_test, y_pred,average='micro')
    conf = confusion_matrix(y_test,y_pred)
    
    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1_scores.append(f1)
    conf_matrices.append(conf)



    #KNN
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred,average='micro')
    rec = recall_score(y_test, y_pred,average='micro')
    f1 = f1_score(y_test, y_pred,average='micro')
    conf = confusion_matrix(y_test,y_pred)

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1_scores.append(f1)
    conf_matrices.append(conf)



  merged_reviews["text"] = merged_reviews["text"].str.replace("[^a-zA-Z]", " ")
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [4]:
'''for i in range(15):
    print(i)
    print("Accuracies: ",accuracies[i])
    print("Precisions: ",precisions[i])
    print("recalls: ",recalls[i])
    print("f1: ",f1_scores[i])
    print("conf: ",conf_matrices[i])
'''
accuracies1 = np.array(accuracies).reshape(-1, 3)
precisions1 = np.array(precisions).reshape(-1, 3)
recalls1 = np.array(recalls).reshape(-1, 3)
f1_scores1 = np.array(f1_scores).reshape(-1, 3)
for i in range(len(conf_matrices)):
    print(len(conf_matrices[i]))
conf_matrices1 = np.array(conf_matrices).reshape(-1,3)

avg_accuracies = np.mean(accuracies1, axis=0)
avg_precisions = np.mean(precisions1, axis=0)
avg_recalls = np.mean(recalls1, axis=0)
avg_f1_scores = np.mean(f1_scores1, axis=0)
avg_conf_matrices = np.mean(conf_matrices1, axis=0)


print("avg accuracy: ", avg_accuracies)
print("avg precision: ", avg_precisions)
print("avg recall: ", avg_recalls)
print("avg f1-Score: ", avg_f1_scores)
print("avg confusion matrix: ", avg_conf_matrices)

329
321
297
330
330
291
329
330
305
330
323
291
334
327
299


  conf_matrices1 = np.array(conf_matrices).reshape(-1,3)


ValueError: operands could not be broadcast together with shapes (329,329) (330,330) 