In [4]:
import json
import pandas as pd

### Loading Business data

In [5]:
biz_file = open("/Users/ktxdev/Documents/MS in AI/Data Acquisition & Management/Datasets/yelp/yelp_academic_dataset_business.json")
biz_df = pd.DataFrame([json.loads(x) for x in biz_file.readlines()])
biz_file.close()

### Loading the reviews file

In [6]:
reviews_file = open("/Users/ktxdev/Documents/MS in AI/Data Acquisition & Management/Datasets/yelp/yelp_academic_dataset_review.json")
reviews_df = pd.DataFrame([json.loads(x) for x in reviews_file.readlines()])
reviews_file.close()

### Putting out Nightlife and Restaurants only

In [39]:
two_biz = biz_df[biz_df['categories'].apply(lambda x: any(x is not None and cat in x for cat in ['Nightlife', 'Restaurants']))]

### Joining all the reviews for the two types of business

In [41]:
two_biz_reviews = two_biz.merge(reviews_df, on='business_id', how='inner')

### Select features

In [43]:
two_biz_reviews = two_biz_reviews[['business_id', 'name', 'stars_y', 'text', 'categories']]

### Create Target column with Nightlife as True

In [57]:
two_biz_reviews['target'] = two_biz_reviews.apply(lambda x: 'Nightlife' in x['categories'], axis=1)

### Creating a balanced classification dataset

In [45]:
nightlife = two_biz_reviews[two_biz_reviews.apply(lambda x: 'Nightlife' in x['categories'], axis=1)]
restaurants = two_biz_reviews[two_biz_reviews.apply(lambda x: 'Rastaurants' in x['categories'], axis=1)]

nightlife_subset = nightlife.sample(frac=0.1, random_state=123)
restaurants_subset = restaurants.sample(frac=0.021, random_state=123)

two_biz_reviews_subset = pd.concat([nightlife_subset, restaurants_subset])

from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(two_biz_reviews_subset, train_size=0.7, random_state=123)

print(train_data.shape)
print(test_data.shape)


(107783, 6)
(46193, 6)


### Transform features

In [68]:
from sklearn.feature_extraction.text import CountVectorizer

bow_transform = CountVectorizer()
X_tr_bow = bow_transform.fit_transform(train_data['text'])
X_te_bow = bow_transform.transform(test_data['text'])

print(len(bow_transform.vocabulary_))

y_tr = train_data['target']
y_te = test_data['target']

67289
  (0, 64949)	1
  (0, 63617)	1
  (0, 3339)	1
  (0, 64690)	1
  (0, 60569)	2
  (0, 26478)	1
  (0, 28836)	1
  (0, 10094)	2
  (0, 60063)	1
  (0, 64765)	1
  (0, 42800)	1
  (0, 23481)	2
  (0, 60376)	1
  (0, 36697)	1
  (0, 49861)	1
  (0, 21782)	1
  (0, 19965)	1
  (0, 59847)	7
  (0, 65042)	1
  (0, 31929)	3
  (0, 63878)	1
  (0, 10092)	1
  (0, 53230)	1
  (0, 31873)	2
  (0, 21948)	1
  :	:
  (107782, 45190)	1
  (107782, 14870)	1
  (107782, 24850)	1
  (107782, 22839)	1
  (107782, 30719)	3
  (107782, 40472)	2
  (107782, 38161)	1
  (107782, 45609)	1
  (107782, 40855)	1
  (107782, 59184)	1
  (107782, 54170)	1
  (107782, 44647)	1
  (107782, 61650)	1
  (107782, 60099)	1
  (107782, 5122)	1
  (107782, 19402)	1
  (107782, 61043)	1
  (107782, 40118)	1
  (107782, 3138)	1
  (107782, 53281)	1
  (107782, 59574)	1
  (107782, 44286)	1
  (107782, 53648)	1
  (107782, 47034)	1
  (107782, 54953)	1


### Create the tf-idf representation using the bag-of-words matrix

In [50]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer(norm=None)

X_tr_tfidf = tfidf_transformer.fit_transform(X_tr_bow)
X_te_tfidf = tfidf_transformer.transform(X_te_bow)

# l2-normalize the bag-of-words representation
from sklearn.preprocessing import normalize

X_tr_l2 = normalize(X_tr_bow, axis=0)
X_te_l2 = normalize(X_te_bow, axis=0)

### Training logistic regression classifiers with default parameters

In [67]:
from sklearn.linear_model import LogisticRegression

def simple_logistic_classify(X_tr, y_tr, X_test, y_test, description):
    m = LogisticRegression(random_state=123).fit(X_tr, y_tr)
    s = m.score(X_test, y_test)
    print("Test score with", description, 'features:', s)
    return m

m1 = simple_logistic_classify(X_tr_bow, y_tr, X_te_bow, y_te, 'bow')
m2 = simple_logistic_classify(X_tr_l2, y_tr, X_te_l2, y_te, 'l2-normalized')
m3 = simple_logistic_classify(X_tr_tfidf, y_tr, X_te_tfidf, y_te, 'tf-idf')

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: True