In [14]:
from packages import *
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, r2_score
import matplotlib.pyplot as plt

In [2]:
def load_json(filename):
    """Load a JSON Lines file (one JSON document per line) into a list.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    list
        The decoded JSON value of every non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record; json.loads would raise on them, so skip them.
            if not line:
                continue
            records.append(json.loads(line))
    return records

In [3]:
# Read every record of business.json (under DATA_DIR) into a list via load_json.
# NOTE(review): `os` and `DATA_DIR` are not imported explicitly in the visible
# import cell — presumably they come from `from packages import *`; confirm.
business_data = load_json(os.path.join(DATA_DIR, 'business.json'))

In [3]:
# Read every record of review.json (under DATA_DIR) into a list via load_json.
# The whole file is held in memory as Python objects.
review_data = load_json(os.path.join(DATA_DIR, 'review.json'))

In [5]:
# Read every record of user.json (under DATA_DIR) into a list via load_json.
user_data = load_json(os.path.join(DATA_DIR,'user.json'))

In [4]:
# Split the review records into two parallel lists: the raw review text
# (model input) and the star rating (label), in the same order as review_data.
review_texts = [review['text'] for review in review_data]
stars = [review['stars'] for review in review_data]

In [5]:
# Bag-of-words featurizer with scikit-learn's default settings (no arguments passed).
vectorizer = CountVectorizer()

In [6]:
# Learn the vocabulary from, and vectorize, only the first 100,000 review texts.
# NOTE(review): the vocabulary is fitted on this whole slice, including the
# rows later used for evaluation (75,000 onward) — confirm this is intended.
bag_of_words = vectorizer.fit_transform(review_texts[:100000])

In [7]:
# Fit a linear SVM classifier on the first 75,000 vectorized reviews, using the
# matching star ratings as class labels. The trailing fit() call is also the
# cell's displayed value (the estimator repr).
# NOTE(review): no random_state is passed (the echoed repr shows
# random_state=None); consider fixing a seed if run-to-run reproducibility matters.
svc = LinearSVC()
svc.fit(bag_of_words[:75000], stars[:75000])



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [8]:
# Predict star labels for the rows from index 75,000 onward, i.e. the rows of
# bag_of_words that were not passed to svc.fit.
predictions = svc.predict(bag_of_words[75000:])

In [11]:
# Micro-averaged F1 of the predictions against the true stars for rows 75,000-99,999.
# Assumes bag_of_words has exactly 100,000 rows so both arguments have equal length.
f1_score(stars[75000:100000], predictions, average='micro')

0.59872

In [12]:
# Macro-averaged F1 on the same evaluation slice (rows 75,000-99,999).
f1_score(stars[75000:100000], predictions, average='macro')

0.49837494386614134

In [16]:
# R^2 between true and predicted stars on the evaluation slice; this treats the
# classifier's predicted labels as numeric values.
r2_score(stars[75000:100000], predictions)

0.5306071868489244

In [38]:
# Spot check: number of whitespace-separated tokens in the text of review_data[10].
len(review_data[10]['text'].split())

129

In [11]:
# Scratch value: a comma-separated string of category names.
# NOTE(review): `s` is not referenced by any other visible cell — confirm
# whether it is still needed.
s = 'Specialty Food, Restaurants, Dim Sum, Imported Food, Food, Chinese, Ethnic Food, Seafood'

In [18]:
# Word count (whitespace-separated tokens) of every review text, in the same
# order as review_data.
review_lengths = [len(review['text'].split()) for review in review_data]

In [53]:
# 94th percentile of the per-review word counts.
np.percentile(review_lengths, 94)

291.0

In [17]:
# Spot check: number of whitespace-separated tokens in the text of review_data[0].
len(review_data[0]['text'].split())

39

In [10]:
# Display the second business record to inspect its fields.
business_data[1]

{'business_id': 'QXAEGFB4oINsVuTFxEYKFQ',
 'name': 'Emerald Chinese Restaurant',
 'address': '30 Eglinton Avenue W',
 'city': 'Mississauga',
 'state': 'ON',
 'postal_code': 'L5R 3E7',
 'latitude': 43.6054989743,
 'longitude': -79.652288909,
 'stars': 2.5,
 'review_count': 128,
 'is_open': 1,
 'attributes': {'RestaurantsReservations': 'True',
  'GoodForMeal': "{'dessert': False, 'latenight': False, 'lunch': True, 'dinner': True, 'brunch': False, 'breakfast': False}",
  'BusinessParking': "{'garage': False, 'street': False, 'validated': False, 'lot': True, 'valet': False}",
  'Caters': 'True',
  'NoiseLevel': "u'loud'",
  'RestaurantsTableService': 'True',
  'RestaurantsTakeOut': 'True',
  'RestaurantsPriceRange2': '2',
  'OutdoorSeating': 'False',
  'BikeParking': 'False',
  'Ambience': "{'romantic': False, 'intimate': False, 'classy': False, 'hipster': False, 'divey': False, 'touristy': False, 'trendy': False, 'upscale': False, 'casual': True}",
  'HasTV': 'False',
  'WiFi': "u'no'",
 

In [8]:
# Display the first user record to inspect its fields.
# NOTE(review): the echoed output includes a user's name and friend IDs;
# consider clearing this output before sharing the notebook.
user_data[0]

{'user_id': 'l6BmjZMeQD3rDxWUbiAiow',
 'name': 'Rashmi',
 'review_count': 95,
 'yelping_since': '2013-10-08 23:11:33',
 'useful': 84,
 'funny': 17,
 'cool': 25,
 'elite': '2015,2016,2017',
 'friends': 'c78V-rj8NQcQjOI8KP3UEA, alRMgPcngYSCJ5naFRBz5g, ajcnq75Z5xxkvUSmmJ1bCg, BSMAmp2-wMzCkhTfq9ToNg, jka10dk9ygX76hJG0gfPZQ, dut0e4xvme7QSlesOycHQA, l4l5lBnK356zBua7B-UJ6Q, 0HicMOOs-M_gl2eO-zES4Q, _uI57wL2fLyftrcSFpfSGQ, T4_Qd0YWbC3co6WSMw4vxg, iBRoLWPtWmsI1kdbE9ORSA, xjrUcid6Ymq0DoTJELkYyw, GqadWVzJ6At-vgLzK_SKgA, DvB13VJBmSnbFXBVBsKmDA, vRP9nQkYTeNioDjtxZlVhg, gT0A1iN3eeQ8EMAjJhwQtw, 6yCWjFPtp_AD4x93WAwmnw, 1dKzpNnib-JlViKv8_Gt5g, 3Bv4_JxHXq-gVLOxYMQX0Q, ikQyfu1iViYh8T0us7wiFQ, f1GGltNaB7K5DR1jf3dOmg, tgeFUChlh7v8bZFVl2-hjQ, -9-9oyXlqsMG2he5xIWdLQ, Adj9fBPVJad8vSs-mIP7gw, Ce49RY8CKXVsTifxRYFTsw, M1_7TLi8CbdA89nFLlH4iw, wFsNv-hqbW_F5-IRqfBN6g, 0Q1L7zXHocaUZ2gsG2XJeg, cBFgmOCBdhYa0xoFEAzp_g, VrD_AgiFvzqtlR15vir3SQ, cpE-7HK514Sr5vpSen9CEQ, F1UYelhPFB-zIKlt0ygIZg, CQAL1hvsLMCzuJf9AglsXw, 1KnY1w