In [25]:
# packages
import gzip
import numpy as np
import pandas as pd
import json
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from collections import defaultdict

In [3]:
def parse(path):
  g = gzip.open(path, 'r')
  for l in g:
    yield json.loads(l)

In [4]:

review_filepath = "../assignment2/review-Idaho.json.gz"
meta_filepath = "../assignment2/meta-Idaho.json.gz"

reviews = []
bizs = []
for l in parse(review_filepath):
    reviews.append(l)

for l in parse(meta_filepath):
    bizs.append(l)

In [5]:
# from eda notebook

b_avg_ratings = []
b_num_reviews = []
b_num_similar_bizs = []
b_prices = []
b_states = [] # i.e. None, 'Closed ⋅ Opens 10AM Fri' , 'Permanently closed' , 'Open ⋅ Closes 5PM' , maybe more types
# ^ only listed once per biz, not at time of each review meaning the open/closed is not always true for every one of it's reviews, but maybe 'permanently closed' would help predict time
b_has_prices = []
for b in bizs:
    b_avg_ratings.append(b['avg_rating'])
    b_num_reviews.append(b['num_of_reviews'])
    b_num_similar_bizs.append((0 if b['relative_results'] == None else len(b['relative_results'])))
    if b['price'] == None:
        b_has_prices.append(0)
        b_prices.append(b['price']) # missing data
    else:
        b_has_prices.append(1)
        b_prices.append(len(b['price']))
    try:
        b_states.append(b['state']) # maybe split into counts of each one when analyzing
    except Exception as e:
        b_states.append(None)
r_ratings = []
r_time_difs = []
r_num_pics = []
r_has_response = []
r_resp_times = []
for r in reviews:
    r_ratings.append(r['rating'])
    try:
        r_time_difs.append(r['resp']['time'] - r['time']) #Nones might cause errors or NaNs, handle later
        r_resp_times.append(r['resp']['time'])
        r_has_response.append(1)
    except Exception as e:
        r_time_difs.append(None)
        r_resp_times.append(None)
        r_has_response.append(0)
    try:
        r_num_pics.append(len(r['pics']))
    except Exception as e:
        r_num_pics.append(0)

In [6]:
reviews_with_responses = [review for review in reviews if review.get("resp")]
print(len(reviews_with_responses))
reviews_with_responses = [review for review in reviews 
                          if ((review.get("resp")) and (review['resp']['time'] - review['time'] > 0))]
len(reviews_with_responses)

729928


700132

In [7]:
for r in reviews_with_responses:
    r['resp_time'] = r['resp']['time'] - r['time']

In [8]:
reviews_with_responses[0]

{'user_id': '115114032166130224762',
 'name': 'Mariah Schaeffer',
 'time': 1589989743506,
 'rating': 5,
 'text': 'Kevin and Shannon are amazing! They were very sweet and made the job fast! Also they are one of the cheeper junk removal services! I will for sure use again!',
 'pics': None,
 'resp': {'time': 1591139557205,
  'text': 'Thank you for hiring us for your junk removal!'},
 'gmap_id': '0x54afb4c19c4bffff:0x9389114191ca2781',
 'resp_time': 1149813699}

# Model #1 - linear regression

In [9]:
def feature(d):
    feat = [1]
    try:
        text_length = len(d['text'])
    except (KeyError, TypeError):
        text_length = 0 
    feat.append(text_length)
    feat.append(d.get('rating', 0))
    feat.append(d.get('time', 0))
    return feat

In [10]:

X = [feature(r) for r in reviews_with_responses]
y = [r['resp_time'] for r in reviews_with_responses]


In [11]:
model = sklearn.linear_model.LinearRegression(fit_intercept=False)
model.fit(X, y)

In [12]:
y_pred = model.predict(X)
sse = sum([x**2 for x in (y - y_pred)])
mse = sse / len(y)
mse

1.3744326911207206e+20

# Advanced Model

In [62]:
def feat(d):
    feat = [1]
    try:
        text_length = len(d['text'])
    except (KeyError, TypeError):
        text_length = 0 
    feat.append(text_length)
    feat.append(d.get('rating', 0))
    feat.append(d.get('time', 0))
    feat.append(d.get('resp_time', 0))
    gmap_id = d.get('gmap_id')
    if gmap_id in b and isinstance(b[gmap_id], dict):
        avg_rating = b[gmap_id].get('avg_rating', 0)
    else:
        avg_rating = 0  # Default if gmap_id is invalid or not in b
    
    feat.append(avg_rating)
    

In [63]:
def feat2(d):
    feat = [1]
    feat.append(d.get('avg_rating', 0))

In [64]:
X = [feat(r) for r in reviews_with_responses]
y = [r['resp_time'] for r in reviews_with_responses]

In [66]:
X

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [65]:
model2 = sklearn.linear_model.LinearRegression(fit_intercept=False)
model2.fit(X, y)

ValueError: Expected 2D array, got 1D array instead:
array=[None None None ... None None None].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [17]:
y_pred = model.predict(X)
sse = sum([x**2 for x in (y - y_pred)])
mse = sse / len(y)
mse

1.3744326911207206e+20

# Scratch

In [19]:
import numpy as np
resp_time_lst = [d['resp_time'] / 1000 / 60 / 60 / 24 for d in reviews_with_responses]
np.quantile(resp_time_lst, [0.25, 0.5, 0.75])

array([0.46475142, 1.77545962, 7.78040038])

In [20]:
resp_time_lst

[13.308028923611111,
 6.845580023148147,
 40.164034745370365,
 308.1323496875,
 0.18931494212962963,
 0.9109248726851852,
 248.98713701388888,
 6.388242731481481,
 336.3063022453704,
 313.9500467824074,
 10.783241087962963,
 31.447186747685183,
 152.15838922453705,
 1.2255548032407406,
 72.23770158564815,
 0.013149722222222222,
 0.02466456018518519,
 12.749598182870372,
 0.1768450810185185,
 0.7330953587962963,
 3.2862648379629626,
 0.14779,
 0.07627248842592593,
 0.08757020833333334,
 0.01581945601851852,
 13.260755254629629,
 0.013149722222222222,
 13.260755254629629,
 72.23770158564815,
 3.2862648379629626,
 10.783241087962963,
 0.08757020833333334,
 0.02035837962962963,
 0.7217396875,
 12.749598182870372,
 1.2255548032407406,
 0.32887030092592595,
 18.36666894675926,
 0.10357716435185182,
 340.26908748842595,
 0.009868726851851852,
 0.019788530092592592,
 0.031149456018518522,
 0.01042212962962963,
 0.2061985300925926,
 314.0200500925926,
 0.003111643518518519,
 0.12248881944444444

In [22]:
bizs[0]

{'name': 'Montour WMA - Idaho Fish and Game',
 'address': 'Montour WMA - Idaho Fish and Game, Montour Rd, Emmett, ID 83617',
 'gmap_id': '0x54af0dfadab474e1:0x3ae5c949132941d8',
 'description': None,
 'latitude': 43.9295808,
 'longitude': -116.3333273,
 'category': ['Hunting area'],
 'avg_rating': 4.4,
 'num_of_reviews': 17,
 'price': None,
 'hours': None,
 'MISC': {'Accessibility': ['Wheelchair accessible parking lot']},
 'state': None,
 'relative_results': ['0x54af0df970009eab:0x998e663a1d2ea45',
  '0x54af0e08fb885aed:0xf626feadb2775442',
  '0x54afa0ce7af7268b:0x4b98fe1767238dc4',
  '0x54afc0324c4ca65b:0x808e759ecacfab1c',
  '0x54afb8ee4d09b825:0x50112376cec8c0ff'],
 'url': 'https://www.google.com/maps/place//data=!4m2!3m1!1s0x54af0dfadab474e1:0x3ae5c949132941d8?authuser=-1&hl=en&gl=us'}

In [31]:
reviews[5]

{'user_id': '108480743392575424354',
 'name': 'Dan Shadix',
 'time': 1525705287828,
 'rating': 5,
 'text': 'Love this float on paddle boards.  Nice flat water and fairly slow.',
 'pics': None,
 'resp': None,
 'gmap_id': '0x54af0dfadab474e1:0x3ae5c949132941d8'}

In [47]:
business_reviews = defaultdict(lambda: None)
for r in reviews:
    if r['resp'] and 'time' in r['resp']:
        business_reviews[r['gmap_id']] = r['resp']['time'] - r['time']

In [51]:
uniq_vals, counts = np.unique([r['gmap_id'] for r in reviews], return_counts=True)
len(uniq_vals[counts > 1])

31616

In [52]:
len(reviews)

3892636

In [53]:
31616/3892636*100

0.8122002673766568