## Yelp Challenge

Dataset Documentation: <br>
https://www.yelp.com/dataset/documentation/main

In [33]:
import pandas as pd
import numpy as np
from cmfrec import CMF
import pycmf

from surprise import SVD
from surprise import accuracy
from surprise import Reader
from surprise import KNNBasic

import matplotlib.pyplot as plt
import tarfile
import json
from tqdm import tqdm

In [11]:
zf = tarfile.open('yelp_dataset.tar') 
#df = pd.read_csv(zf.open('intfile.csv'))
for name in zf.list():
    print (name)

?rw-r--r-- daniellg/users  138279749 2018-11-15 11:22:39 business.json 
?rw-r--r-- daniellg/users  408807658 2018-11-15 11:25:00 checkin.json 
?rw-r--r-- daniellg/users 5347475638 2018-11-15 11:35:37 review.json 
?rw-r--r-- daniellg/users  244535478 2018-11-15 11:26:18 tip.json 
?rw-r--r-- daniellg/users 2485747393 2018-11-15 11:24:48 user.json 
?rw-r--r-- daniellg/users   25661152 2019-01-11 19:06:09 photo.json 
?rw-r--r-- daniellg/users     101186 2019-01-14 11:31:35 Dataset_Challenge_Dataset_Agreement.pdf 
?rw-r--r-- daniellg/users     111822 2019-01-14 11:35:09 Yelp_Dataset_Challenge_Round_13.pdf 


TypeError: 'NoneType' object is not iterable

In [12]:
## Feel free to extract more files here
zf.extract("review.json")
zf.extract("business.json")
zf.extract("user.json")

In [6]:
# load business.json
# 192609 unique businesses?
line_count = len(open("./yelp_dataset/business.json").readlines())
business_ids, cities, states, latitudes, longitudes, stars, review_counts, attributes, categories = [], [], [], [], [], [], [], [], []
with open("./yelp_dataset/business.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        business_ids += [blob["business_id"]]
        cities += [blob["city"]]
        states += [blob["state"]]
        latitudes += [blob["latitude"]]
        longitudes += [blob["longitude"]]
        stars += [blob["stars"]]
        review_counts += [blob["review_count"]]
        attributes += [blob["attributes"]]
        categories += [blob["categories"]]
        
businesses = pd.DataFrame(
    {"business_id": business_ids, "city": cities, "state": states, "latitude": latitudes, "longitude": longitudes, "stars": stars, "review_counts": review_counts, "attributes": attributes, "categories":categories }
)

100%|██████████| 192609/192609 [00:02<00:00, 74126.15it/s]


In [7]:
# load user.json
# 1637138 unique users?
line_count = len(open("./yelp_dataset/user.json").readlines())
users, review_counts, elites, average_stars, friends = [], [], [], [], []
with open("./yelp_dataset/user.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        users += [blob["user_id"]]
        review_counts += [blob["review_count"]]
        elites += [blob["elite"]]
        average_stars += [blob["average_stars"]]
        friends += [blob["friends"]]
        
users = pd.DataFrame(
    {"user_id": users, "review_count": review_counts,"elite": elites, "average_stars": average_stars, "friends": friends}
)

100%|██████████| 1637138/1637138 [00:20<00:00, 81636.78it/s] 


In [8]:
# load review.json
# 6685900 unique reviews?
line_count = len(open("./yelp_dataset/review.json").readlines())
user_ids, business_ids, stars, dates, texts = [], [], [], [], []
with open("./yelp_dataset/review.json") as f:
    for line in tqdm(f, total=line_count):
        blob = json.loads(line)
        user_ids += [blob["user_id"]]
        business_ids += [blob["business_id"]]
        stars += [blob["stars"]]
        dates += [blob["date"]]
        texts += [blob["text"]]
reviews = pd.DataFrame(
    {"user_id": user_ids, "business_id": business_ids, "rating": stars, "date": dates, "text": texts}
)
user_counts = reviews["user_id"].value_counts()
active_users = user_counts.loc[user_counts >= 5].index.tolist()
reviews = reviews.loc[reviews.user_id.isin(active_users)]

100%|██████████| 6685900/6685900 [00:58<00:00, 114841.67it/s]


In [11]:
ratings_holdout = pd.read_csv('data/ratings_sample_holdout.csv')

In [12]:
ratings_train = pd.read_csv('data/ratings_sample_train.csv')

In [13]:
ratings_val = pd.read_csv('data/ratings_sample_cv.csv')

In [14]:
def process(df):
    df = df.drop(df.columns[0], axis =1)
    df['date']  = pd.to_datetime(df['date'])
    df['week_day'] = df['date'].dt.weekday
    df['month'] = df['date'].dt.month
    df['hour'] = df['date'].dt.hour
    df = df.merge(users, on = 'user_id')
    df = df.merge(businesses, on = 'business_id')
    return df

In [15]:
# states with less than 10 restaurants are already filtered out
ratings_train = process(ratings_train.copy())

In [16]:
ratings_holdout = process(ratings_holdout.copy())

In [17]:
ratings_val = process(ratings_val.copy())

In [19]:
print('There are {0} rows, {1} columns in training set.'.format(ratings_train.shape[0], ratings_train.shape[1]))
print('There are {0} rows, {1} columns in holdout set.'.format(ratings_holdout.shape[0], ratings_holdout.shape[1]))
print('There are {0} rows, {1} columns in validation set.'.format(ratings_val.shape[0], ratings_val.shape[1]))

There are 406042 rows, 20 columns in training set.
There are 28612 rows, 20 columns in holdout set.
There are 28615 rows, 20 columns in validation set.


In [20]:
ratings_train.shape

(406042, 20)

## Baseline: SVD as Matrix Factorization

As a CF algorithm. A matrix factorization technique that reduces the number of features of a data set by reducing space dimensions from N to K where K < N. Thus, in our context, we are finding 2 matrices whose product is the original matrix. 

In [21]:
algo = SVD()

In [28]:
trainset = ratings_train.iloc[:,0:3]
trainset.columns = ['userID', 'itemID','rating']
trainset

Unnamed: 0,userID,itemID,rating
0,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5.0
1,bLPmbJ0x70HSHfD-2n_smw,WTqjgwHlXbSFevF32_DJVw,1.0
2,n6-Gk65cPZL6Uz8qRm3NYw,30Q5xBagQHmkwp8Q9I1FCg,5.0
3,wJgtf18VKKZq9A6ZQUybQw,30Q5xBagQHmkwp8Q9I1FCg,4.0
4,jqyS91iI63nyztPAvRk1AQ,30Q5xBagQHmkwp8Q9I1FCg,4.0
5,ktm2YMsb65tRnePcYpVlqg,30Q5xBagQHmkwp8Q9I1FCg,4.0
6,CHUMBPZdstb0OMc26jlldg,30Q5xBagQHmkwp8Q9I1FCg,4.0
7,CHUMBPZdstb0OMc26jlldg,30Q5xBagQHmkwp8Q9I1FCg,3.0
8,iOR5mdeMaR36VbA76IGbMQ,30Q5xBagQHmkwp8Q9I1FCg,2.0
9,X7NHc1i4LgqCPiltnuvR1g,30Q5xBagQHmkwp8Q9I1FCg,5.0


In [31]:
testset = ratings_holdout.iloc[:, 0:3]
testset.columns = ['userID', 'itemID','rating']
testset

Unnamed: 0,userID,itemID,rating
0,n6-Gk65cPZL6Uz8qRm3NYw,hk5wpV-_pi5jmDDVPeG8DA,5.0
1,FIk4lQQu1eTe2EpzQ4xhBA,jLxeBgWhLRbII2ACkgH1Sg,4.0
2,9cOyDUT20H7iTxU9CU8I6Q,jLxeBgWhLRbII2ACkgH1Sg,5.0
3,_N7Ndn29bpll_961oPeEfw,O-b5osM0NO4f31dp6_DatQ,3.0
4,453V8MlGr8y61PpsDAFjKQ,O-b5osM0NO4f31dp6_DatQ,5.0
5,oYYxqbRWb92oBajOQP8NlA,O-b5osM0NO4f31dp6_DatQ,5.0
6,GMkjaAJ3puIf0zn1E1TVXQ,O-b5osM0NO4f31dp6_DatQ,5.0
7,C_hUvw2z0R-Rv0yZb6QCZA,O19VReN1I2TBrJsbXUAIJg,5.0
8,Y0UHmlB06TZ1OQuQoQ4dxw,O19VReN1I2TBrJsbXUAIJg,5.0
9,bCpimZ6jX8SRX7woLu09eQ,O19VReN1I2TBrJsbXUAIJg,5.0


In [38]:
trainset

Unnamed: 0,userID,itemID,rating
0,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5.0
1,bLPmbJ0x70HSHfD-2n_smw,WTqjgwHlXbSFevF32_DJVw,1.0
2,n6-Gk65cPZL6Uz8qRm3NYw,30Q5xBagQHmkwp8Q9I1FCg,5.0
3,wJgtf18VKKZq9A6ZQUybQw,30Q5xBagQHmkwp8Q9I1FCg,4.0
4,jqyS91iI63nyztPAvRk1AQ,30Q5xBagQHmkwp8Q9I1FCg,4.0
5,ktm2YMsb65tRnePcYpVlqg,30Q5xBagQHmkwp8Q9I1FCg,4.0
6,CHUMBPZdstb0OMc26jlldg,30Q5xBagQHmkwp8Q9I1FCg,4.0
7,CHUMBPZdstb0OMc26jlldg,30Q5xBagQHmkwp8Q9I1FCg,3.0
8,iOR5mdeMaR36VbA76IGbMQ,30Q5xBagQHmkwp8Q9I1FCg,2.0
9,X7NHc1i4LgqCPiltnuvR1g,30Q5xBagQHmkwp8Q9I1FCg,5.0


In [42]:
trainset[['userID','itemID','rating']]

Unnamed: 0,userID,itemID,rating
0,n6-Gk65cPZL6Uz8qRm3NYw,WTqjgwHlXbSFevF32_DJVw,5.0
1,bLPmbJ0x70HSHfD-2n_smw,WTqjgwHlXbSFevF32_DJVw,1.0
2,n6-Gk65cPZL6Uz8qRm3NYw,30Q5xBagQHmkwp8Q9I1FCg,5.0
3,wJgtf18VKKZq9A6ZQUybQw,30Q5xBagQHmkwp8Q9I1FCg,4.0
4,jqyS91iI63nyztPAvRk1AQ,30Q5xBagQHmkwp8Q9I1FCg,4.0
5,ktm2YMsb65tRnePcYpVlqg,30Q5xBagQHmkwp8Q9I1FCg,4.0
6,CHUMBPZdstb0OMc26jlldg,30Q5xBagQHmkwp8Q9I1FCg,4.0
7,CHUMBPZdstb0OMc26jlldg,30Q5xBagQHmkwp8Q9I1FCg,3.0
8,iOR5mdeMaR36VbA76IGbMQ,30Q5xBagQHmkwp8Q9I1FCg,2.0
9,X7NHc1i4LgqCPiltnuvR1g,30Q5xBagQHmkwp8Q9I1FCg,5.0


In [43]:
# transform to work with surprise
reader = Reader(rating_scale = (0.0, 5.0))
train_data = Dataset.load_from_df(trainset[['userID','itemID','rating']], reader)
test_data = Dataset.load_from_df(testset[['userID','itemID','rating']], reader)

In [48]:
train_sr = train_data.build_full_trainset()

In [49]:
test_sr_before = test_data.build_full_trainset()

In [50]:
test_sr = test_sr_before.build_testset()

In [51]:
algo.fit(train_sr)
predictions = algo.test(test_sr)
accuracy.rmse(predictions)

RMSE: 1.3877


1.3877083808706543