In [12]:
import pandas as pd
import json

# load the data from the JSON file
with open('yelp_academic_dataset_business.json', 'r', encoding='utf8') as f:
    data = [json.loads(line) for line in f]

# create a DataFrame from the data
business_df = pd.DataFrame(data)

# optionally, you can drop any columns that you don't need
business_df = business_df.drop(columns=['hours', 'attributes', 'address', 'city', 'state', 'postal_code', 'is_open', 'name'])

# print the first few rows of the DataFrame
print(business_df.head())

              business_id   latitude   longitude  stars  review_count  \
0  Pns2l4eNsfO8kk83dixA6A  34.426679 -119.711197    5.0             7   
1  mpf3x-BjTdTEA3yCZrAYPw  38.551126  -90.335695    3.0            15   
2  tUFrWirKiKi_TAnsVWINQQ  32.223236 -110.880452    3.5            22   
3  MTSW4McQd7CbVtyjqoe9mw  39.955505  -75.155564    4.0            80   
4  mWMc6_wTdE0EUBKIGXDVfA  40.338183  -75.471659    4.5            13   

                                          categories  
0  Doctors, Traditional Chinese Medicine, Naturop...  
1  Shipping Centers, Local Services, Notaries, Ma...  
2  Department Stores, Shopping, Fashion, Home & G...  
3  Restaurants, Food, Bubble Tea, Coffee & Tea, B...  
4                          Brewpubs, Breweries, Food  


In [13]:
# Trim spaces from the category names
business_df['categories'] = business_df['categories'].str.strip()

# Split the categories column into multiple columns using one-hot encoding
categories = business_df['categories'].str.get_dummies(sep=',')

# Combine the original dataframe with the new one-hot encoded columns
business_df = pd.concat([business_df, categories], axis=1)

# Drop the original categories column
business_df.drop('categories', axis=1, inplace=True)

print(business_df.head())

KeyboardInterrupt: 

In [10]:
print(business_df.dtypes)
# Identify the columns that were one-hot encoded
one_hot_cols = [col for col in business_df.columns if business_df[col].dtype == 'int64']

# Sum the one-hot encoded columns to get the total count for each category
category_counts = business_df[one_hot_cols].sum()

# Sort the counts in descending order
category_counts = category_counts.sort_values(ascending=False)

# Print the category counts
print(category_counts)

business_id      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
                 ...   
Wraps             int64
Yelp Events       int64
Yoga              int64
Ziplining         int64
Zoos              int64
Length: 2459, dtype: object
review_count            6745508
 Restaurants              36978
 Food                     20998
 Shopping                 18915
Restaurants               15290
                         ...   
Lahmacun                      1
Land Surveying                1
 Concept Shops                1
Gemstones & Minerals          1
Skiing                        1
Length: 2455, dtype: int64


In [2]:
# load the data from the JSON file
with open('yelp_academic_dataset_review.json', 'r', encoding='utf8') as f:
    data = [json.loads(line) for line in f]

# create a DataFrame from the data
review_df = pd.DataFrame(data)

# drop columns
review_df = review_df.drop(columns=['review_id', 'useful', 'funny', 'cool', 'text', 'date'])

# print the first few rows of the DataFrame
print(review_df.head())

                  user_id             business_id  stars
0  mh_-eMZ6K5RLWhZyISBhwA  XQfwVwDr-v0ZS3_CbbE5Xw    3.0
1  OyoGAe7OKpv6SyGZT5g77Q  7ATYjTIgM3jUlt4UM3IypQ    5.0
2  8g_iMtfSiwikVnbP2etR0A  YjUWPpI6HXG530lwP-fb2A    3.0
3  _7bHUi9Uuf5__HHc_Q8guQ  kxX2SOes4o-D3ZQBkiMRfA    5.0
4  bcjbaE6dDog4jkNY91ncLQ  e4Vwtrqf-wpJfwesgvdgxQ    4.0


In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Merge the dataframes on the 'business_id' column
merged_df = pd.merge(business_df, review_df, on='business_id', how='inner')

# Create a pivot table of the merged dataframe to create the user-item matrix
user_item_matrix = pd.pivot_table(merged_df, values='stars', index='user_id', columns='business_id')

# Split the user-item matrix into training and test sets
train_data, test_data = train_test_split(user_item_matrix, test_size=0.2)

# Calculate the cosine similarity between users
user_similarity = cosine_similarity(train_data.fillna(0))

# Define the number of recommended businesses
n = 10

# Calculate the cosine similarity between businesses
business_similarity = cosine_similarity(train_data.T.fillna(0))

# Create a dictionary to store the recommended businesses for each business_id
recommended_businesses = {}

# Iterate through each business_id and find the top n similar businesses based on user ratings
for business_id in train_data.columns:
    similar_businesses = business_similarity[business_id].argsort()[-(n+1):-1]
    recommended_businesses[business_id] = list(train_data.columns[similar_businesses])

# Define a function to calculate the mean average precision at k for a user
def mapk(actual, predicted, k):
    if len(predicted) > k:
        predicted = predicted[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)
    if not actual:
        return 0.0
    else:
        return score / min(len(actual), k)

# Evaluate the model on the test set
k = 10 # number of recommended businesses
mapk_scores = []

for user_id in test_data.index:
    actual_businesses = list(test_data.loc[user_id].dropna().index)
    if len(actual_businesses) >= k:
        recommended_businesses = []
        for business_id in train_data.columns:
            if business_id not in actual_businesses:
                predicted_rating = np.dot(user_similarity[np.where(train_data.index == user_id)[0][0]],
                                          train_data[business_id].fillna(0)) / np.sum(user_similarity[np.where(train_data.index == user_id)[0][0]])
                recommended_businesses.append((business_id, predicted_rating))
        recommended_businesses.sort(key=lambda x: x[1], reverse=True)
        recommended_businesses = [x[0] for x in recommended_businesses[:k]]
        score = mapk(actual_businesses, recommended_businesses, k)
        mapk_scores.append(score)

# Calculate the mean average precision at k for the test set
mean_apk = np.mean(mapk_scores)
print(f'Mean average precision at k = {k} on the test set: {mean_apk:.4f}')


KeyError: 'stars'