In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler

In [25]:
# Function to load JSON lines into a DataFrame
def load_json_to_df(json_path, fields=None):
    data = []
    with open(json_path, 'r', encoding='utf-8') as f:
        for line in f:
            row = json.loads(line)
            if fields:
                row = {field: row.get(field, None) for field in fields}
            data.append(row)
    return pd.DataFrame(data)

# Load yelp_train.csv
yelp_train_csv_path = './data/yelp_train.csv'  # Update this path if necessary
yelp_train_df = pd.read_csv(yelp_train_csv_path)
print("Yelp Train DataFrame Loaded:")
display(yelp_train_df.head())


Yelp Train DataFrame Loaded:


Unnamed: 0,user_id,business_id,stars
0,vxR_YV0atFxIxfOnF9uHjQ,gTw6PENNGl68ZPUpYWP50A,5.0
1,o0p-iTC5yTBV5Yab_7es4g,iAuOpYDfOTuzQ6OPpEiGwA,4.0
2,-qj9ouN0bzMXz1vfEslG-A,5j7BnXXvlS69uLVHrY9Upw,2.0
3,E43QxgV87Ij6KxMCHcijKw,jUYp798M93Mpcjys_TTgsQ,5.0
4,T13IBpJITI32a1k41rc-tg,3MntE_HWbNNoyiLGxywjYA,5.0


In [27]:
# Load business.json
business_fields = [
    'business_id', 'stars', 'review_count', 'is_open',
    'categories', 'attributes', 'latitude', 'longitude',
    'city', 'state', 'neighborhood'
]
business_json_path = './data/business.json'  # Update this path if necessary
business_df = load_json_to_df(business_json_path, business_fields)
print("\nBusiness DataFrame Loaded:")
display(business_df.head())



Business DataFrame Loaded:


Unnamed: 0,business_id,stars,review_count,is_open,categories,attributes,latitude,longitude,city,state,neighborhood
0,Apn5Q_b6Nz61Tq4XzPdf9A,4.0,24,1,"Tours, Breweries, Pizza, Restaurants, Food, Ho...","{'BikeParking': 'False', 'BusinessAcceptsCredi...",51.091813,-114.031675,Calgary,AB,
1,AjEbIBw6ZFfln7ePHha9PA,4.5,3,0,"Chicken Wings, Burgers, Caterers, Street Vendo...","{'Alcohol': 'none', 'BikeParking': 'False', 'B...",35.960734,-114.939821,Henderson,NV,
2,O8S5hYJ1SMc8fA4QBtVujA,4.0,5,0,"Breakfast & Brunch, Restaurants, French, Sandw...","{'Alcohol': 'beer_and_wine', 'Ambience': '{'ro...",45.540503,-73.5993,Montréal,QC,Rosemont-La Petite-Patrie
3,bFzdJJ3wp3PZssNEsyU23g,1.5,8,1,"Insurance, Financial Services",,33.449999,-112.076979,Phoenix,AZ,
4,8USyCYqpScwiNEb58Bt6CA,2.0,4,1,"Home & Garden, Nurseries & Gardening, Shopping...",{'BusinessAcceptsCreditCards': 'True'},51.035591,-114.027366,Calgary,AB,


In [30]:
# Load user.json
user_fields = [
    'user_id', 'average_stars', 'review_count', 'yelping_since',
    'friends', 'fans', 'elite'
]
user_json_path = './data/user.json'  # Update this path if necessary
user_df = load_json_to_df(user_json_path, user_fields)
print("\nUser DataFrame Loaded:")
display(user_df.head())




User DataFrame Loaded:


Unnamed: 0,user_id,average_stars,review_count,yelping_since,friends,fans,elite
0,lzlZwIpuSWXEnNS91wxjHw,2.0,1,2015-09-28,,0,
1,XvLBr-9smbI0m_a7dXtB7w,5.0,2,2015-09-05,,0,
2,QPT4Ud4H5sJVr68yXhoWFw,4.0,1,2016-07-21,,0,
3,i5YitlHZpf0B3R0s_8NVuw,4.05,19,2014-08-04,,0,
4,s4FoIXE_LSGviTHBe8dmcg,3.0,3,2017-06-18,,0,


In [31]:
# Merge yelp_train_df with business_df on 'business_id'
merged_df = yelp_train_df.merge(business_df, on='business_id', how='left')
print("\nAfter Merging with Business DataFrame:")
display(merged_df.head())



After Merging with Business DataFrame:


Unnamed: 0,user_id,business_id,stars_x,stars_y,review_count,is_open,categories,attributes,latitude,longitude,city,state,neighborhood
0,vxR_YV0atFxIxfOnF9uHjQ,gTw6PENNGl68ZPUpYWP50A,5.0,4.0,876,1,"Bars, Restaurants, Barbeque, Nightlife, Korean","{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",36.046802,-115.244171,Las Vegas,NV,Southwest
1,o0p-iTC5yTBV5Yab_7es4g,iAuOpYDfOTuzQ6OPpEiGwA,4.0,3.0,369,1,"Event Planning & Services, Nightlife, Bars, Sp...","{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",36.057724,-115.122586,Las Vegas,NV,Southeast
2,-qj9ouN0bzMXz1vfEslG-A,5j7BnXXvlS69uLVHrY9Upw,2.0,3.5,45,1,"Chicken Wings, Fast Food, Restaurants, Mexican","{'Alcohol': 'none', 'Ambience': '{'romantic': ...",36.143724,-115.119345,Las Vegas,NV,Downtown
3,E43QxgV87Ij6KxMCHcijKw,jUYp798M93Mpcjys_TTgsQ,5.0,4.5,55,1,"Coffee & Tea, Shopping, Food, Public Markets, ...","{'BikeParking': 'True', 'BusinessAcceptsCredit...",35.216022,-80.852878,Charlotte,NC,South End
4,T13IBpJITI32a1k41rc-tg,3MntE_HWbNNoyiLGxywjYA,5.0,4.0,64,1,"Restaurants, Pizza","{'Alcohol': 'none', 'Ambience': '{'romantic': ...",35.168552,-80.874944,Charlotte,NC,


In [32]:
# Merge the resulting DataFrame with user_df on 'user_id'
merged_df = merged_df.merge(user_df, on='user_id', how='left')
print("\nAfter Merging with User DataFrame:")
display(merged_df.head())



After Merging with User DataFrame:


Unnamed: 0,user_id,business_id,stars_x,stars_y,review_count_x,is_open,categories,attributes,latitude,longitude,city,state,neighborhood,average_stars,review_count_y,yelping_since,friends,fans,elite
0,vxR_YV0atFxIxfOnF9uHjQ,gTw6PENNGl68ZPUpYWP50A,5.0,4.0,876,1,"Bars, Restaurants, Barbeque, Nightlife, Korean","{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",36.046802,-115.244171,Las Vegas,NV,Southwest,4.11,353,2006-06-16,"ir2V_EKfO7XOfKkmX6khCg, uukJrcxFaQFYlbXDql4Kbw...",69,"2018, 2014, 2012, 2016, 2017, 2011, 2013, 2015"
1,o0p-iTC5yTBV5Yab_7es4g,iAuOpYDfOTuzQ6OPpEiGwA,4.0,3.0,369,1,"Event Planning & Services, Nightlife, Bars, Sp...","{'Alcohol': 'full_bar', 'Ambience': '{'romanti...",36.057724,-115.122586,Las Vegas,NV,Southeast,3.92,433,2010-01-08,"CkyNJfLv6yx55Pdjq_8T_g, 55sr442Csr6cJ3c430MSow...",12,"2012, 2013, 2010, 2011"
2,-qj9ouN0bzMXz1vfEslG-A,5j7BnXXvlS69uLVHrY9Upw,2.0,3.5,45,1,"Chicken Wings, Fast Food, Restaurants, Mexican","{'Alcohol': 'none', 'Ambience': '{'romantic': ...",36.143724,-115.119345,Las Vegas,NV,Downtown,3.69,154,2012-06-30,"-xDW3gYiYaoeVASXywTPgw, OueXAik2P-eUcXbd1qGXKw...",9,
3,E43QxgV87Ij6KxMCHcijKw,jUYp798M93Mpcjys_TTgsQ,5.0,4.5,55,1,"Coffee & Tea, Shopping, Food, Public Markets, ...","{'BikeParking': 'True', 'BusinessAcceptsCredit...",35.216022,-80.852878,Charlotte,NC,South End,4.11,668,2012-03-26,"pufGgg3EuY_As7cu__pM1w, 3ONXl3eodyqkhysi-UoseA...",407,"2015, 2016, 2017, 2013, 2014, 2018"
4,T13IBpJITI32a1k41rc-tg,3MntE_HWbNNoyiLGxywjYA,5.0,4.0,64,1,"Restaurants, Pizza","{'Alcohol': 'none', 'Ambience': '{'romantic': ...",35.168552,-80.874944,Charlotte,NC,,3.6,304,2015-12-18,"swUxTfJ96XZJ0ufmljJiXQ, T1oHdzsrFeTrQhHfNdls8A...",11,"2018, 2017, 2016"


In [33]:
print("\nMissing Values in Each Column:")
print(merged_df.isnull().sum())


Missing Values in Each Column:
user_id              0
business_id          0
stars_x              0
stars_y              0
review_count_x       0
is_open              0
categories          62
attributes        5376
latitude             0
longitude            0
city                 0
state                0
neighborhood         0
average_stars        0
review_count_y       0
yelping_since        0
friends              0
fans                 0
elite                0
dtype: int64


In [34]:
# Function to parse 'attributes' column
def parse_attributes(attributes):
    if isinstance(attributes, dict):
        return attributes
    else:
        return {}

# Apply the function to parse 'attributes'
merged_df['attributes'] = merged_df['attributes'].apply(parse_attributes)


In [35]:
# Fill missing 'categories' with empty string
merged_df['categories'] = merged_df['categories'].fillna('')


In [36]:
# Create 'num_attrs' as the number of attributes per business
merged_df['num_attrs'] = merged_df['attributes'].apply(lambda x: len(x) if isinstance(x, dict) else 0)

# Create 'num_categories' as the number of categories per business
merged_df['num_categories'] = merged_df['categories'].apply(lambda x: len([cat for cat in x.split(',') if cat.strip() != '']))


In [37]:
# Function to count number of elite years
def count_elite(elite):
    if elite == 'None' or pd.isnull(elite):
        return 0
    else:
        return len(elite.split(','))

# Apply the function to create 'num_elite' feature
merged_df['num_elite'] = merged_df['elite'].apply(count_elite)


In [38]:
# Function to count number of friends
def count_friends(friends):
    if friends == 'None' or pd.isnull(friends):
        return 0
    else:
        return len(friends.split(','))

# Apply the function to create 'num_friends' feature
merged_df['num_friends'] = merged_df['friends'].apply(count_friends)


In [39]:
# Function to calculate membership years
def calculate_membership_years(yelping_since):
    try:
        date = datetime.strptime(yelping_since, "%Y-%m")
    except ValueError:
        try:
            date = datetime.strptime(yelping_since, "%Y")
        except ValueError:
            return 5.0  # Default value if parsing fails
    return (datetime.now() - date).days / 365.25

# Apply the function to create 'membership_years' feature
merged_df['membership_years'] = merged_df['yelping_since'].apply(calculate_membership_years)


In [40]:
from sklearn.preprocessing import MultiLabelBinarizer

# Split 'categories' and create a list of categories for each business
merged_df['categories_list'] = merged_df['categories'].apply(lambda x: [cat.strip() for cat in x.split(',') if cat.strip() != ''])

# Initialize MultiLabelBinarizer
mlb = MultiLabelBinarizer()

# Fit and transform the 'categories_list'
encoded_categories = mlb.fit_transform(merged_df['categories_list'])

# Create a DataFrame with encoded categories
encoded_categories_df = pd.DataFrame(encoded_categories, columns=mlb.classes_, index=merged_df.index)

# Concatenate with the main DataFrame
merged_df = pd.concat([merged_df, encoded_categories_df], axis=1)

# Drop the temporary 'categories_list' column
merged_df.drop('categories_list', axis=1, inplace=True)


In [41]:
# Initialize LabelEncoders
le_state = LabelEncoder()
le_city = LabelEncoder()

# Fill missing 'state' and 'city' with 'Unknown'
merged_df['state'] = merged_df['state'].fillna('Unknown')
merged_df['city'] = merged_df['city'].fillna('Unknown')

# Fit and transform 'state'
merged_df['state_encoded'] = le_state.fit_transform(merged_df['state'].astype(str))

# Fit and transform 'city'
merged_df['city_encoded'] = le_city.fit_transform(merged_df['city'].astype(str))


In [43]:
# Select relevant numerical features from business and user data
numerical_features = [
    'stars', 'review_count', 'is_open', 'num_attrs', 'num_categories',
    'latitude', 'longitude', 'num_elite', 'num_friends',
    'membership_years'
]

# Select relevant encoded categorical features from 'categories'
category_features = list(encoded_categories_df.columns)

# Select state and city encoded features
location_features = ['state_encoded', 'city_encoded']

# Select relevant user features (excluding 'useful', 'funny', 'cool')
user_features = [
    'average_stars', 'review_count', 'fans',
    'num_elite', 'num_friends', 'membership_years'
]

# Combine all features
feature_columns = numerical_features + category_features + location_features + user_features

# Ensure all features are present in the DataFrame
for feature in feature_columns:
    if feature not in merged_df.columns:
        merged_df[feature] = 0  # Fill missing features with 0

# Define the final feature set and target variable
X = merged_df[feature_columns]
y = merged_df['stars']


In [44]:
# Display all columns in the DataFrame
print("\nColumns in the merged DataFrame:")
print(merged_df.columns.tolist())



Columns in the merged DataFrame:
['user_id', 'business_id', 'stars_x', 'stars_y', 'review_count_x', 'is_open', 'categories', 'attributes', 'latitude', 'longitude', 'city', 'state', 'neighborhood', 'average_stars', 'review_count_y', 'yelping_since', 'friends', 'fans', 'elite', 'num_attrs', 'num_categories', 'num_elite', 'num_friends', 'membership_years', 'ATV Rentals/Tours', 'Acai Bowls', 'Accessories', 'Accountants', 'Acne Treatment', 'Active Life', 'Acupuncture', 'Adult', 'Adult Education', 'Adult Entertainment', 'Advertising', 'Aerial Fitness', 'Aerial Tours', 'Aestheticians', 'Afghan', 'African', 'Air Duct Cleaning', 'Airlines', 'Airport Lounges', 'Airport Shuttles', 'Airport Terminals', 'Airports', 'Allergists', 'Amateur Sports Teams', 'American (New)', 'American (Traditional)', 'Amusement Parks', 'Animal Shelters', 'Antiques', 'Apartments', 'Appliances', 'Appliances & Repair', 'Appraisal Services', 'Aquarium Services', 'Aquariums', 'Arabian', 'Arcades', 'Archery', 'Architects', '

In [46]:
# Compute the correlation matrix
corr_matrix = pd.concat([X, y], axis=1).corr()

# Extract the correlation of all features with the target variable 'rating'
corr_matrix_sorted = corr_matrix.sort_values(by='stars', ascending=False)
print("\nCorrelation of features with rating:")
print(corr_matrix_sorted)


ValueError: The column label 'stars' is not unique.

In [47]:
from pyspark import SparkContext
import pandas as pd
import numpy as np
import json
import csv
import sys
import time
import xgboost
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from datetime import datetime

def main():
    if len(sys.argv) != 4:
        print("Usage: competition.py <folder_path> <val_file_path> <output_file_path>")
        sys.exit(-1)

    folder_path = sys.argv[1]
    val_path = sys.argv[2]
    output_path = sys.argv[3]
    user_json = folder_path + "/user.json"
    business_json = folder_path + "/business.json"
    review_json = folder_path + "/review_train.json"
    yelp_train = folder_path + "/yelp_train.csv"
    photo_json = folder_path + "/photo.json"
    tip_json = folder_path + "/tip.json"

    sc = SparkContext("local[*]", "competition")
    sc.setLogLevel("ERROR")
    start_time = time.time()
    raw_user_data = sc.textFile(user_json)
    raw_business_data = sc.textFile(business_json)
    raw_review_data = sc.textFile(review_json)
    raw_train_data = sc.textFile(yelp_train)
    raw_photo_data = sc.textFile(photo_json)
    raw_tip_data = sc.textFile(tip_json)

    unique_cities = raw_business_data.map(json.loads).map(lambda row: row.get('city', 'Unknown')).filter(lambda city: city != "None").distinct().collect()
    unique_states = raw_business_data.map(json.loads).map(lambda row: row.get('state', 'Unknown')).filter(lambda state: state != "None").distinct().collect()
    city_mapping = {city: idx for idx, city in enumerate(sorted(unique_cities))}
    city_mapping['Unknown'] = -1
    state_mapping = {state: idx for idx, state in enumerate(sorted(unique_states))}
    state_mapping['Unknown'] = -1 
    broadcast_city_mapping = sc.broadcast(city_mapping)
    broadcast_state_mapping = sc.broadcast(state_mapping)
    class User:
        @staticmethod
        def user_feature(row):
            if row["elite"] != "None":
                row['elite_cnt'] = len(row["elite"].split(","))
            else:
                row['elite_cnt'] = 0
            yelping_since = datetime.strptime(row["yelping_since"], "%Y-%m-%d")
            row['yelping_time'] = datetime.now().year- yelping_since.year
            if row["friends"] != "None":
                row['friends_cnt'] = len(row["friends"].split(","))
            else:
                row['friends_cnt'] = 0
            return row
    user_rdd = raw_user_data.map(json.loads).map(User.user_feature).map(lambda row: (row['user_id'], (float(row['average_stars']), row['review_count'], row['elite_cnt'], row['yelping_time'],row["useful"], 
                                                                                                     row["funny"], row["cool"], row["fans"], row['friends_cnt']))).cache().collectAsMap()
        
    class Business:
        @staticmethod
        def business_feature(row, city_mapping, state_mapping):
            row['attributes_cnt'] = len(row['attributes']) if row.get('attributes') else 0
            row['categories_cnt'] = len(row['categories'].split(",")) if row.get('categories') else 0
            city = row.get('city', 'Unknown')
            row['city_encoded'] = city_mapping.get(city, -1)
            state = row.get('state', 'Unknown')
            row['state_encoded'] = state_mapping.get(state, -1)

            return row
    business_rdd = raw_business_data.map(json.loads).map(lambda row: Business.business_feature(row, city_mapping, state_mapping)).map(lambda row: (row['business_id'], (float(row['stars']), row['review_count'], row['attributes_cnt'], row['categories_cnt'], row['city_encoded'], row['state_encoded']))).cache().collectAsMap()

    class Photo:
        @staticmethod
        def photo_feature(row):
            business_id = row.get('business_id', None)
            if business_id is not None:
                return (business_id, 1)
            else:
                return (None, 0)
    def process_photos(raw_photo_data):
        photo_rdd = raw_photo_data.map(json.loads).map(Photo.photo_feature).filter(lambda x: x[0] is not None).reduceByKey(lambda a, b: a + b).map(lambda x: (x[0], x[1])).cache().collectAsMap()
        return photo_rdd
    photo_rdd = process_photos(raw_photo_data)

    class Review:
        def __init__(self, raw_review_rdd):
            self.review_rdd = raw_review_rdd.map(json.loads)\
                .map(lambda row: (
                    row['business_id'],
                    (
                        float(row.get('stars', 0)),
                        float(row.get('useful', 0)),
                        float(row.get('funny', 0)),
                        float(row.get('cool', 0)),
                        1  # Count of reviews
                    )
                ))\
                .reduceByKey(lambda a, b: (
                    a[0] + b[0],  # Sum of stars
                    a[1] + b[1],  # Sum of useful
                    a[2] + b[2],  # Sum of funny
                    a[3] + b[3],  # Sum of cool
                    a[4] + b[4]   # Count
                ))\
                .mapValues(lambda sums: (
                    sums[0] / sums[4],  # Average stars
                    sums[1] / sums[4],  # Average useful
                    sums[2] / sums[4],  # Average funny
                    sums[3] / sums[4]   # Average cool
                ))\
                .cache()\
                .collectAsMap()
            return self.review_rdd
        
    review_rdd = Review(raw_review_data).review_rdd

    train_rdd = raw_train_data.map(lambda line: line.split(",")).filter(lambda cols: len(cols) >= 3).map(lambda cols: (cols[0], cols[1], float(cols[2])))

    user_rdd_bc = sc.broadcast(user_rdd)
    business_rdd_bc = sc.broadcast(business_rdd)
    review_rdd_bc = sc.broadcast(review_rdd)
    photo_rdd_bc = sc.broadcast(photo_rdd)

    def extract_features(record):
        user_id, business_id, stars = record
        user_features = user_rdd_bc.value.get(user_id, (3.0, 0, 0, 0, 0, 0, 0, 0, 0))
        business_features = business_rdd_bc.value.get(business_id, (3.0, 0, 0, 0, 0, 0))
        review_features = review_rdd_bc.value.get(business_id, (0.0, 0.0, 0.0, 0.0))
        photo_features = photo_rdd_bc.value.get(business_id, 0)
        return (
            user_features[0], user_features[1], user_features[2], user_features[3], user_features[4], user_features[5], user_features[6], user_features[7], user_features[8],
            business_features[0], business_features[1], business_features[2], business_features[3], business_features[4], business_features[5],
            review_features[0], review_features[1], review_features[2], review_features[3],
            photo_features, stars
        )
    
    training_features_labels_rdd = train_rdd.map(extract_features)
    training_features_labels = training_features_labels_rdd.collect()

    X_train = np.array([record[:-1] for record in training_features_labels])
    y_train = np.array([record[-1] for record in training_features_labels])

    model = XGBRegressor(
        objective='reg:linear',
        n_estimators=300,
        learning_rate=0.02,
        max_depth=15,
        random_state=42,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=1,
        reg_lambda=1
    )
    model.fit(X_train, y_train)

    def extract_features_test(record):
        user_id, business_id = record
        user_features = user_rdd_bc.value.get(user_id, (3.0, 0, 0, 0, 0, 0, 0, 0, 0))
        business_features = business_rdd_bc.value.get(business_id, (3.0, 0, 0, 0, 0, 0))
        review_features = review_rdd_bc.value.get(business_id, (0.0, 0.0, 0.0, 0.0))
        photo_features = photo_rdd_bc.value.get(business_id, 0)
        return (
            user_id, business_id,
            user_features[0], user_features[1], user_features[2], user_features[3], user_features[4], user_features[5], user_features[6], user_features[7], user_features[8],
            business_features[0], business_features[1], business_features[2], business_features[3], business_features[4], business_features[5],
            review_features[0], review_features[1], review_features[2], review_features[3],
            photo_features
        )
    
    val_data = sc.textFile(val_path)
    header_val = val_data.first()
    val_data = val_data.filter(lambda line: line != header_val)
    val_data = val_data.map(lambda line: line.split(",")).map(lambda x: (x[0], x[1]))
    validation_features_labels_rdd = val_data.map(extract_features_test)
    validation_features_labels = validation_features_labels_rdd.collect()

    X_val = np.array([record[2:] for record in validation_features_labels])

    y_pred = model.predict(X_val)
    user_business_ids = [(record[0], record[1]) for record in validation_features_labels]
    output = zip(user_business_ids, y_pred)
    with open(output_path, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(["user_id", "business_id", "prediction"])
        for (user_id, business_id), prediction in output:
            writer.writerow([user_id, business_id, prediction])
    print(f"Execution time: {time.time() - start_time}")

if __name__ == "__main__":
    main()

Usage: competition.py <folder_path> <val_file_path> <output_file_path>


SystemExit: -1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [52]:
import pandas as pd

# Load predictions
pred_df = pd.read_csv('output.csv')

# Load ground truth
ground_df = pd.read_csv('./data/yelp_val.csv')

# Merge on user_id and business_id
merged_df = pd.merge(pred_df, ground_df, on=['user_id', 'business_id'], how='inner')

# Calculate absolute error
merged_df['error'] = (merged_df['prediction'] - merged_df['stars']).abs()

# Define bins and labels
bins = [0, 1, 2, 3, 4, float('inf')]
labels = ['>=0 and <1', '>=1 and <2', '>=2 and <3', '>=3 and <4', '>=4']

# Categorize errors into bins
merged_df['error_bin'] = pd.cut(merged_df['error'], bins=bins, labels=labels, right=False)

# Compute error distribution
error_distribution = merged_df['error_bin'].value_counts().sort_index()

# Display the error distribution
print("# Error Distribution")
print("#######################################################################################################################")
print("#")
for label, count in error_distribution.items():
    print(f"# {label}: {count}")
print("#")


# Error Distribution
#######################################################################################################################
#
# >=0 and <1: 102331
# >=1 and <2: 32731
# >=2 and <3: 6160
# >=3 and <4: 820
# >=4: 2
#


In [None]:
len(' I am using model-based collaborative filtering to predict the ratings of the user-business pairs. I have extracted the features from the user, business, review, photo, and tip data. From Assignment 3, 
 I found that model-based collaborative filtering is giving better results than item-based collaborative 
filtering. Hence, I choose to enhance the model-based collaborative filtering by using XGBoost Regressor.
Besides the features extracted from the user, business, I also used the review, photo, and tip data to 
enhance the model. Features include the average stars, review count, elite count, yelping time, useful, city,
state, latitude, longitude, attributes count, categories count, is open, average useful, average funny, average cool,
photo count, and tip count.  I have used the XGBoost Regressor to predict the ratings of the user-business pairs. 
As for the hyperparameters, I have used 5 fold cross-validation to tune the hyperparameters. At first, I did not choose
 City, State, Latitude, and Longitude as features. But after adding these features, the RMSE value decreased. Then I also
 added the photo and tip count features which further decreased the RMSE value.')