In [146]:
import os
import json
import gc

import numpy as np 
import pandas as pd 
import pyarrow as pa
import pyarrow.parquet as pq

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

# -------------------------------------------------------

config = {
            'data_dir': '/kaggle/input/yelp-compressed-dataset',
        }

print(config)


{'data_dir': '/kaggle/input/yelp-compressed-dataset'}


In [147]:
def read_data_file(config, file_name=None, verbose=False):
    assert file_name is not None

    filepath = os.path.join(config['data_dir'], file_name+'.parquet')

    if verbose: print(f"Reading file: {filepath}")
    df = pq.read_table(filepath).to_pandas()
    if verbose: print("df.shape: ", df.shape)

    return df

def drop_inf_rows(df):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    
def filter_data(config, df, file_name=None, drop_cols=None, dropna=False, dropinf=False, verbose=False):
    
    if verbose: print("Before filtering shape: ", df.shape)
    
    if drop_cols is not None:
        if verbose: 
            print("Before dropping columns: ", df.columns)
            print("Before dropping columns shape:", df.shape)
        df.drop(drop_cols, axis=1, inplace=True)
        if verbose: 
            print("After dropping columns: ", df.columns)
            print("After dropping columns shape:", df.shape)
        
    # Filter out rows with NaN values
    if dropna:
        if verbose: print("Before dropping NaNs shape:", df.shape)
        df.dropna(inplace=True)
        if verbose: print("After dropping NaNs shape:", df.shape)

    # Filter out rows with inf values
    if dropinf:
        if verbose: print("Before dropping Infs shape:", df.shape)
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)
        if verbose: print("After dropping Infs shape:", df.shape)

    # Filter out ir-relevant data for current analysis
    assert file_name is not None
    if file_name == 'business':
        if verbose: print("Before filterin out non-restaurant data shape:", df.shape)
        df = df[df['categories'].str.lower().str.contains('restaurants')]
        if verbose: print("After filterin out non-restaurant data shape:", df.shape)
    
    if verbose: print("After filtering shape: ", df.shape)

    return df


def handle_outliers_fn(df, cols, mode='drop', threshold=3, verbose=False):
    if verbose: print("Before outlier handling shape: ", df.shape)
    
    if mode == 'drop':
        print("Dropping outliers...")
        for col in cols:
            z_scores = (df[col] - df[col].mean()) / df[col].std()
            df = df[z_scores.abs() < threshold]
    elif mode == 'clip':
        print("Clipping outliers...")
        for col in cols:
            lower_bound = df[col].mean() - threshold * df[col].std()
            upper_bound = df[col].mean() + threshold * df[col].std()
            df[col] = df[col].clip(lower_bound, upper_bound)
    
    if verbose: print("After outlier handling shape: ", df.shape)

    return df


def remove_duplicates(df, verbose=False):
    if verbose: print("Before removing duplicates shape: ", df.shape)
    df.drop_duplicates(inplace=True)
    if verbose: print("After removing duplicates shape: ", df.shape)
    
    return df


def convert_datetime_column(df, col_name):
    for idx, dt_str in enumerate(df[col_name]):
        try:
            df.loc[idx, col_name] = pd.to_datetime(dt_str, format='%Y-%m-%d %H:%M:%S')  # Or format='mixed'
        except ValueError:
            print(f"Failed to convert datetime string at index {idx}: {dt_str}")    
    return df


def encode_dates_fn(df, date_cols=None, verbose=False):
    if verbose: print("Before encoding data shape: ", df.shape)
    
    # If the date columns exist; and convert to the datetime format
    if date_cols is None:
        # infer date columns based on the names
        date_cols = [col for col in df.columns if 'date' in col.lower()]
    if len(date_cols) > 0:
        for col in date_cols:
            # format = '%Y-%m-%d %H:%M:%S'
            # format='mixed'
            df[col] = df[col].apply(lambda x: x.lstrip(', '))
            df[col] = df[col].apply(lambda x: x.strip(' '))
            df[col] = pd.to_datetime(df[col], format='%Y-%m-%d %H:%M:%S')
            # df = convert_datetime_column(df, col)
            
    if verbose: print("After encoding data shape: ", df.shape)
    
    return df


def get_column_descriptor_dict(config, df, id_cols=None, verbose=False):
    col_descriptor_dict = dict()

    # create columns type descriptors - int_cols, float_cols, bool_cols, id_cols, numeric_cols, date_cols etc.
    if id_cols is None:
        col_descriptor_dict['id_cols'] = [col for col in df.columns if 'id' in col.lower()]
    col_descriptor_dict['int_cols'] = [col for col in df.columns if 'int' in str(df[col].dtype)]
    col_descriptor_dict['float_cols'] = [col for col in df.columns if 'float' in str(df[col].dtype)]
    col_descriptor_dict['nume_cols'] = col_descriptor_dict['int_cols'] + \
                                        col_descriptor_dict['float_cols']
    col_descriptor_dict['non_nume_cols'] = [col for col in df.columns \
                                            if col not in col_descriptor_dict['nume_cols']]

    if verbose:
        for k, v in col_descriptor_dict.items():
            print(f"{k}: ")
            print(f"{v}")
            print("-"*30)

    return col_descriptor_dict


def get_scaler(scaler_type):
    """
    Returns a scikit-learn scaler object based on the specified scaler type.

    Args:
        scaler_type (str): Type of scaler. Options are 'standard', 'min_max', 'robust'.  

    Returns:
        sklearn.preprocessing.scaler: A scikit-learn scaler object.
    """

    if scaler_type == 'standard':
        scaler = StandardScaler()  # Standardize features (zero mean, unit variance)
    elif scaler_type == 'min_max':
        scaler = MinMaxScaler()  # Scale features to a given range (often 0 to 1)
    elif scaler_type == 'robust':
        scaler = RobustScaler()  # Scale features using statistics robust to outliers 
    else:
        raise ValueError(f"Invalid scaler_type: {scaler_type}")  # Handle invalid input 
    
    return scaler


def scale_data_fn(df, scaler_type_cols_map, scaler_type='standard', verbose=False):
    if verbose: print("Before scaling data shape: ", df.shape)
    
    for scaler_type, cols in scaler_type_cols_map.items():
        print(f"Using {scaler_type} scaler for cols: {cols}")
        scaler = get_scaler(scaler_type)
        for col in cols:
            df[[col]] = scaler.fit_transform(df[[col]])
        
    if verbose: print("After scaling data shape: ", df.shape)
    
    return df


def preprocess_data(config, df, encode_dates=False, date_cols=None, id_cols=None, 
                    handle_outliers=False, 
                    scale_data=False, cols_scaler_type_map=None, verbose=False):
    
    df = remove_duplicates(df, verbose=verbose)
    
    if encode_dates:
        df = encode_dates_fn(df, date_cols, verbose)
    
    col_descriptor_dict = get_column_descriptor_dict(config, df, id_cols=id_cols, \
                                                     verbose=verbose)
    
    if handle_outliers:
        df = handle_outliers_fn(df, col_descriptor_dict['nume_cols'], 
                                mode='drop', threshold=3, verbose=verbose)
    
    scaler_type_cols_map = {'standard': col_descriptor_dict['float_cols']}
    if scale_data and scaler_type_cols_map:
        print("scaling data....................")
        df = scale_data_fn(df, scaler_type_cols_map, verbose=verbose)
    
    return df, col_descriptor_dict
    


In [148]:
# files: 'business', 'tip', 'user', 'checkin', 'review'
file_name = 'business'

df = read_data_file(config, file_name='business', verbose=False)

df = filter_data(config, df, file_name, drop_cols=['attributes', 'hours'], 
                 dropna=True, dropinf=True, verbose=False)

df, col_descriptor_dict = preprocess_data(config, df, handle_outliers=False, verbose=False)



In [149]:
# files: 'business', 'tip', 'user', 'checkin', 'review'
file_name = 'business'

df = read_data_file(config, file_name=file_name, verbose=True)

df = filter_data(config, df, file_name, drop_cols=['attributes', 'hours'], 
                 dropna=True, dropinf=True, verbose=True)

df = preprocess_data(config, df, encode_dates=True, scale_data=True, handle_outliers=True, verbose=True)

Reading file: /kaggle/input/yelp-compressed-dataset/business.parquet
df.shape:  (150346, 14)
Before filtering shape:  (150346, 14)
Before dropping columns:  Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')
Before dropping columns shape: (150346, 14)
After dropping columns:  Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'categories'],
      dtype='object')
After dropping columns shape: (150346, 12)
Before dropping NaNs shape: (150346, 12)
After dropping NaNs shape: (150243, 12)
Before dropping Infs shape: (150243, 12)
After dropping Infs shape: (150243, 12)
Before filterin out non-restaurant data shape: (150243, 12)
After filterin out non-restaurant data shape: (52268, 12)
After filtering shape:  (52268, 12)
Before removing duplic

In [150]:
# files: 'business', 'tip', 'user', 'checkin', 'review'
file_name = 'tip'

df = read_data_file(config, file_name=file_name, verbose=True)

df = filter_data(config, df, file_name, #drop_cols=['attributes', 'hours'], 
                 dropna=True, dropinf=True, verbose=True)

df = preprocess_data(config, df, encode_dates=True, scale_data=True, handle_outliers=True, verbose=True)

Reading file: /kaggle/input/yelp-compressed-dataset/tip.parquet
df.shape:  (908915, 5)
Before filtering shape:  (908915, 5)
Before dropping NaNs shape: (908915, 5)
After dropping NaNs shape: (908915, 5)
Before dropping Infs shape: (908915, 5)
After dropping Infs shape: (908915, 5)
After filtering shape:  (908915, 5)
Before removing duplicates shape:  (908915, 5)
After removing duplicates shape:  (908848, 5)
Before encoding data shape:  (908848, 5)
After encoding data shape:  (908848, 5)
id_cols: 
['user_id', 'business_id']
------------------------------
int_cols: 
['compliment_count']
------------------------------
float_cols: 
[]
------------------------------
nume_cols: 
['compliment_count']
------------------------------
non_nume_cols: 
['user_id', 'business_id', 'text', 'date']
------------------------------
Before outlier handling shape:  (908848, 5)
Dropping outliers...
After outlier handling shape:  (898309, 5)
scaling data....................
Before scaling data shape:  (898309

In [151]:
# files: 'business', 'tip', 'user', 'checkin', 'review'
file_name = 'checkin'

df = read_data_file(config, file_name=file_name, verbose=True)

df = filter_data(config, df, file_name, #drop_cols=['attributes', 'hours'], 
                 dropna=True, dropinf=True, verbose=True)

df = preprocess_data(config, df, encode_dates=False, scale_data=True, handle_outliers=True, verbose=True)

Reading file: /kaggle/input/yelp-compressed-dataset/checkin.parquet
df.shape:  (131930, 2)
Before filtering shape:  (131930, 2)
Before dropping NaNs shape: (131930, 2)
After dropping NaNs shape: (131930, 2)
Before dropping Infs shape: (131930, 2)
After dropping Infs shape: (131930, 2)
After filtering shape:  (131930, 2)
Before removing duplicates shape:  (131930, 2)
After removing duplicates shape:  (131930, 2)
id_cols: 
['business_id']
------------------------------
int_cols: 
[]
------------------------------
float_cols: 
[]
------------------------------
nume_cols: 
[]
------------------------------
non_nume_cols: 
['business_id', 'date']
------------------------------
Before outlier handling shape:  (131930, 2)
Dropping outliers...
After outlier handling shape:  (131930, 2)
scaling data....................
Before scaling data shape:  (131930, 2)
Using standard scaler for cols: []
After scaling data shape:  (131930, 2)


In [None]:
# files: 'business', 'tip', 'user', 'checkin', 'review'
file_name = 'user'

df = read_data_file(config, file_name=file_name, verbose=True)

df = filter_data(config, df, file_name, drop_cols=['friends'], 
                 dropna=True, dropinf=True, verbose=True)

df = preprocess_data(config, df, encode_dates=False, scale_data=True, 
                     handle_outliers=False, verbose=True)

Reading file: /kaggle/input/yelp-compressed-dataset/user.parquet
df.shape:  (1987897, 22)
Before filtering shape:  (1987897, 22)
Before dropping columns:  Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos'],
      dtype='object')
Before dropping columns shape: (1987897, 22)
After dropping columns:  Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos