In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pyarrow as pa
import pyarrow.parquet as pq

import os
import json
import gc


In [3]:
# config_ref = CFG()
# config = config_ref.get_config()
config = {
            'data_dir': 'C:\\Users\\Vitthal\\Desktop\\projects\\yelp_data',
            
            'data_db_storage_dir': 'C:\\Users\\Vitthal\\Desktop\\projects\\yelp_data\\db_storage',
            'data_db_name': 'yelp_data',
            'data_db_access_uname': 'db_admin_1', 
            'data_db_access_pwd': 'pass1234',
            'create_db_if_not_exists': True, # for data_db
        }

print(config)

{'data_dir': 'C:\\Users\\Vitthal\\Desktop\\projects\\yelp_data', 'data_db_storage_dir': 'C:\\Users\\Vitthal\\Desktop\\projects\\yelp_data\\db_storage', 'data_db_name': 'yelp_data', 'data_db_access_uname': 'db_admin_1', 'data_db_access_pwd': 'pass1234', 'create_db_if_not_exists': True}


In [21]:
def read_data_file(config, file_name=None, verbose=False):
    assert file_name is not None

    filepath = os.path.join(config['data_dir'], file_name+'.parquet')

    if verbose: print(f"Reading file: {filepath}")
    df = pq.read_table(filepath).to_pandas()
    if verbose: print("df.shape: ", df.shape)

    return df

def drop_inf_rows(df):
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    
def filter_data(config, df, file_name=None, dropna=False, dropinf=False, verbose=False):
    
    if verbose: print("Before filtering shape: ", df.shape)

    # Filter out rows with NaN values
    if dropna:
        if verbose: print("Before dropping NaNs shape:", df.shape)
        df.dropna(inplace=True)
        if verbose: print("After dropping NaNs shape:", df.shape)

    # Filter out rows with inf values
    if dropinf:
        if verbose: print("Before dropping Infs shape:", df.shape)
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)
        if verbose: print("After dropping Infs shape:", df.shape)

    # Filter out ir-relevant data for current analysis
    assert file_name is not None
    if file_name == 'business':
        if verbose: print("Before filterin out non-restaurant data shape:", df.shape)
        df = df[df['categories'].str.lower().str.contains('restaurants')]
        if verbose: print("After filterin out non-restaurant data shape:", df.shape)
    
    if verbose: print("After filtering shape: ", df.shape)

    return df


def handle_outliers(df, cols, mode='drop', threshold=3, verbose=False):
    if verbose: print("Before outlier handling shape: ", df.shape)
    
    if mode == 'drop':
        for col in cols:
            z_scores = (df[col] - df[col].mean()) / df[col].std()
            df = df[z_scores.abs() < threshold]
    elif mode == 'clip':
        for col in cols:
            lower_bound = df[col].mean() - threshold * df[col].std()
            upper_bound = df[col].mean() + threshold * df[col].std()
            df[col] = df[col].clip(lower_bound, upper_bound)
    
    if verbose: print("After outlier handling shape: ", df.shape)

    return df

def preprocess_data(config, df, date_cols=None, verbose=False):
    # Function to clean and pre-process the pandas dataframe for further data analysis
    # call helper function for each of the following steps

    # Remove duplicates rows
    df.drop_duplicates(inplace=True)
    
    # If the date columns exist; and convert to the datetime format
    if date_cols is None:
        # infer date columns based on the names
        date_cols = [col for col in df.columns if col.lower().contains('date')]
    if len(date_cols) > 0:
        for col in date_cols:
            df[col] = pd.to_datetime(df[col])
    
    # create columns type descriptors - int_cols, float_cols, bool_cols, id_cols, numeric_cols, date_cols etc.
    if id_cols is None:
        id_cols = [col for col in df.columns if col.lower().contains('id')]
    
    int_cols = [col for col in df.columns if 'int' in str(df.dtype)]
    float_cols = [col for col in df.columns if 'int' in str(df.dtype)]
    nume_cols = int_cols + float_cols
    non_nume_cols = [col for col in df.columns if col not in nume_cols]
    
    if verbose:
        print("int_cols: ", int_cols)
        print("float_cols: ", float_cols)
        print("nume_cols: ", nume_cols)
        print("non_nume_cols: ", non_nume_cols)
        
    # create outlier detection and handling function for the float cols and int cols that are not id cols.
    df = handle_outliers(df, nume_cols, mode='drop', threshold=3)
    

    return df
    


In [22]:
# files: 'business', 'tip', 'user', 'checkin', 'review'
file_name = 'business'
df = read_data_file(config, file_name='business', verbose=True)
df = filter_data(config, df, file_name, dropna=True, dropinf=True, verbose=True)
df = clean_data(config, df)

Reading file: C:\Users\Vitthal\Desktop\projects\yelp_data\business.parquet


df.shape:  (150346, 14)
Before filtering shape:  (150346, 14)
Before dropping NaNs shape: (150346, 14)
After dropping NaNs shape: (117618, 14)
Before dropping Infs shape: (117618, 14)
After dropping Infs shape: (117618, 14)
Before filterin out non-restaurant data shape: (117618, 14)
After filterin out non-restaurant data shape: (44676, 14)
After filtering shape:  (44676, 14)


In [18]:
# df['categories'].isna().sum()

103