In [1]:
import pandas as pd
import numpy as np
import datetime as datetime

# Remove pandas' display limits so printed DataFrames are never truncated.
# Assigning None to an option disables the corresponding limit.

# Show every column
pd.options.display.max_columns = None

# Show every row
pd.options.display.max_rows = None

# Show the full contents of each cell (a large integer would also work)
pd.options.display.max_colwidth = None


In [2]:
# Load the review data; the first CSV column is used as the DataFrame index.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that exact file exists.
df = pd.read_csv('/Users/skareti/Desktop/Independent Study/NYTRestaurantReviews/data/top100.csv', index_col = 0)
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1538 entries, 747 to 1510
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Restaurant_name  1537 non-null   object 
 1   Author           1528 non-null   object 
 2   Publishing_Date  1538 non-null   object 
 3   Rating           598 non-null    object 
 4   Price            1536 non-null   object 
 5   Cuisine          1528 non-null   object 
 6   Neighborhood     1513 non-null   object 
 7   CriticsPick      1538 non-null   int64  
 8   Description      1516 non-null   object 
 9   Rank             1538 non-null   float64
 10  top100_2024      1538 non-null   float64
dtypes: float64(2), int64(1), object(8)
memory usage: 144.2+ KB


In [3]:
# Define functions for each step of the process including mapping, filling NA values, and one-hot encoding.

def map_rating_price(df):
    """Return a new DataFrame with categorical 'Rating' and 'Price' labels added.

    Two columns are appended (or replaced if they already exist):

    * ``Rating_Categorical`` - 'Rating' translated through the rating lookup
      below; missing or unrecognised ratings become the placeholder ``'NR'``.
    * ``Price_Categorical`` - 'Price' translated through the price lookup
      below; missing or unrecognised prices stay NaN.

    The input frame is left untouched, so calling this function never adds
    columns to the caller's DataFrame as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Rating' and 'Price' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column plus the two label columns.

    Raises
    ------
    KeyError
        If 'Rating' or 'Price' is not a column of ``df``.
    """
    rating_labels = {
        'Poor': 'Poor',
        'Fair': 'Fair',
        'Satisfactory': 'Satisfactory',
        '1 star': 'Good',
        '2 star': 'Very Good',
        '3 star': 'Excellent',
        '4 star': 'Extraordinary',
    }

    price_labels = {
        '$': 'Inexpensive',
        '$$': 'Moderate',
        '$$$': 'Expensive',
        '$$$$': 'Very Expensive',
    }

    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(
        Rating_Categorical=df['Rating'].map(rating_labels).fillna('NR'),
        Price_Categorical=df['Price'].map(price_labels),
    )

def one_hot_encode_columns(df, columns):
    """Append indicator (dummy) columns for each of the named columns.

    For every name in ``columns`` the values of that column are expanded with
    ``pd.get_dummies`` (new columns are prefixed with the source column name)
    and attached to the right-hand side of the frame. The source columns
    themselves are kept. With an empty ``columns`` the input is returned as is.
    """
    widened = df
    for source_name in columns:
        indicators = pd.get_dummies(widened[source_name], prefix=source_name)
        widened = pd.concat([widened, indicators], axis=1)
    return widened

def encode_cuisines(row, top_cuisines):
    """Build 0/1 indicators telling which of ``top_cuisines`` a row lists.

    ``row['Cuisine']`` is read as a ', '-separated list of cuisine names. The
    result is a Series with one entry per name in ``top_cuisines``, keyed
    'Cuisine_<name>': 1 when the row lists that cuisine, 0 otherwise. A
    missing 'Cuisine' value gives 0 for every entry.

    No 'Cuisine_Other' entry is produced.
    """
    raw_value = row['Cuisine']
    # A missing value is treated as "lists no cuisines at all".
    listed = [] if pd.isna(raw_value) else raw_value.split(', ')

    flags = {}
    for name in top_cuisines:
        flags['Cuisine_' + name] = 1 if name in listed else 0
    return pd.Series(flags)


def extract_year_month(df, date_column):
    """Return a new DataFrame with 'Year' and 'Month' derived from a date column.

    The column is parsed once with ``pd.to_datetime`` and its calendar year
    and month number are stored in 'Year' and 'Month' (replacing columns of
    those names if present). The date column itself and the input frame are
    left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_column``.
    date_column : str
        Name of the column holding dates (strings or datetimes).

    Returns
    -------
    pandas.DataFrame
        A new frame with the added 'Year' and 'Month' columns.

    Raises
    ------
    KeyError
        If ``date_column`` is not a column of ``df``.
    """
    # Parse a single time and reuse the result for both derived columns.
    parsed = pd.to_datetime(df[date_column])
    # assign() builds a new frame instead of writing into the caller's object.
    return df.assign(Year=parsed.dt.year, Month=parsed.dt.month)

def filter_by_author_date(df, author, start_date, end_date):
    """Filter reviews by author and/or an inclusive publishing-date range.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing 'Author' and 'Publishing_Date' columns.
    author : str or None
        Keep only rows whose 'Author' equals this value; a falsy value skips
        the author filter.
    start_date, end_date : date-like or None
        Inclusive bounds on 'Publishing_Date'. Either bound may be given on
        its own; a falsy bound leaves that side of the range open.

    Returns
    -------
    pandas.DataFrame
        The matching rows. When at least one date bound is applied, the
        returned frame's 'Publishing_Date' column holds parsed datetimes;
        otherwise the column is returned as received. With no filter at all
        the input frame itself is returned. The caller's frame is never
        modified.
    """
    if author:
        df = df[df['Author'] == author]

    if start_date or end_date:
        # Parse into a new frame: assigning into `df` directly would write to
        # the caller's frame (or to a filtered slice, which pandas warns about).
        df = df.assign(Publishing_Date=pd.to_datetime(df['Publishing_Date']))
        if start_date:
            df = df[df['Publishing_Date'] >= pd.to_datetime(start_date)]
        if end_date:
            df = df[df['Publishing_Date'] <= pd.to_datetime(end_date)]
    return df

def preprocess(df, author=None, start_date=None, end_date=None):
    """Run the preprocessing pipeline on a reviews DataFrame.

    Steps, in order:

    1. optionally filter by author and publishing-date range
       (``filter_by_author_date``);
    2. add 'Year' and 'Month' from 'Publishing_Date' (``extract_year_month``);
    3. add 'Rating_Categorical' and 'Price_Categorical' (``map_rating_price``);
    4. one-hot encode those two categorical columns
       (``one_hot_encode_columns``).

    The pipeline runs on a private copy of ``df``, so the caller's frame is
    never modified and the function can be re-run on the same input.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw reviews frame.
    author : str, optional
        Passed through to ``filter_by_author_date``.
    start_date, end_date : date-like, optional
        Passed through to ``filter_by_author_date``.

    Returns
    -------
    pandas.DataFrame
        The processed frame.
    """
    # The helper steps add columns by assignment; copying first keeps those
    # writes away from the frame the caller passed in.
    working = df.copy()

    # Filter by author and date range
    working = filter_by_author_date(working, author, start_date, end_date)

    # Extract 'Year' and 'Month'
    working = extract_year_month(working, 'Publishing_Date')

    # Map 'Rating' and 'Price' to categorical variables
    working = map_rating_price(working)

    # Cuisine encoding is currently disabled; kept for reference.
    # # Identify top 20 cuisines
    # exploded_cuisine = df['Cuisine'].str.split(', ').explode()
    # exploded_cuisine_list = exploded_cuisine.value_counts().index.tolist()
    # top_20_cuisines = exploded_cuisine.value_counts().head(20).index.tolist()

    # # Encode cuisines
    # df_encoded_cuisines = df.apply(lambda row: encode_cuisines(row, exploded_cuisine_list), axis=1)
    # df = pd.concat([df, df_encoded_cuisines], axis=1)

    # One-hot encode 'Rating_Categorical' and 'Price_Categorical'
    working = one_hot_encode_columns(working, ['Rating_Categorical', 'Price_Categorical'])

    return working

# Example usage of the main function; no author or date filter is passed here,
# so only the default (unfiltered) pipeline runs.
df_processed = preprocess(df)
# Write the processed DataFrame to CSV.
# NOTE(review): absolute, machine-specific output path.
df_processed.to_csv('/Users/skareti/Desktop/Independent Study/NYTRestaurantReviews/data/PreprocessedData.csv')

