# Preprocessing YelpChi

## Import libraries

In [1]:
# Import libraries
import numpy as np # For numerical processing
import pandas as pd # For dataframe processing
from datetime import datetime # For date processing
from tqdm import tqdm

## Load data

### Review content data

In [2]:
# Load review content data from two text files into numpy arrays
review_content_sets = [
    np.genfromtxt("../datasets/raw/YelpChi/output_review_yelpHotelData_NRYRcleaned.txt", dtype=str, delimiter="\n"), 
    np.genfromtxt("../datasets/raw/YelpChi/output_review_yelpResData_NRYRcleaned.txt", dtype=str, delimiter="\n"),
]

### Review metadata

In [3]:
# Load review metadata from two CSV files into pandas DataFrames
review_metadata_sets = [
    pd.read_csv("../datasets/raw/YelpChi/output_meta_yelpHotelData_NRYRcleaned.txt", header=None, sep=" "), 
    pd.read_csv("../datasets/raw/YelpChi/output_meta_yelpResData_NRYRcleaned.txt", header=None, sep=" "),
]

## Compose final data

### Merge review content with metadata ### Join content and metadata

In [4]:
# Enumerate creates an iterator that yields pairs (index, element) from the review_metadata_sets list
# So it will return (0, first_df), (1, second_df) etc.

# The map call applies the lambda function to each element in the enumerated list
# The lambda function:
    # it[0] is the index
    # it[1] is the DataFrame  
    # We take first len(review_content_sets[index]) rows of the DataFrame (to match content length)

# So this maps each metadata DataFrame to a truncated version 
# with only as many rows as the corresponding content DataFrame

# We convert the map output to a list to enable concatenation

# pd.concat concatenates multiple DataFrames together, ignoring the indexes
# By passing ignore_index=True, it will create a new sequential index
# So we end up with a single DataFrame containing all metadata rows  
# with indices matching the content rows

df_review = pd.concat(
    list(map(lambda it: it[1].head(len(review_content_sets[it[0]])), enumerate(review_metadata_sets))),
    ignore_index=True, 
)

# Keep only the relevant columns
df_review = df_review[[0, 2, 3, 4, 8]] 
# Rename columns
df_review.columns = [
    "date", "user", "product", "label", "rating", 
]

### Re-mapping user / product ID

In [5]:
# Map user IDs to 0..N integer range
df_review["user"] = df_review["user"].apply(
    {user: idx for idx, user in enumerate(df_review["user"].unique())}.get
)
# Map product IDs to 0..N integer range
df_review["product"] = df_review["product"].apply(
    {product: idx for idx, product in enumerate(df_review["product"].unique())}.get
)

In [6]:
max_user_idx_length = len(str(df_review["user"].max()))
max_product_idx_length = len(str(df_review["product"].max()))

df_review["user"] = df_review["user"].apply({
    user: "yelpchi_user_{}".format(str.zfill(str(idx), max_user_idx_length))
    for idx, user in enumerate(df_review["user"].unique())
}.get)
df_review["product"] = df_review["product"].apply({
    product: "yelpchi_product_{}".format(str.zfill(str(idx), max_product_idx_length))
    for idx, product in enumerate(df_review["product"].unique())
}.get)

df_review.head()

Unnamed: 0,date,user,product,label,rating
0,6/8/2011,yelpchi_user_00000,yelpchi_product_000,N,5
1,8/30/2011,yelpchi_user_00001,yelpchi_product_000,N,3
2,6/26/2009,yelpchi_user_00002,yelpchi_product_000,N,5
3,9/16/2010,yelpchi_user_00003,yelpchi_product_000,N,1
4,2/5/2010,yelpchi_user_00004,yelpchi_product_000,N,3


### Normalize data

In [7]:
# Convert date strings to Python date objects
df_review.date = df_review.date.apply(
    lambda value: str(datetime.strptime(value, "%m/%d/%Y").date()),
)
# Concatenate review content arrays into one array
df_review["content"] = np.concatenate(review_content_sets)
# Reorder columns
df_review = df_review[["user", "product", "rating", "label", "date", "content"]]
# Map fraud label values to unified "fraud" value   
df_review.label = df_review.label.apply(lambda label: "fraud" if label == "Y" else "organic") 
# Display first 5 rows
display(df_review.head())

Unnamed: 0,user,product,rating,label,date,content
0,yelpchi_user_00000,yelpchi_product_000,5,organic,2011-06-08,Let me begin by saying that there are two kind...
1,yelpchi_user_00001,yelpchi_product_000,3,organic,2011-08-30,The only place inside the Loop that you can st...
2,yelpchi_user_00002,yelpchi_product_000,5,organic,2009-06-26,I have walked by the Tokyo Hotel countless tim...
3,yelpchi_user_00003,yelpchi_product_000,1,organic,2010-09-16,"If you are considering staying here, watch thi..."
4,yelpchi_user_00004,yelpchi_product_000,3,organic,2010-02-05,"This place is disgusting, absolutely horrible,..."


### Drop missing data (if yes) and checkout

In [8]:
# Drop missing rows
df_review = df_review.dropna()
# Print out DataFrame info to validate 
df_review.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67385 entries, 0 to 67384
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user     67385 non-null  object
 1   product  67385 non-null  object
 2   rating   67385 non-null  int64 
 3   label    67385 non-null  object
 4   date     67385 non-null  object
 5   content  67385 non-null  object
dtypes: int64(1), object(5)
memory usage: 3.1+ MB


## Export final data

In [9]:
# Export cleaned DataFrame to a CSV file
df_review.to_csv("../datasets/processed/yelpchi_reviews_preprocessed.csv", index=False)