# Preprocessing YelpNYC

In [1]:
# Set dataset name variable
ds_name = "YelpNYC"

## Import libraries

In [2]:
# Import libraries
import numpy as np # For numerical processing
import pandas as pd # For dataframe processing
from datetime import datetime # For date processing

## Process review metadata

### Read review metadata

In [3]:
# Read in review metadata 
df_meta = pd.read_csv("../datasets/raw/{0}/metadata".format(ds_name), sep="\t", header=None)
df_meta.columns = ["user", "product", "rating", "label", "date"]
df_meta.head()

Unnamed: 0,user,product,rating,label,date
0,923,0,3.0,-1,2014-12-08
1,924,0,3.0,-1,2013-05-16
2,925,0,4.0,-1,2013-07-01
3,926,0,4.0,-1,2011-07-28
4,927,0,4.0,-1,2010-11-01


In [4]:
# Print dataframe information
df_meta.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 359052 entries, 0 to 359051
Data columns (total 5 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   user     359052 non-null  int64  
 1   product  359052 non-null  int64  
 2   rating   359052 non-null  float64
 3   label    359052 non-null  int64  
 4   date     359052 non-null  object 
dtypes: float64(1), int64(3), object(1)
memory usage: 13.7+ MB


### Normalize data

In [5]:
# Convert date strings to datetime objects
df_meta.date = df_meta.date.apply(lambda value: str(datetime.strptime(value, "%Y-%m-%d").date()))  

# Convert rating to integer
df_meta.rating = df_meta.rating.astype(int)   

# Map label to category value
df_meta.label = df_meta.label.apply(lambda label: "fraud" if label == -1 else "organic")

df_meta.head()

Unnamed: 0,user,product,rating,label,date
0,923,0,3,fraud,2014-12-08
1,924,0,3,fraud,2013-05-16
2,925,0,4,fraud,2013-07-01
3,926,0,4,fraud,2011-07-28
4,927,0,4,fraud,2010-11-01


## Process review content

### Read review content

In [6]:
# Read in review content
df_content = pd.read_csv("../datasets/raw/{0}/reviewContent".format(ds_name), sep="\t", header=None)
df_content.columns = ["user", "product", "date", "content"]
df_content.head()

Unnamed: 0,user,product,date,content
0,923,0,2014-12-08,The food at snack is a selection of popular Gr...
1,924,0,2013-05-16,This little place in Soho is wonderful. I had ...
2,925,0,2013-07-01,ordered lunch for 15 from Snack last Friday. ...
3,926,0,2011-07-28,This is a beautiful quaint little restaurant o...
4,927,0,2010-11-01,Snack is great place for a casual sit down lu...


In [7]:
df_content.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 358957 entries, 0 to 358956
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   user     358957 non-null  int64 
 1   product  358957 non-null  int64 
 2   date     358957 non-null  object
 3   content  358957 non-null  object
dtypes: int64(2), object(2)
memory usage: 11.0+ MB


### Map content with (user ID, product Id and date), then check if there is duplicates

In [8]:
# Group by user, product, date
# Aggregate content into lists
content_mapping = list(df_content.groupby(["user", "product", "date"]).agg(list).to_dict().values())

# Print max number of reviews per user/product/date  
print(
    "Maximum number of review texts with the same user, product and date:", 
    max(map(lambda it: max(map(len, it.values())), content_mapping)),
)

Maximum number of review texts with the same user, product and date: 1


In [9]:
# Flatten content mapping dictionary
content_mapping = {k: v[0] for it in content_mapping for k, v in it.items()}

## Compose final review data 

### Fill content with corresponding metadata to the final data

In [10]:
# Copy metadata dataframe
df_review = df_meta.copy()  

# Add content column filled with NaNs
df_review["content"] = np.nan
df_review = df_review[["content", *df_meta.columns]]

# Fill content by merging 
df_review.content = df_review.apply(
    lambda row: content_mapping[row[1], row[2], row[-1]] if (row[1], row[2], row[-1]) in content_mapping else np.nan,
    axis=1, 
)
df_review.head()

  lambda row: content_mapping[row[1], row[2], row[-1]] if (row[1], row[2], row[-1]) in content_mapping else np.nan,


Unnamed: 0,content,user,product,rating,label,date
0,The food at snack is a selection of popular Gr...,923,0,3,fraud,2014-12-08
1,This little place in Soho is wonderful. I had ...,924,0,3,fraud,2013-05-16
2,ordered lunch for 15 from Snack last Friday. ...,925,0,4,fraud,2013-07-01
3,This is a beautiful quaint little restaurant o...,926,0,4,fraud,2011-07-28
4,Snack is great place for a casual sit down lu...,927,0,4,fraud,2010-11-01


### Drop NaN data (if yes)

In [11]:
# Drop missing rows
df_review = df_review.dropna()

### Re-mapping User / Product ID

In [12]:
max_user_idx_length = len(str(df_review["user"].nunique()))
max_product_idx_length = len(str(df_review["product"].nunique()))

user_mapping = {
    user: "{}_user_{}".format(ds_name.lower(), str.zfill(str(idx), max_user_idx_length))
    for idx, user in enumerate(df_review["user"].unique())
}
product_mapping = {
    product: "{}_product_{}".format(ds_name.lower(), str.zfill(str(idx), max_product_idx_length))
    for idx, product in enumerate(df_review["product"].unique())
}

df_review["user"] = df_review["user"].apply(user_mapping.get)
df_review["product"] = df_review["product"].apply(product_mapping.get)
df_review.head()

Unnamed: 0,content,user,product,rating,label,date
0,The food at snack is a selection of popular Gr...,yelpnyc_user_000000,yelpnyc_product_000,3,fraud,2014-12-08
1,This little place in Soho is wonderful. I had ...,yelpnyc_user_000001,yelpnyc_product_000,3,fraud,2013-05-16
2,ordered lunch for 15 from Snack last Friday. ...,yelpnyc_user_000002,yelpnyc_product_000,4,fraud,2013-07-01
3,This is a beautiful quaint little restaurant o...,yelpnyc_user_000003,yelpnyc_product_000,4,fraud,2011-07-28
4,Snack is great place for a casual sit down lu...,yelpnyc_user_000004,yelpnyc_product_000,4,fraud,2010-11-01


## Export final data

In [13]:
# Print out DataFrame info to validate 
df_review.info()

<class 'pandas.core.frame.DataFrame'>
Index: 358957 entries, 0 to 359051
Data columns (total 6 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   content  358957 non-null  object
 1   user     358957 non-null  object
 2   product  358957 non-null  object
 3   rating   358957 non-null  int64 
 4   label    358957 non-null  object
 5   date     358957 non-null  object
dtypes: int64(1), object(5)
memory usage: 19.2+ MB


In [14]:
# Export cleaned DataFrame to a CSV file
df_review.to_csv("../datasets/processed/{0}_reviews_preprocessed.csv".format(ds_name.lower()), index=False)