# Notebook - Subset Generation

In [1]:
import pandas as pd

## Load Dataset

In [3]:
# Mount Google Drive into the Colab runtime at /content/drive; the parquet
# files loaded in the next section are read from beneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
# Read the train and test review subsets from the mounted Drive folder.
trainset, testset = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '/content/drive/MyDrive/W266_Final Pro/yelp_review_trainset_subset.parquet',
        '/content/drive/MyDrive/W266_Final Pro/yelp_review_testset_subset.parquet',
    )
)

## Data preprocessing

In [14]:
def preprocessing(df, min_words=100, max_words=200):
    """Add a word count to each review and keep only mid-length reviews.

    The input frame is left untouched: the ``word_count`` column is added to
    a new frame instead of being written into ``df``. This keeps the cell
    safe to re-run and avoids ``SettingWithCopyWarning`` when ``df`` is
    itself a slice of another frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"text"`` column of strings.
    min_words : int, default 100
        Inclusive lower bound on the whitespace-separated word count.
    max_words : int, default 200
        Exclusive upper bound on the whitespace-separated word count.

    Returns
    -------
    pandas.DataFrame
        New frame with an extra ``"word_count"`` column, restricted to rows
        where ``min_words <= word_count < max_words``. The original index
        labels of the surviving rows are preserved.
    """
    # 1. compute word_count on a fresh frame so the caller's data is not mutated
    counted = df.assign(
        word_count=df["text"].apply(lambda text: len(text.split()))
    )

    # 2. trim out short & long reviews in a single boolean mask
    in_range = (counted["word_count"] >= min_words) & (counted["word_count"] < max_words)

    # further preprocessing steps can be added here

    return counted[in_range]

In [15]:
# Run the same preprocessing over both splits and rebind the names to the results.
trainset, testset = preprocessing(trainset), preprocessing(testset)

## Sampling to generate a subset

In [16]:
# Per-label sampling plan for the training split: label -> (rows to draw, replace?).
# Labels 0, 1, 3 and 4 each contribute 300 rows drawn without replacement;
# label 2 contributes 500 rows drawn with replacement (duplicates are possible).
train_plan = {0: (300, False), 1: (300, False), 2: (500, True), 3: (300, False), 4: (300, False)}
sampled_trainset = pd.concat([
    trainset[trainset["label"] == label].sample(n=n_rows, random_state=1, replace=with_replacement)
    for label, (n_rows, with_replacement) in train_plan.items()
])

# Same scheme for the test split: 50 rows each for labels 0, 1, 3 and 4
# (no replacement) and 100 rows for label 2 (with replacement).
test_plan = {0: (50, False), 1: (50, False), 2: (100, True), 3: (50, False), 4: (50, False)}
sampled_testset = pd.concat([
    testset[testset["label"] == label].sample(n=n_rows, random_state=1, replace=with_replacement)
    for label, (n_rows, with_replacement) in test_plan.items()
])

In [17]:
def _to_three_class(label):
    """Collapse a label into three classes: <=1 -> 0, ==2 -> 1, >=3 -> 2."""
    if label <= 1:
        return 0
    if label == 2:
        return 1
    if label >= 3:
        return 2
    # Values matching none of the rules above are passed through unchanged.
    return label


# Relabel both sampled splits with the same rule.
sampled_trainset["label"] = sampled_trainset["label"].apply(_to_three_class)
sampled_testset["label"] = sampled_testset["label"].apply(_to_three_class)

In [19]:
# Discard every row whose label equals 1; rows with any other label are kept as-is.
sampled_trainset = sampled_trainset.loc[sampled_trainset["label"].ne(1)]
sampled_testset = sampled_testset.loc[sampled_testset["label"].ne(1)]

## Save to local disk

In [21]:
# Write both filtered subsets as parquet files, using paths relative to the
# current working directory.
for subset, out_name in (
    (sampled_trainset, 'yelp_review_trainset_subset_2_cls.parquet'),
    (sampled_testset, 'yelp_review_testset_subset_2_cls.parquet'),
):
    subset.to_parquet(out_name)