# Import Libraries

In [10]:
import json
import pandas as pd
from ydata_profiling import ProfileReport

# Load Data

## Load JSON File

Datasets used can be downloaded [here](https://nijianmo.github.io/amazon/index.html). For experimentation purposes, used the smaller dataset for faster iteration

In [22]:
filepath = "/Users/karlpotenciano/Downloads/Grocery_and_Gourmet_Food_5.json"

file = open(filepath, "r")
review_lines = file.readlines()
file.close()

## Transform JSON String to Dict Objects

In [24]:
processed_review_lines = [json.loads(line.strip()) for line in review_lines]

## Convert List of Dict to DataFrame

In [27]:
reviews_df = pd.DataFrame(processed_review_lines)
### Add Review Length as a possible filter
reviews_df['review_length'] = reviews_df.reviewText.str.len()

In [28]:
ProfileReport(reviews_df)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



# Refining the data

## Remove Non Verified Reviews

In [43]:
verified_reviews = reviews_df[reviews_df.verified]

## Remove Review Length Outliers

Explain why considering review length

In [44]:
quartiles = verified_reviews.review_length.quantile([0.25, 0.75])

In [45]:
verified_reviews = verified_reviews[(verified_reviews.review_length >= quartiles[0.25]) & (verified_reviews.review_length <= quartiles[0.75])]

## Change Labels

5-4 positive

3 neutral

2-1 negative

In [63]:
def change_dataframe_labels(df: pd.DataFrame) -> pd.DataFrame:
    labels = {
        5.0: 'positive',
        4.0: 'positive',
        3.0: 'neutral', 
        2.0: 'negative',
        1.0: 'negative'
    }
    relevant_fields = ['overall', 'reviewText']
    
    df = df[relevant_fields]
    df.drop_duplicates(inplace=True)
    df['expected_sentiment'] = df.overall.apply(lambda rating: labels.get(rating))
    df.rename({'reviewText': 'text'},axis=1,inplace=True)
    
    df.drop('overall', axis=1, inplace=True)
    
    return df
    
    

In [64]:
procesed_df = change_dataframe_labels(verified_reviews)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop_duplicates(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['expected_sentiment'] = df.overall.apply(lambda rating: labels.get(rating))
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename({'reviewText': 'text'},axis=1,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/use

In [67]:
round(procesed_df.expected_sentiment.value_counts().min() * 0.1)

3106

In [68]:
procesed_df.groupby('expected_sentiment', group_keys=False).apply(lambda x: x.sample(3))

Unnamed: 0,text,expected_sentiment
685647,Looks good but forget the taste- much too swee...,negative
863246,"This stuff just WOULD NOT COOK, no matter how ...",negative
865637,Was cut super thin so it fell apart and was ta...,negative
608284,I like everything but the rather dull taste of...,neutral
1007039,Not nearly as savory as the Nespresso original,neutral
1102788,Normally good! Last order they were terrible! ...,neutral
1001886,Too expensive but best tasting iced tea in the...,positive
699092,These are great! I make snack mixes and these...,positive
524242,Very tasty as a way to add flavor to your wate...,positive
