In [None]:
import pandas as pd
import numpy as np
import contractions
import textstat

# Review category to analyse; the input file name is derived from it (f'{category}.txt').
category = "Jewelry"

# Prepare data

Link to the data set:

http://snap.stanford.edu/data/web-Amazon-links.html

## Read the data

In [None]:
def load_reviews_to_df(path) -> pd.DataFrame:
    """Parse a ``key: value`` review dump into a DataFrame, one row per review.

    Every line that contains a colon is read as one ``key: value`` field of the
    current review. Any line without a colon (in practice the blank separator
    line) closes the current review. Repeated separator lines are ignored, and
    a final review that is not followed by a separator is still kept.

    Parameters
    ----------
    path : str or path-like
        Location of the text file to read.

    Returns
    -------
    pd.DataFrame
        Columns ``productId, title, price, userId, profileName, helpfulness,
        score, time, summary, text``. ``score`` is converted to float and
        ``time`` to datetime (the raw value is interpreted as seconds since
        the epoch); all other columns keep the stripped raw strings.

    Raises
    ------
    ValueError
        Raised by pandas when the parsed records do not yield exactly ten
        fields, because the columns are renamed positionally.
    """
    reviews_array = []
    dictionary = {}

    # The context manager guarantees the handle is closed; iterating over it
    # streams the file instead of loading every line with readlines().
    with open(path) as raw_data:
        for review in raw_data:
            # Split on the FIRST colon only, so values that contain colons
            # themselves (review text, titles, ...) are not truncated.
            key, separator, value = review.partition(":")
            if separator:
                dictionary[key] = value.strip()
            elif dictionary:
                # Separator line: the current review is complete.
                reviews_array.append(dictionary)
                dictionary = {}

    # Keep the last review when the file does not end with a separator line.
    if dictionary:
        reviews_array.append(dictionary)

    colNames = ['productId', 'title', 'price', 'userId',
                'profileName', 'helpfulness', 'score',
                'time', 'summary', 'text']

    reviews = pd.DataFrame(reviews_array)
    # Positional rename: assumes the fields appear in the file in this order.
    reviews.columns = colNames
    reviews['score'] = reviews['score'].astype(float)
    # Convert the raw strings to numbers first: combining string input with
    # `unit` in pd.to_datetime is deprecated in recent pandas versions.
    reviews['time'] = pd.to_datetime(pd.to_numeric(reviews['time']), unit='s')

    return reviews

In [None]:
# The input file is named after the selected category and is resolved
# relative to the current working directory.
path = f'{category}.txt'
reviews_df = load_reviews_to_df(path)
# Preview the first rows of the parsed reviews.
reviews_df.head()

## Clean up the data

### Extract numbers for helpfulness

In [None]:
# Split each "<numerator>/<denominator>" string once, then cast both parts to integers.
helpfulness_parts = reviews_df["helpfulness"].str.split("/")
reviews_df["helpfulness_num"] = helpfulness_parts.str[0].astype("int64")
reviews_df["helpfulness_den"] = helpfulness_parts.str[1].astype("int64")

### Clean up the comments

Used tutorial:

https://towardsdatascience.com/text-cleaning-methods-for-natural-language-processing-f2fc1796e8c7

**TBD: do the same transformations with the summary!**

1. Expand contractions and slang

Example:

In [None]:
# Sample review (row label 9) before any cleaning.
reviews_df.loc[9, "text"]

In [None]:
# The same sample review after expanding contractions and slang.
contractions.fix(reviews_df.loc[9, "text"], slang=True)

In [None]:
# Expand contractions (and slang, via slang=True) in every review text.
reviews_df["text_fixed"] = reviews_df["text"].map(
    lambda review_text: contractions.fix(review_text, slang=True)
)

2. Normalization
    - remove punctuation
    - lower all letters
    - remove numbers
    
3. Remove stop words (noise)

4. Stemming and/or Lemmatisation

Side note (possibly worth reading): https://www.researchgate.net/publication/220959627_Training_Data_Cleaning_for_Text_Classification

##  Extract text statistics

In [None]:
# Length of each cleaned review in characters.
reviews_df["sign_number"] = reviews_df["text_fixed"].str.len()

# Tokenise on any run of whitespace. Splitting on a single " " would count the
# empty strings produced by consecutive spaces as words, would not split on
# tabs/newlines, and would report one word for an empty text.
text_tokens = reviews_df["text_fixed"].str.split()
reviews_df["word_number"] = text_tokens.str.len()
reviews_df["unique_word_number"] = text_tokens.apply(lambda words: len(set(words)))

# textstat metrics; each result is stored in a column named after the function.
# NOTE(review): per the textstat documentation, fernandez_huerta,
# szigriszt_pazos, gutierrez_polini and crawford are designed for Spanish,
# gulpease_index for Italian and osman for Arabic text — confirm they are
# meaningful for these reviews before using them as features.
textstat_metrics = [
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    "smog_index",
    "coleman_liau_index",
    "automated_readability_index",
    "dale_chall_readability_score",
    "difficult_words",
    "linsear_write_formula",
    "gunning_fog",
    "text_standard",
    "fernandez_huerta",
    "szigriszt_pazos",
    "gutierrez_polini",
    "crawford",
    "gulpease_index",
    "osman",
]
for metric_name in textstat_metrics:
    reviews_df[metric_name] = reviews_df["text_fixed"].apply(getattr(textstat, metric_name))

### Extract time features

In [None]:
# Calendar features of the review timestamp (pandas numbers weekdays from Monday=0).
review_time = reviews_df["time"].dt
reviews_df["time_month"] = review_time.month
reviews_df["time_weekday"] = review_time.weekday

In [None]:
# Inspect the first ten rows, now including the derived feature columns.
reviews_df.head(10)

# Basic analysis

In [None]:
# Summary statistics of the DataFrame columns (pandas' default describe() output).
reviews_df.describe()

1. Unbalanced scores (mostly 5)

2. Little helpfulness information

3. Outliers