In [13]:
import pandas as pd
import numpy as np
import contractions
import textstat
from sklearn.model_selection import train_test_split
import re

category = "Jewelry"

# Prepare data

Link to the data set:

http://snap.stanford.edu/data/web-Amazon-links.html?fbclid=IwAR0uAj7gUjbwrIyN2UpPuZZnytV-P-iV8kcpOJtfmgboeLkDOdwkqqb_G58

## Read the data

In [33]:
def load_reviews_to_df(path) -> pd.DataFrame:
    raw_data = open(path).readlines()
    
    reviews_array = []
    dictionary = {}

    for review in raw_data:
        this_line = review.split(":")
        if len(this_line) > 1:
            dictionary[this_line[0]] = this_line[1].strip()
        else:
            reviews_array.append(dictionary)
            dictionary = {}

    colNames = ['productId', 'title', 'price', 'userId', 
                'profileName', 'helpfulness', 'score', 
                'time', 'summary', 'text']
    
    reviews = pd.DataFrame(reviews_array)
    reviews.columns = colNames
    reviews[['score']] = reviews[['score']].astype(float)
    reviews['time'] = pd.to_datetime(reviews['time'], unit='s')
    
    return reviews

In [34]:
path = f'{category}.txt'
reviews_df = load_reviews_to_df(path)
reviews_df.head()

Unnamed: 0,productId,title,price,userId,profileName,helpfulness,score,time,summary,text
0,B000FTPOMK,"14k Yellow Gold Butterfly Pendant, 16""",unknown,A3T4KKUM1JATUX,Disappointed Sony customer,1/1,4.0,2007-05-03,pretty necklace,"It is a nice made necklace, and the butterfly ..."
1,B000FTPOMK,"14k Yellow Gold Butterfly Pendant, 16""",unknown,A33LVMKCG9IG15,"A. Thorpe ""Amazon lover""",1/1,4.0,2007-04-08,pretty necklace,I bought this necklace on a whim; I love butte...
2,B000FTPOMK,"14k Yellow Gold Butterfly Pendant, 16""",unknown,A1PNSOEZQ4E9UT,"rabbit ""amazon rocks""",0/0,5.0,2007-07-10,Nice and weighty,"Bought via gold box ,it is much better than I ..."
3,B000FTPOMK,"14k Yellow Gold Butterfly Pendant, 16""",unknown,A3F12GSAOU0WRT,Jennifer S. Barbier,0/0,5.0,2007-06-08,Great necklace for kids or adults,I love this necklace. It is great for everyday...
4,B000FTPOMK,"14k Yellow Gold Butterfly Pendant, 16""",unknown,A2H69HCLAE6GME,Julia,0/0,5.0,2007-05-16,Absolutely satisfied with it,"Love this butterfly pendant. Right size, high ..."


## Clean up the data

### Extract numbers for helpfulness

In [4]:
reviews_df["helpfulness_num"] = reviews_df["helpfulness"].apply(lambda x: int(x.split("/")[0]))
reviews_df["helpfulness_den"] = reviews_df["helpfulness"].apply(lambda x: int(x.split("/")[1]))

### Clean up the comments

Used tutorial:

https://towardsdatascience.com/text-cleaning-methods-for-natural-language-processing-f2fc1796e8c7

**TBD: do the same transformations with the summary!**

1. Read the slang

Example:

In [36]:
reviews_df["text"][1]

"I bought this necklace on a whim; I love butterflies and it looked so dainty and sweet. It was actually a little more weighty than I expected, although it's not a solid piece. The chain is shiny and nicer than I expected."

In [37]:
text_example = "I bought this necklace on a whim; I love butterflies and it looked so dainty and sweet. It was actually a little more weighty than I expected, although it's not a solid piece. The chain is shiny and nicer than I expected."
text_example

"I bought this necklace on a whim; I love butterflies and it looked so dainty and sweet. It was actually a little more weighty than I expected, although it's not a solid piece. The chain is shiny and nicer than I expected."

In [40]:
text_example = contractions.fix(text_example, slang=True)
text_example

'I bought this necklace on a whim; I love butterflies and it looked so dainty and sweet. It was actually a little more weighty than I expected, although it is not a solid piece. The chain is shiny and nicer than I expected.'

In [7]:
reviews_df["text_fixed"] = reviews_df["text"].apply(contractions.fix, slang=True)

2. Normalization
    - remove punctuation
    - lower all letters
    - remove numbers
    
3. Remove stop words (noise)

4. Stemming and/or Lemmatisation

Ciekawostka (? do przeczytania) https://www.researchgate.net/publication/220959627_Training_Data_Cleaning_for_Text_Classification

In [41]:
# normalization

text_example = re.sub(r"\d+", "", re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text_example.lower()))
text_example

'i bought this necklace on a whim i love butterflies and it looked so dainty and sweet it was actually a little more weighty than i expected although it is not a solid piece the chain is shiny and nicer than i expected'

In [17]:
# import nltk.corpus
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# stop = stopwords.words('english')
# stop

# removing stop words is not a good idea for sentimental anylysis

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kabalce/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [20]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import word_tokenize

In [25]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/kabalce/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [26]:
ps = PorterStemmer()


['i', 'think', 'it', 'is', 'a', 'lovely', 'ring']

In [29]:
# nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package omw-1.4 to /home/kabalce/nltk_data...
[nltk_data]   Unzipping corpora/omw-1.4.zip.


In [42]:
text = word_tokenize(text_example)
" ".join([WordNetLemmatizer().lemmatize(i) for i in text])

'i bought this necklace on a whim i love butterfly and it looked so dainty and sweet it wa actually a little more weighty than i expected although it is not a solid piece the chain is shiny and nicer than i expected'

##  Extract text statistics

In [8]:
reviews_df["sign_number"] = reviews_df["text_fixed"].apply(len)
reviews_df["word_number"] = reviews_df["text_fixed"].apply(lambda x: len(x.split(" ")))
reviews_df["unique_word_number"] = reviews_df["text_fixed"].apply(lambda x: len(set(x.split(" "))))
reviews_df["flesch_reading_ease"] = reviews_df["text_fixed"].apply(textstat.flesch_reading_ease)
reviews_df["flesch_kincaid_grade"] = reviews_df["text_fixed"].apply(textstat.flesch_kincaid_grade)
reviews_df["smog_index"] = reviews_df["text_fixed"].apply(textstat.smog_index)
reviews_df["coleman_liau_index"] = reviews_df["text_fixed"].apply(textstat.coleman_liau_index)
reviews_df["automated_readability_index"] = reviews_df["text_fixed"].apply(textstat.automated_readability_index)
reviews_df["dale_chall_readability_score"] = reviews_df["text_fixed"].apply(textstat.dale_chall_readability_score)
reviews_df["difficult_words"] = reviews_df["text_fixed"].apply(textstat.difficult_words)
reviews_df["linsear_write_formula"] = reviews_df["text_fixed"].apply(textstat.linsear_write_formula)
reviews_df["gunning_fog"] = reviews_df["text_fixed"].apply(textstat.gunning_fog)
reviews_df["text_standard"] = reviews_df["text_fixed"].apply(textstat.text_standard)
reviews_df["fernandez_huerta"] = reviews_df["text_fixed"].apply(textstat.fernandez_huerta)
reviews_df["szigriszt_pazos"] = reviews_df["text_fixed"].apply(textstat.szigriszt_pazos)
reviews_df["gutierrez_polini"] = reviews_df["text_fixed"].apply(textstat.gutierrez_polini)
reviews_df["crawford"] = reviews_df["text_fixed"].apply(textstat.crawford)
reviews_df["gulpease_index"] = reviews_df["text_fixed"].apply(textstat.gulpease_index)
reviews_df["osman"] = reviews_df["text_fixed"].apply(textstat.osman)

### Extract time features

In [9]:
reviews_df["time_month"] = pd.DatetimeIndex(reviews_df["time"]).month
reviews_df["time_weekday"] = pd.DatetimeIndex(reviews_df["time"]).weekday

In [10]:
reviews_df.head(10)

Unnamed: 0,productId,title,price,userId,profileName,helpfulness,score,time,summary,text,...,gunning_fog,text_standard,fernandez_huerta,szigriszt_pazos,gutierrez_polini,crawford,gulpease_index,osman,time_month,time_weekday
0,B000FTPOMK,"14k Yellow Gold Butterfly Pendant, 16""",unknown,A3T4KKUM1JATUX,Disappointed Sony customer,1/1,4.0,2007-05-03,pretty necklace,"It is a nice made necklace, and the butterfly ...",...,3.0,3rd and 4th grade,115.2,112.12,51.19,0.7,84.333333,86.78,5,3
1,B000FTPOMK,"14k Yellow Gold Butterfly Pendant, 16""",unknown,A33LVMKCG9IG15,"A. Thorpe ""Amazon lover""",1/1,4.0,2007-04-08,pretty necklace,I bought this necklace on a whim; I love butte...,...,8.51,5th and 6th grade,114.3,109.92,50.71,1.7,68.069767,82.21,4,6
2,B000FTPOMK,"14k Yellow Gold Butterfly Pendant, 16""",unknown,A1PNSOEZQ4E9UT,"rabbit ""amazon rocks""",0/0,5.0,2007-07-10,Nice and weighty,"Bought via gold box ,it is much better than I ...",...,6.0,5th and 6th grade,119.6,117.08,50.5,1.1,66.333333,85.62,7,1
3,B000FTPOMK,"14k Yellow Gold Butterfly Pendant, 16""",unknown,A3F12GSAOU0WRT,Jennifer S. Barbier,0/0,5.0,2007-06-08,Great necklace for kids or adults,I love this necklace. It is great for everyday...,...,3.2,1st and 2nd grade,126.7,126.15,54.0,-0.4,85.666667,94.94,6,4
4,B000FTPOMK,"14k Yellow Gold Butterfly Pendant, 16""",unknown,A2H69HCLAE6GME,Julia,0/0,5.0,2007-05-16,Absolutely satisfied with it,"Love this butterfly pendant. Right size, high ...",...,6.56,6th and 7th grade,108.0,104.72,48.89,1.6,76.692308,77.6,5,2
5,B000PAMAPI,Sterling Silver Scroll Design Garnet Ring by S...,unknown,A1U36NQMIRUHST,"C. Tremari ""cactuspetelady""",3/3,5.0,2008-05-23,Beautiful ring,I bought this as a graduation gift for my daug...,...,6.95,4th and 5th grade,116.5,111.37,52.02,1.5,72.082707,84.52,5,4
6,B000PEM42S,Sterling Silver Oval Shaped Locket,unknown,A2C4IVOBTG9W81,Not my name,1/1,3.0,2009-02-09,Just ok...,The locket has a very nice appearance and ther...,...,11.17,8th and 9th grade,106.1,102.69,48.72,2.1,61.921348,77.68,2,0
7,B000KN6ZOM,Amazon.com,unknown,A1KH38J5M2WCTX,greenchic,0/0,1.0,2008-01-11,Ordered 2; Both Chains Were Tarnished and Pend...,I have generally had very good experience orde...,...,11.61,10th and 11th grade,98.8,95.13,41.61,2.8,55.226415,54.65,1,4
8,B000PCZ6UC,14K Yellow Gold Ruby and Diamond Bracelet,unknown,AABOLDP3KO54N,Mac Fan,0/0,5.0,2009-01-06,Good Value,"A beautiful, delicate little bracelet. I feel ...",...,6.1,5th and 6th grade,112.1,107.34,47.18,1.6,66.142857,72.68,1,1
9,B000PAMAQM,Sterling Silver Malachite Scroll Design Ring b...,unknown,A31178QVC2BZC6,Hibiscus,3/3,5.0,2007-08-10,nice quality,I think it's a lovely ring. The silver part of...,...,4.92,2nd and 3rd grade,128.3,123.78,55.23,0.5,75.756757,96.78,8,4


# Basic analysis

In [11]:
reviews_df.describe()

Unnamed: 0,score,helpfulness_num,helpfulness_den,sign_number,word_number,unique_word_number,flesch_reading_ease,flesch_kincaid_grade,smog_index,coleman_liau_index,...,linsear_write_formula,gunning_fog,fernandez_huerta,szigriszt_pazos,gutierrez_polini,crawford,gulpease_index,osman,time_month,time_weekday
count,58621.0,58621.0,58621.0,58621.0,58621.0,58621.0,58621.0,58621.0,58621.0,58621.0,...,58621.0,58621.0,58621.0,58621.0,58621.0,58621.0,58621.0,58621.0,58621.0,58621.0
mean,4.197796,1.499667,1.761161,270.478054,51.541529,40.265894,79.382928,6.28797,5.093376,6.084322,...,8.57166,8.569933,111.507818,108.922772,48.92243,1.378948,69.307985,78.471089,6.105457,2.810802
std,1.241825,4.176168,4.669166,223.939763,42.541051,25.58968,15.533905,4.510763,4.308894,2.674303,...,6.31145,4.638919,13.48574,13.415512,5.547165,1.127627,14.253497,16.215347,3.890987,1.953086
min,1.0,0.0,0.0,0.0,1.0,1.0,-388.42,-15.7,0.0,-33.81,...,-1.0,0.0,-216.3,-232.27,-40.96,-19.1,-1.833333,-128.59,1.0,0.0
25%,4.0,0.0,0.0,135.0,25.0,23.0,72.5,3.8,0.0,4.48,...,4.9,6.0,106.3,103.71,46.49,0.9,61.954545,71.07,2.0,1.0
50%,5.0,0.0,0.0,207.0,40.0,33.0,81.22,5.4,6.4,5.98,...,6.8,7.83,113.4,110.85,49.61,1.4,68.090909,80.21,6.0,3.0
75%,5.0,1.0,2.0,331.0,63.0,50.0,88.77,7.6,8.8,7.55,...,10.6,9.89,119.6,116.96,52.32,2.0,75.060606,88.42,10.0,4.0
max,5.0,218.0,229.0,6178.0,1089.0,460.0,206.84,88.3,18.2,35.8,...,68.0,92.39,206.9,143.54,75.6,24.1,369.0,199.78,12.0,6.0


1. Unbalanced scores (mostly 5)

2. Few helpfulness information

3. Outliers

# Try to build any model

Train-test split

Predict the score using the text statistics

In [146]:
predictors = [#'sign_number', 
    'word_number',
       'unique_word_number', # 'flesch_reading_ease', 
              #'flesch_kincaid_grade',
       'smog_index', 'coleman_liau_index', #'automated_readability_index',
       'dale_chall_readability_score', #'difficult_words',
       'linsear_write_formula', 'gunning_fog', #'text_standard',
       'fernandez_huerta', 'szigriszt_pazos', 'gutierrez_polini', #'crawford',
       'gulpease_index', #'osman', 'time_month', 'time_weekday'
             ]


y = "score"

X_train, X_test, y_train, y_test = train_test_split(reviews_df[predictors], reviews_df[y], 
                                                    test_size=0.1, stratify=reviews_df[y])

In [179]:
from xgboost import XGBRegressor

model = XGBRegressor(enable_categorical=True, #scale_pos_weight=100, 
                      weight = [0.001 if x == 5 else 1 for x in y_train],
#                       objective = "multi:softmax", 
                      learning_rate=0.3)
model.fit(X_train, y_train)# , sample_weight=[0.4 if x == 5 else 0.7 for x in y_train])

Parameters: { enable_categorical, weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=True,
             gamma=0, gpu_id=-1, importance_type='gain',
             interaction_constraints='', learning_rate=0.3, max_delta_step=0,
             max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1,
             scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None,
             weight=[0.001, 1, 1, 0.001, 0.001, 1, 0.001, 1, 0.001, 0.001,
                     0.001, 0.001, 1, 1, 0.001, 1, 1, 1, 0.001, 1, 0.001, 1, 1,
                     1, 1, 0.001, 0.001, 1, 1, 1, ...])

In [180]:
print(predictors)

model.feature_importances_

['word_number', 'unique_word_number', 'smog_index', 'coleman_liau_index', 'dale_chall_readability_score', 'linsear_write_formula', 'gunning_fog', 'fernandez_huerta', 'szigriszt_pazos', 'gutierrez_polini', 'gulpease_index']


array([0.06980664, 0.07189302, 0.11672159, 0.07355583, 0.08556743,
       0.10787705, 0.09199776, 0.08984181, 0.10347147, 0.09617429,
       0.09309315], dtype=float32)

In [181]:
y_pred = model.predict(X_test)
y_pred = y_pred.round()
# y_test = y_test == 5

In [182]:
print(((y_pred == y_test.values) & (y_pred != 5)).mean())

print((y_pred == y_test.values).mean())

print((y_pred > y_test.values).mean())

print((y_pred < y_test.values).mean())

print((y_pred != 5).mean())

print((y_test != 5).mean())

0.15452839843083746
0.34402183182670987
0.2241173460685656
0.43186082210472454
0.7685485246460856
0.38597987378475185


In [183]:
print(((y_pred == y_test.values) & (y_pred != 5)).mean())

print((y_pred > y_test.values).mean())

print((y_pred < y_test.values).mean())

print((y_pred != 5).mean())

print((y_test != 5).mean())

print((y_test - y_pred).abs().mean())

pd.DataFrame({"test": y_test, "pred": y_pred}).value_counts()

0.15452839843083746
0.2241173460685656
0.43186082210472454
0.7685485246460856
0.38597987378475185
0.8661095002558418


test  pred
5.0   4.0     2404
      5.0     1111
4.0   4.0      834
3.0   4.0      400
1.0   4.0      315
2.0   4.0      248
4.0   5.0      131
5.0   3.0       81
3.0   3.0       54
      5.0       50
1.0   3.0       50
4.0   3.0       42
2.0   3.0       37
1.0   5.0       35
2.0   5.0       30
1.0   2.0       18
      1.0       14
2.0   2.0        4
5.0   2.0        4
3.0   2.0        1
dtype: int64

Text statistics seem to be useless.

# Data for modeling

In [169]:
scores_df = reviews_df[["userId", "productId", "score"]]
scores_df.to_csv("scores.csv")
scores_df

Unnamed: 0,userId,productId,score
0,A3T4KKUM1JATUX,B000FTPOMK,4.0
1,A33LVMKCG9IG15,B000FTPOMK,4.0
2,A1PNSOEZQ4E9UT,B000FTPOMK,5.0
3,A3F12GSAOU0WRT,B000FTPOMK,5.0
4,A2H69HCLAE6GME,B000FTPOMK,5.0
...,...,...,...
58616,A34KBEZHBM0SX3,B000NGJ3CM,5.0
58617,A2JHH8XII9XEC8,B000NGJ3CM,3.0
58618,A30ZWFIQXEIIVO,B000O3KHPQ,5.0
58619,A1PR64NER7KUP6,B000O3KHPQ,5.0
