In [40]:
from datetime import datetime
import re
from functools import reduce 

import pandas as pd

In [55]:
# Read the tab-separated product listing and preview its first rows.
products_df = pd.read_csv(
    r'./facemasks/products.tsv',
    sep='\t',
)
products_df.head(n=5)

Unnamed: 0,product_id,product_name,product_price,price_currency,product_availability,product_url,source_url
0,103205,"Hwipure, Disposable KF94 ( N95 / KN95/ FFP2 ) ...",2.95,AUD,http://schema.org/InStock,https://au.iherb.com/pr/Hwipure-Disposable-KF9...,https://au.iherb.com/c/protective-face-masks
1,101774,"HIGUARD, Disposable KF94 ( N95 / KN95/ FFP2 ) ...",2.95,AUD,http://schema.org/InStock,https://au.iherb.com/pr/HIGUARD-Disposable-KF9...,https://au.iherb.com/c/protective-face-masks
2,101955,"SunJoy, KN95, Professional Protective Disposab...",8.86,AUD,http://schema.org/InStock,https://au.iherb.com/pr/SunJoy-KN95-Profession...,https://au.iherb.com/c/protective-face-masks
3,103838,"Lozperi, Copper Mask, Adult, Black, 1 Mask",6.85,AUD,http://schema.org/InStock,https://au.iherb.com/pr/Lozperi-Copper-Mask-Ad...,https://au.iherb.com/c/protective-face-masks
4,102734,"Zidian, Disposable Protective Mask, 50 Pack",15.35,AUD,http://schema.org/InStock,https://au.iherb.com/pr/Zidian-Disposable-Prot...,https://au.iherb.com/c/protective-face-masks


In [56]:
# Remove the columns that are not used further on. `errors='ignore'` keeps the
# cell re-runnable: on a second execution the columns are already gone and a
# plain drop would raise KeyError.
products_df = products_df.drop(
    columns=['price_currency', 'product_availability', 'source_url', 'product_url'],
    errors='ignore',
)

# `size` is the first integer in the product name that is followed by a space
# and a word (e.g. "50 Pack" -> "50"). The value is kept as a string, as before.
# Names without such a pattern yield NaN instead of raising IndexError.
products_df['size'] = products_df['product_name'].str.extract(
    r'([0-9]+) [a-zA-Z]+', expand=False
)

products_df = products_df.sort_values('product_id')

products_df.head()

Unnamed: 0,product_id,product_name,product_price,size
19,99829,"Kosette, Nano Reusable Face Protection Mask, L...",22.44,1
17,99830,"Kosette, Nano Reusable Face Protection Mask, M...",22.44,1
24,100099,"Luseta Beauty, Disposable Protection Face Mask...",49.61,50
25,100234,"Luseta Beauty, Disposable Protection Face Mask...",44.31,50
15,100237,"Kitsch, 100% Cotton Reuseable Face Masks, Leop...",17.72,3


In [57]:
# Read the tab-separated review dump and preview its first rows.
reviews_df = pd.read_csv(
    './facemasks/reviews.tsv',
    sep='\t',
)
reviews_df.head(n=5)

Unnamed: 0,abuseCount,customerNickname,helpfulNo,helpfulYes,id,imagesCount,languageCode,postedDate,productId,profileInfo.ugcSummary.answerCount,profileInfo.ugcSummary.reviewCount,ratingValue,reviewText,reviewTitle,reviewed,score,languageCode.1,translation.reviewText,translation.reviewTitle
0,0,iHerb Customer,0,6,05c2b17e-c28d-4792-930d-27e787d8d4ad,1,en-US,2021-01-27T09:04:10.569Z,103839,0.0,34.0,50,The mask quality and the color is good. It fit...,Dotted Pattern Is Nice,True,1614071000.0,en-US,,
1,0,iHerb Customer,0,0,80e44af8-2edf-4b81-a80a-7e7888d03cc0,0,ru-RU,2021-02-07T00:56:39.055Z,103839,0.0,37.0,50,Внуку очень понравилось. Удобная маска.,Прекрасно!,False,1612659000.0,ru-RU,The grandson really liked it. Comfortable mask.,en-US
2,0,iHerb Customer,0,0,9a76e047-21e4-4da3-8b50-9d2396519b6b,0,en-US,2021-02-06T21:40:02.886Z,103839,0.0,3.0,40,Easy to put on & comfortable to wear.,Good,False,1612648000.0,en-US,,
3,0,InnaIgorevna,0,0,2890ac54-8707-418e-be3e-8d46231e3672,0,ru-RU,2021-02-05T16:29:28.906Z,103839,0.0,16.0,40,"Тонкая, приятная к телу маска. Расцветка тоже ...",Хорошо,False,1612543000.0,ru-RU,A thin mask that is pleasant to the body. I li...,en-US
4,0,iHerb Customer,0,0,9db33354-0457-4efa-bc9c-b5f7ee0eff31,0,ru-RU,2021-02-05T09:43:42.367Z,103839,0.0,19.0,50,Отличная маска! Мне подошла идеально. Слегка е...,Прекрасно!,False,1612518000.0,ru-RU,Great mask! It suited me perfectly. There is a...,en-US


In [69]:
def string_to_date(string):
    """Parse a ``YYYY-MM-DDTHH:MM:SS[.ffffff]Z`` timestamp into a naive datetime.

    The fractional-seconds form is tried first; if it does not match, the
    whole-second form is used instead.

    Raises:
        ValueError: if the string matches neither format.
    """
    with_fraction = "%Y-%m-%dT%H:%M:%S.%fZ"
    whole_seconds = "%Y-%m-%dT%H:%M:%SZ"
    try:
        return datetime.strptime(string, with_fraction)
    except ValueError:
        # No fractional part present: fall back to the whole-second layout.
        return datetime.strptime(string, whole_seconds)

def _earliest_posted_date(series):
    """Return the earliest timestamp string in `series`, with 'T'/'Z' replaced by spaces."""
    # NOTE: replacing the trailing 'Z' leaves a trailing space in the result;
    # this is kept so the written CSV matches the previous output.
    return re.sub('[TZ]', ' ', min(series, key=string_to_date))


# Earliest review timestamp per product, indexed by productId.
first_review_by_product = (
    reviews_df
    .groupby('productId')['postedDate']
    .agg(_earliest_posted_date)
)

# Align on the product id rather than on row position. A positional assignment
# is only correct when products_df is sorted by product_id and both tables
# contain exactly the same products; otherwise it either fails with a length
# mismatch or attaches dates to the wrong products. Products without any
# review get NaN.
products_df['posting_date'] = products_df['product_id'].map(first_review_by_product)

products_df.to_csv('./facemasks/products_cleaned.csv')
products_df.head()

Unnamed: 0,product_id,product_name,product_price,size,posting_date
19,99829,"Kosette, Nano Reusable Face Protection Mask, L...",22.44,1,2020-05-12 19:56:28.293
17,99830,"Kosette, Nano Reusable Face Protection Mask, M...",22.44,1,2020-05-12 13:35:10.337
24,100099,"Luseta Beauty, Disposable Protection Face Mask...",49.61,50,2020-05-17 07:37:56.32
25,100234,"Luseta Beauty, Disposable Protection Face Mask...",44.31,50,2020-06-03 20:54:06.572
15,100237,"Kitsch, 100% Cotton Reuseable Face Masks, Leop...",17.72,3,2020-06-25 20:28:26.343


In [70]:
# Build the cleaned frame from reviews_df without modifying reviews_df itself,
# so the cell gives the same result every time it is run (an in-place
# `ratingValue // 10` would divide again on each re-run: 50 -> 5 -> 0).
reviews_df_cleaned = (
    reviews_df
    .assign(
        # Use the translated text where one exists, the original text otherwise.
        reviewText=reviews_df['translation.reviewText'].fillna(reviews_df['reviewText']),
        # Ratings are stored as 10, 20, ..., 50; rescale by integer division.
        ratingValue=reviews_df['ratingValue'] // 10,
    )
    .drop(columns=[
        'customerNickname', 'id', 'imagesCount',
        'profileInfo.ugcSummary.answerCount', 'profileInfo.ugcSummary.reviewCount',
        'reviewTitle', 'reviewed', 'score',
        'languageCode.1', 'translation.reviewText', 'translation.reviewTitle',
    ])
)

reviews_df_cleaned.to_csv('./facemasks/reviews_cleaned.csv')
reviews_df_cleaned.head()

Unnamed: 0,abuseCount,helpfulNo,helpfulYes,languageCode,postedDate,productId,ratingValue,reviewText
0,0,0,6,en-US,2021-01-27T09:04:10.569Z,103839,5,The mask quality and the color is good. It fit...
1,0,0,0,ru-RU,2021-02-07T00:56:39.055Z,103839,5,The grandson really liked it. Comfortable mask.
2,0,0,0,en-US,2021-02-06T21:40:02.886Z,103839,4,Easy to put on & comfortable to wear.
3,0,0,0,ru-RU,2021-02-05T16:29:28.906Z,103839,4,A thin mask that is pleasant to the body. I li...
4,0,0,0,ru-RU,2021-02-05T09:43:42.367Z,103839,5,Great mask! It suited me perfectly. There is a...
