##### Library

In [1]:
import nltk
# Fetch the NLTK resources this notebook relies on for tokenizing, stop-word
# filtering, lemmatizing and POS tagging. Each call is a no-op when the package
# is already up to date locally.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\samle\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
import pandas as pd
import re

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

##### Read Data

In [3]:
# Directory (relative to this notebook) that holds the gzipped JSON inputs read in the next cell.
source_data_root_path = '../../data/interim/final'

In [4]:
# All three inputs are gzipped, record-oriented JSON files, so they share the
# same reader options.
_read_opts = dict(orient="records", compression="gzip")

df_products = pd.read_json(
    f'{source_data_root_path}/products.json.gz', **_read_opts
)
df_products_categories = pd.read_json(
    f'{source_data_root_path}/products_categories.json.gz', **_read_opts
)
df_categories = pd.read_json(
    f'{source_data_root_path}/categories.json.gz', **_read_opts
)

##### Extract Categories for Products

In [5]:
# Preview the product-to-category link rows as loaded.
df_products_categories.head()

Unnamed: 0,product_id,category_id
0,B00004TLW2,2
1,B00004VUM1,2
2,B00004XSHN,2
3,B000051TOG,2
4,B0000520I5,2


In [6]:
# Attach category attributes to every product/category link row and normalise
# the category name to lowercase. `validate` makes the merge fail if a
# category_id appears more than once in df_categories.
df_products_categories_2 = (
    df_products_categories
    .merge(df_categories, how="inner", on="category_id", validate="many_to_one")
    .assign(name=lambda linked: linked['name'].str.lower())
)

In [7]:
# Preview the link rows enriched with the lowercase category name.
df_products_categories_2.head()

Unnamed: 0,product_id,category_id,name,parent_id
0,B00004TLW2,2,cameras,1.0
1,B00004VUM1,2,cameras,1.0
2,B00004XSHN,2,cameras,1.0
3,B000051TOG,2,cameras,1.0
4,B0000520I5,2,cameras,1.0


In [8]:
# Collapse to one row per product, gathering its category names into a list
# stored in the `categories` column.
df_products_categories_final = (
    df_products_categories_2
    .groupby('product_id')['name']
    .apply(list)
    .reset_index(name='categories')
)

In [9]:
# Preview the per-product category lists.
df_products_categories_final.head()

Unnamed: 0,product_id,categories
0,B00001W0DG,[headphones]
1,B00004TLW2,[cameras]
2,B00004VUM1,[cameras]
3,B00004WFYN,[headphones]
4,B00004XSHN,[cameras]


In [10]:
# Join each product with its list of categories.
# - `price` and `image_url` are dropped because they are not used below.
# - how="inner" means products without any category row are left out.
# - df_products_categories_final has exactly one row per product_id (it is the
#   result of a groupby on that key), so the relationship is one-to-one;
#   validating it as such also catches duplicated product_ids in df_products.
df_products_final = df_products.drop(
    columns=['price', 'image_url']
).merge(
    df_products_categories_final, how="inner", on="product_id", validate="one_to_one"
)

In [11]:
# Preview the products joined with their category lists.
df_products_final.head()

Unnamed: 0,product_id,name,description,categories
0,B00001W0DG,Sony MDR-V500DJ Monitor Series Headphones with...,Revel in high-quality audio with the MDR-V500D...,[headphones]
1,B00004TLW2,Fujifilm MX2900 2.3MP Digital Camera w/ 3x Opt...,The FujiFilm MX-2900 digital camera includes s...,[cameras]
2,B00004VUM1,Sony MVC-FD95 Mavica 2MP Digital Camera with 1...,w/ Canon SELPHY CP760 Compact Photo Printer 32...,[cameras]
3,B00004WFYN,Plantronics H141 Duoset Convertible Headset (D...,- Convertible headset<br />- Quick disconnect ...,[headphones]
4,B00004XSHN,Fujifilm FinePix 4900 4.3MP Digital Camera w/ ...,Fuji's FinePix 4900 is one of a new style of c...,[cameras]


##### Clean Data

In [12]:
def lowercase_text(text: str):
    """Return a lowercase copy of ``text``."""
    lowered = text.lower()
    return lowered

# Compiled once: matches either an HTML tag (`<...>`, non-greedy, single line)
# or a character entity such as `&amp;`, `&#39;` or `&#x27;`.
_HTML_TAG_OR_ENTITY_RE = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')


def remove_html_tags(text: str):
    """Strip HTML tags and character entities from ``text``.

    Tags are replaced by a single space so that words separated only by
    markup (e.g. ``headset<br />quick``) are not fused into one token.
    Entities are deleted outright.

    The entity alternatives only match lowercase names and hex digits, so the
    input is expected to be lowercased first (``clean_data`` does this).

    Args:
        text: The string to clean.

    Returns:
        ``text`` with tags turned into spaces and entities removed. Runs of
        whitespace are not collapsed here.
    """
    def _replacement(match):
        # A match starting with '<' is a tag; anything else is an entity.
        return ' ' if match.group(0).startswith('<') else ''

    return _HTML_TAG_OR_ENTITY_RE.sub(_replacement, text)

def remove_url(text: str):
    """Delete URLs from ``text``.

    Anything starting with ``http://``, ``https://`` or ``www.`` is removed up
    to the next whitespace character. Surrounding whitespace is left as is.
    """
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub('', text)

# Raw string literals: the original non-raw pattern contained `\[`, `\]` and
# `\|`, which are invalid string escape sequences that newer Python versions
# warn about. Compiling once here also avoids recompiling on every call.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')


def remove_noise(text: str):
    """Normalise punctuation and drop characters outside a small whitelist.

    Two passes are applied in order:

    1. Each of ``/ ( ) { } [ ] | @ , ;`` becomes a space.
    2. Every character that is not a digit, a lowercase ``a-z`` letter, a
       space, ``#``, ``+`` or ``_`` is deleted.

    Because the second pass deletes rather than replaces, characters such as
    ``.`` and ``-`` join their neighbours (``2.3mp`` -> ``23mp``), and
    uppercase letters are removed, so input should already be lowercase.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Runs of spaces are not collapsed.
    """
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)
    text = _BAD_SYMBOLS_RE.sub('', text)
    return text

# Filled lazily on first use, so defining this cell does not require the NLTK
# stop-word corpus to be loaded yet.
_STOPWORDS_CACHE = {}


def _english_stopwords():
    """Return the English stop-word set, loading it from NLTK only once."""
    if 'english' not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORDS_CACHE['english']


def remove_stopwords(text: str):
    """Remove English stop words from ``text``.

    The text is tokenized with ``word_tokenize`` and the remaining tokens are
    joined with single spaces. Matching is case-sensitive against the
    stop-word list, so input should already be lowercase.

    The stop-word set is cached: the previous version rebuilt it on every
    call, i.e. once per row when used through ``Series.apply``.

    Args:
        text: The string to filter.

    Returns:
        The space-joined tokens that are not stop words.
    """
    stop_set = _english_stopwords()
    return ' '.join([token for token in word_tokenize(text) if token not in stop_set])

def remove_n_chars(text: str, n = 1):
    """Drop whitespace-separated words that are ``n`` characters long or shorter.

    With the default ``n = 1`` only single-character words are removed. The
    surviving words are joined with single spaces.
    """
    kept = []
    for word in text.split():
        if len(word) <= n:
            continue
        kept.append(word)
    return ' '.join(kept)

def clean_data(text: str):
    """Run the full text-cleaning pipeline on ``text``.

    The steps are applied in this order, each receiving the previous step's
    output: lowercase, strip HTML tags/entities, strip URLs, normalise
    punctuation, remove stop words, and drop one-character words.
    """
    pipeline = (
        lowercase_text,
        remove_html_tags,
        remove_url,
        remove_noise,
        remove_stopwords,
        remove_n_chars,
    )
    for step in pipeline:
        text = step(text)
    return text

In [13]:
# Take an explicit copy: the columns added below would otherwise be written to
# a column-subset of df_products_final, which triggers pandas'
# SettingWithCopyWarning.
df_preprocessing = df_products_final[['name', 'description', 'categories']].copy()

df_preprocessing['cleaned_name'] = df_preprocessing['name'].apply(clean_data)
df_preprocessing['cleaned_description'] = df_preprocessing['description'].apply(clean_data)

# The model input is the cleaned name followed by the cleaned description.
df_preprocessing['keywords'] = df_preprocessing['cleaned_name'] + ' ' + df_preprocessing['cleaned_description']
# Copy again because later cells add further columns to this frame.
df_preprocessing = df_preprocessing[['keywords', 'categories']].copy()

In [14]:
# Preview the cleaned keywords next to the category lists.
df_preprocessing.head()

Unnamed: 0,keywords,categories
0,sony mdrv500dj monitor series headphones swive...,[headphones]
1,fujifilm mx2900 23mp digital camera 3x optical...,[cameras]
2,sony mvcfd95 mavica 2mp digital camera 10x opt...,[cameras]
3,plantronics h141 duoset convertible headset di...,[headphones]
4,fujifilm finepix 4900 43mp digital camera 6x o...,[cameras]


##### POS Tagging

In [15]:
# Lookup from the first character of a tag returned by `pos_tag` to the
# matching WordNet part-of-speech constant. Tags starting with any other
# character have no entry.
pos_dict = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV,
}


def tag_pos(text):
    """Tokenize ``text`` and pair each token with a WordNet POS constant.

    Returns a list of ``(token, pos)`` tuples, where ``pos`` is looked up in
    ``pos_dict`` by the first character of the token's tag and is ``None``
    when that character has no entry.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    return [
        (token, pos_dict.get(tagger_tag[0]))
        for token, tagger_tag in tagged_tokens
    ]

In [16]:
# Tag every keyword string; each cell becomes a list of (token, pos) tuples.
df_preprocessing['pos_tagged_keywords'] = df_preprocessing['keywords'].apply(tag_pos)

In [17]:
# Preview the POS-tagged keywords.
df_preprocessing.head()

Unnamed: 0,keywords,categories,pos_tagged_keywords
0,sony mdrv500dj monitor series headphones swive...,[headphones],"[(sony, n), (mdrv500dj, n), (monitor, n), (ser..."
1,fujifilm mx2900 23mp digital camera 3x optical...,[cameras],"[(fujifilm, n), (mx2900, v), (23mp, None), (di..."
2,sony mvcfd95 mavica 2mp digital camera 10x opt...,[cameras],"[(sony, n), (mvcfd95, n), (mavica, v), (2mp, N..."
3,plantronics h141 duoset convertible headset di...,[headphones],"[(plantronics, n), (h141, v), (duoset, v), (co..."
4,fujifilm finepix 4900 43mp digital camera 6x o...,[cameras],"[(fujifilm, n), (finepix, n), (4900, None), (4..."


In [18]:
# Single lemmatizer instance shared by every call below.
lemmatizer = WordNetLemmatizer()


def lemmatize_pos_words(pos_words):
    """Lemmatize ``(word, pos)`` pairs and join the results into one string.

    Words whose ``pos`` is ``None`` are kept unchanged; all others are passed
    to the lemmatizer together with their POS. The output is space-separated
    and keeps the input order.
    """
    return " ".join(
        word if tag is None else lemmatizer.lemmatize(word, tag)
        for word, tag in pos_words
    )

In [19]:
# Lemmatize the tagged tokens of every row back into a single keyword string.
df_preprocessing['lemmatized_keywords'] = df_preprocessing['pos_tagged_keywords'].apply(lemmatize_pos_words)

In [20]:
# Preview the lemmatized keywords next to the intermediate columns.
df_preprocessing.head()

Unnamed: 0,keywords,categories,pos_tagged_keywords,lemmatized_keywords
0,sony mdrv500dj monitor series headphones swive...,[headphones],"[(sony, n), (mdrv500dj, n), (monitor, n), (ser...",sony mdrv500dj monitor series headphone swivel...
1,fujifilm mx2900 23mp digital camera 3x optical...,[cameras],"[(fujifilm, n), (mx2900, v), (23mp, None), (di...",fujifilm mx2900 23mp digital camera 3x optical...
2,sony mvcfd95 mavica 2mp digital camera 10x opt...,[cameras],"[(sony, n), (mvcfd95, n), (mavica, v), (2mp, N...",sony mvcfd95 mavica 2mp digital camera 10x opt...
3,plantronics h141 duoset convertible headset di...,[headphones],"[(plantronics, n), (h141, v), (duoset, v), (co...",plantronics h141 duoset convertible headset di...
4,fujifilm finepix 4900 43mp digital camera 6x o...,[cameras],"[(fujifilm, n), (finepix, n), (4900, None), (4...",fujifilm finepix 4900 43mp digital camera 6x o...


In [21]:
# Export only the lemmatized text (renamed to `keywords`) and the category
# lists; the intermediate columns are not written.
products_out = df_preprocessing[["lemmatized_keywords", "categories"]].rename(
    columns={'lemmatized_keywords':'keywords'}
)
products_out.to_json(
    '../../data/processed/products.json.gz',
    compression="gzip",
    orient="records",
    indent=2,
)