In [558]:
import csv
import re
import time
import nltk
import string
import pickle
import pyLDAvis
import numpy as np
import pandas as pd
from itertools import chain
from nltk.stem import WordNetLemmatizer 
from nltk.tokenize import RegexpTokenizer

In [559]:
# Merge the three scraped product dictionaries into `combined`.
# Run this only once all of the data has been collected.
# NOTE: the loop below rebinds `product_dict`, which (per the original note)
# is defined earlier in the notebook as something else.
# SECURITY: pickle.load can execute arbitrary code -- only load pickle files
# that you produced yourself.

# `with` guarantees each file handle is closed as soon as it has been read.
with open('product_dict_374.pkl', 'rb') as pkl_file:
    product_dict_1 = pickle.load(pkl_file)
with open('product_dict_2000.pkl', 'rb') as pkl_file:
    product_dict_2 = pickle.load(pkl_file)
with open('product_dict_3000.pkl', 'rb') as pkl_file:
    product_dict_3 = pickle.load(pkl_file)

combined = {}
product_dict_list = [product_dict_1, product_dict_2, product_dict_3]

# dict.update: when a key appears in several dictionaries, the later one wins.
for product_dict in product_dict_list:
    combined.update(product_dict)

In [560]:
# Flatten `combined` into parallel lists, one entry per product,
# all in the dictionary's iteration order.
product_names = list(combined.keys())
records = list(combined.values())
brand_names = [record['brand_name'] for record in records]
description = [record['description'] for record in records]
rating = [record['rating'] for record in records]
review_count = [record['review_count'] for record in records]

In [561]:
# Assemble the product table in a single call; the column order follows
# the order of the keys below.
df = pd.DataFrame({
    'product_names': product_names,
    'brand_names': brand_names,
    'description': description,
    'rating': rating,
    'review_count': review_count,
})

In [562]:
# Number of rows in the product table.
len(df)

2376

In [563]:
# How many products carry each distinct rating value.
df.rating.value_counts()

0.0    865
5.0    415
4.7    210
4.5    143
4.0    140
4.6    123
4.8    121
4.4     66
4.3     52
4.9     31
3.0     29
4.2     28
4.1     15
3.5     15
3.8     14
2.0     14
3.7     13
1.0     11
3.9      8
3.3      7
3.6      4
2.3      2
2.5      2
3.4      2
3.2      1
2.7      1
Name: rating, dtype: int64

In [564]:
# Count of missing values in each column.
df.isnull().sum()

product_names      0
brand_names        0
description        0
rating            44
review_count     886
dtype: int64

In [565]:
# Keep only products rated strictly above 4.0. Rows whose rating is missing
# fail the comparison and are dropped as well.
# .copy() makes the result an independent frame, so later column assignments
# on `df` do not trigger pandas' SettingWithCopyWarning.
df = df[df.rating > 4.0].copy()

In [566]:
# Number of rows currently in `df`.
len(df)

1204

In [567]:
# Persist the current product table to disk.
with open('perfume_product.pkl', mode='wb') as product_file:
    pickle.dump(df, product_file)

In [568]:
# Take an independent copy so that reshaping df_copy does not alter `df`.
df_copy=df.copy()

In [569]:
# Build one display name per product in the form "<product> - <brand>".
# Both parts are read from df_copy itself (not from `df`), so the result does
# not depend on whatever state `df` is in when this cell runs.
df_copy['name'] = df_copy['product_names'].str.cat(df_copy['brand_names'], sep=' - ')
# Selecting the four wanted columns also discards product_names/brand_names;
# .copy() keeps later column assignments on df_copy free of
# SettingWithCopyWarning.
df_copy = df_copy[['name','description','rating','review_count']].copy()
df_copy.head()

Unnamed: 0,name,description,rating,review_count
0,Eternity - Calvin Klein,"Eternity Perfume by Calvin Klein, Escape into ...",4.7,849.0
1,Light Blue - Dolce & Gabbana,"Light Blue Perfume by Dolce & Gabbana, Light B...",4.7,1208.0
2,Angel - Thierry Mugler,"Angel Perfume by Thierry Mugler, Created in 19...",4.6,1057.0
3,Euphoria - Calvin Klein,"Euphoria Perfume by Calvin Klein, Euphoria, la...",4.7,1179.0
4,Obsession - Calvin Klein,"Obsession Perfume by Calvin Klein, Obsession i...",4.7,652.0


In [570]:
# Persist the reshaped table (name/description/rating/review_count) to disk.
with open('df.pkl', mode='wb') as table_file:
    pickle.dump(df_copy, table_file)

In [571]:
# Series holding the combined product names; preview the first five entries.
indices = pd.Series(df_copy['name'])
indices.head()

0         Eternity - Calvin Klein
1    Light Blue - Dolce & Gabbana
2          Angel - Thierry Mugler
3         Euphoria - Calvin Klein
4        Obsession - Calvin Klein
Name: name, dtype: object

In [572]:
# Preview the first rows of df_copy.
df_copy.head()

Unnamed: 0,name,description,rating,review_count
0,Eternity - Calvin Klein,"Eternity Perfume by Calvin Klein, Escape into ...",4.7,849.0
1,Light Blue - Dolce & Gabbana,"Light Blue Perfume by Dolce & Gabbana, Light B...",4.7,1208.0
2,Angel - Thierry Mugler,"Angel Perfume by Thierry Mugler, Created in 19...",4.6,1057.0
3,Euphoria - Calvin Klein,"Euphoria Perfume by Calvin Klein, Euphoria, la...",4.7,1179.0
4,Obsession - Calvin Klein,"Obsession Perfume by Calvin Klein, Obsession i...",4.7,652.0


## Text Pre-processing

In [573]:
# Shared tokenizer: extracts runs of word characters (\w+), so punctuation
# and whitespace act as separators between tokens.
tokenizer = RegexpTokenizer(r'\w+')

In [574]:
def remove_punctuation(text):
    r"""Return ``text`` with punctuation removed and words separated by single spaces.

    The text is split into runs of word characters (regex ``\w+``). That
    discards every punctuation mark except the underscore, because ``\w``
    treats ``_`` as a word character. Tokens made up solely of punctuation
    characters (in practice, underscores) are then dropped; underscores inside
    a word such as ``snake_case`` are kept.

    Args:
        text: The string to clean.

    Returns:
        The remaining tokens joined by single spaces ('' if none remain).
    """
    punctuation_chars = set(string.punctuation)
    # re.findall with \w+ yields the same tokens as RegexpTokenizer(r'\w+')
    # without constructing a tokenizer object on every call.
    tokens = re.findall(r'\w+', text)
    # The earlier test `c not in string.punctuation` was a substring check
    # against the punctuation string, so it only caught a lone "_". Compare
    # character by character instead.
    kept = [tok for tok in tokens
            if not all(ch in punctuation_chars for ch in tok)]
    return " ".join(kept)

In [575]:
def make_lower_case(text):
    """Return a lower-cased copy of ``text``."""
    return text.lower()

In [576]:
# Normalise brand names, product names and descriptions: strip punctuation,
# then lower-case.
# `df` was produced by boolean filtering, and assigning columns on such a
# frame can raise SettingWithCopyWarning. assign() sidesteps that by returning
# a new frame with the replaced columns (column order is unchanged).
df = df.assign(
    brand_names=df['brand_names'].apply(remove_punctuation).apply(make_lower_case),
    product_names=df['product_names'].apply(remove_punctuation).apply(make_lower_case),
)
df_copy = df_copy.assign(
    description=df_copy['description'].apply(remove_punctuation).apply(make_lower_case),
)

In [577]:
# Tokenise every brand name and every product name into a list of words
# (one inner list per row of `df`).
brand_list = [tokenizer.tokenize(brand) for brand in df.brand_names]

product_list = [tokenizer.tokenize(product) for product in df.product_names]

In [578]:
# Stop-word list = NLTK's English stop words
#                + every distinct word found in a brand or product name
#                + the extra words listed in `new_stopword`.
stopwords = nltk.corpus.stopwords.words('english')
new_stopword=['perfume','cosmetic','skincare','famous', 'green','many','italian','base','sensual','black','red','new',
              'top','launched','designed','american','french','secret','main','moderate','true','wear',
             'united','beautiful','unique','numerous','aromatic','open','scent','bottle','middle','note']

# Flatten the per-row token lists and de-duplicate them before appending.
brand_vocab = set(chain.from_iterable(brand_list))
product_vocab = set(chain.from_iterable(product_list))
stopwords.extend(brand_vocab)
stopwords.extend(product_vocab)
stopwords.extend(new_stopword)

In [579]:
def remove_stop_words(text):
    """Drop every token of ``text`` that appears in the global ``stopwords`` list.

    The text is tokenised with the global ``tokenizer``; the surviving tokens
    are joined back together with single spaces.
    """
    kept = [token for token in tokenizer.tokenize(text) if token not in stopwords]
    return " ".join(kept)

In [580]:
def lemma_words(text):
    """Lemmatize each whitespace-separated word of ``text``.

    Every word is passed through ``WordNetLemmatizer.lemmatize`` and the
    results are joined back together with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    return " ".join(map(lemmatizer.lemmatize, text.split()))

In [581]:
# Strip digits and collapse surplus whitespace.
def remove_num(text):
    """Delete every run of digits from ``text``, then squeeze whitespace.

    Leading/trailing whitespace is removed and internal runs of whitespace
    become a single space.
    """
    without_digits = re.sub(r'\d+','', text)
    return " ".join(without_digits.split())

In [582]:
def remove_sentence(text):
    """Remove the authenticity disclaimer sentence (including its leading space) from ``text``."""
    disclaimer = ' all products are original authentic name brands we do not sell knockoffs or imitations'
    return text.replace(disclaimer, '')

In [583]:
# Inspect the first description. Positional access (.iloc) is used because
# the rating filter leaves gaps in the index labels, so a label lookup of 0
# only works while the product labelled 0 survives the filter.
df_copy['description'].iloc[0]

'eternity perfume by calvin klein escape into the eternal bliss of the luscious eternity a delectable women s fragrance by calvin klein this majestic perfume combines floral green and aromatic accords for a tantalizing scent that will keep you feeling refreshed and exhilarated for hours after splashing it on your skin top notes of sweet mandarin orange freesia sage and a variety of citrus and green elements introduce the aroma with a powerfully zesty and energetic atmosphere that awakensthe senses middle notes of marigold lily of the valley narcissus jasmine rose violet carnation and lily infuse the elixir with a decadent floral bouquet you can t possibly ignore finally base notes of purple heliotrope patchouli golden amber sandalwood and white musk complete the fragrance for a seductive enchanting perfume you ll relish in sharing with everyone around you created by expert perfumer sophia grojsman this 1988 signature scent was launched by the illustrious american fashion designer calvi

In [584]:
# Strip the disclaimer sentence from every description.
df_copy['description'] = df_copy['description'].apply(remove_sentence)

In [585]:
# Remove stop words from every description.
df_copy['description'] = df_copy['description'].apply(remove_stop_words)

In [586]:
# Lemmatize every word of every description.
df_copy['description'] = df_copy['description'].apply(lemma_words)

In [587]:
# Delete digits and collapse extra whitespace in every description.
df_copy['description'] = df_copy['description'].apply(remove_num)

In [588]:
def kepp_adj(text):
    """Keep only the words of ``text`` that NLTK tags as ``JJ`` (adjective).

    The text is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``. Only words whose tag is exactly ``JJ`` are kept; tags
    that merely start with ``JJ`` are excluded, which matches the filter the
    previous implementation actually applied (its ``startswith('JJ')`` result
    was immediately overwritten and never used).

    The function name keeps its original spelling because callers refer to it.

    Args:
        text: The string to filter.

    Returns:
        The retained words joined by single spaces ('' if none are kept).
    """
    tagged_words = nltk.pos_tag(nltk.word_tokenize(text))
    # Distinct loop names avoid shadowing the tagged list, unlike the former
    # `for word, pos in pos`.
    adjectives = [word for word, tag in tagged_words if tag == 'JJ']
    return " ".join(adjectives)

In [589]:
# Reduce every description to its adjectives.
df_copy['description'] = df_copy['description'].apply(kepp_adj)

In [590]:
# Second stop-word pass -- presumably to catch words that only match a stop
# word after lemmatization (TODO confirm).
df_copy['description'] = df_copy['description'].apply(remove_stop_words)

In [591]:
# Persist the processed description column to disk.
with open('df_description.pkl', mode='wb') as description_file:
    pickle.dump(df_copy.description, description_file)

In [63]:
# perfume_vis_data = pyLDAvis.prepare(**df_copy.description)