## Step 1：Data Preprocessing

In [None]:
import nltk
import pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# NLTK data needed by the preprocessing helpers in this notebook.
# 'punkt_tab' and 'averaged_perceptron_tagger_eng' are the resource names
# that word_tokenize / pos_tag look up on NLTK >= 3.9; the legacy names are
# kept so that older NLTK releases keep working as well.
NLTK_RESOURCES = (
    'stopwords',
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/lichihsu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/lichihsu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/lichihsu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/lichihsu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
def remove_stopwords(text):
    """Drop English stop words and single-character tokens from ``text``.

    The text is split on whitespace only and each token is compared to the
    stop-word list exactly, so a stop word with punctuation attached
    (e.g. ``"the,"``) is kept.

    Args:
        text: String to filter.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Load the stop-word list once and reuse it on later calls; rebuilding
    # the set on every call means re-reading the corpus once per row when
    # this function is used with Series.apply.
    stops = getattr(remove_stopwords, "_stops", None)
    if stops is None:
        stops = remove_stopwords._stops = frozenset(stopwords.words("english"))
    return " ".join(w for w in text.split() if w not in stops and len(w) != 1)

def remove_punctuation(text):
    """Keep only the word-character runs of ``text``, joined by spaces.

    ``text`` is passed through ``str()`` first, so non-string values are
    processed as their string form instead of raising.

    Args:
        text: Value whose string form should be cleaned.

    Returns:
        The tokens matched by the word-character pattern, joined by single
        spaces.
    """
    word_tokens = RegexpTokenizer(r'\w+').tokenize(str(text))
    return " ".join(word_tokens)

def get_wordnet_pos(word):
    """Return the WordNet POS constant to hand to ``lemmatize()`` for ``word``.

    The word is tagged on its own (a one-element list is passed to
    ``pos_tag``), so the tagger sees no surrounding context. Only the first
    letter of the resulting tag is inspected: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    tagged = pos_tag([word])
    first_letter = tagged[0][1][0].upper()
    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unmapped tag resolve to noun.
    return wordnet.NOUN

def lemmatize_word(text):
    """Lemmatize every token of ``text`` and return them space-joined.

    Tokens come from ``nltk.word_tokenize``; each one is lemmatized with the
    part of speech reported by ``get_wordnet_pos``. The function uses the
    module-level ``lemmatizer`` object, which therefore has to be defined
    before the first call.
    """
    lemmas = []
    for token in nltk.word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return " ".join(lemmas)

def Preprocessing(df):
    """Normalise the ``text`` column of ``df`` in place and return ``df``.

    The column is lower-cased, then passed row by row through stop-word
    removal, punctuation removal and lemmatization, in that order.

    Args:
        df: DataFrame with a string ``text`` column; it is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    df.loc[:, 'text'] = df.loc[:, 'text'].str.lower()
    # Order matters: stop words are removed before punctuation is stripped.
    for step in (remove_stopwords, remove_punctuation, lemmatize_word):
        df.loc[:, 'text'] = df.loc[:, 'text'].apply(step)
    return df

In [None]:
# Load the raw product descriptions; a missing description becomes a single
# space so the string-based preprocessing steps never receive NaN.
df = (
    pd.read_csv("product_info_text_48962.csv")
    .assign(text=lambda raw: raw['product_info'].str.lower().fillna(" "))
)

# lemmatize_word() looks this object up as a global, so create it before
# running the pipeline.
lemmatizer = WordNetLemmatizer()
Preprocessing(df)

Unnamed: 0,product_id,product_name,product_info,text
0,1,Chocolate Sandwich Cookies,One box with 18 snack packs 2 cookies per pack...,one box with 18 snack pack 2 cooky per pack of...
1,2,All-Seasons Salt,Flavorful alternative to table salt Great for ...,flavorful alternative to table salt great for ...
2,3,Robust Golden Unsweetened Oolong Tea,These Lipton Iced Tea bags let you easily brew...,these lipton iced tea bag let you easily brew ...
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,One 9 oz box of Smart Ones Three Cheese Ziti W...,one 9 oz box of smart one three cheese ziti wi...
4,5,Green Chile Anytime Sauce,Packed in USA Packed in USA Classic pasta sauc...,packed in usa packed in usa classic pasta sauc...
...,...,...,...,...
48957,49684,"Vodka, Triple Distilled, Twist of Vanilla",Ten times filtered Distilled from premium grai...,ten time filter distil from premium grain 40 a...
48958,49685,En Croute Roast Hazelnut Cranberry,Ground coffee beans Light roast 100 arabica Wo...,ground coffee bean light roast 100 arabica wor...
48959,49686,Artisan Baguette,Take and bake french baguette loaves Chewy and...,take and bake french baguette loaf chewy and r...
48960,49687,Smartblend Healthy Metabolism Dry Cat Food,Real turkey is the 1 ingredient Natural plus e...,real turkey be the 1 ingredient natural plus e...


In [None]:
# Persist the preprocessed frame (without the row index) so that Step 2 can
# reload it instead of re-running the preprocessing above.
df.to_csv("product_lemmatized_48962.csv", index=False)

## Step 2：讀取資料

In [None]:
# Load the preprocessed text data
df = pd.read_csv("product_lemmatized_48962.csv")

In [None]:
# Prepare the custom stop-word list (one entry per line of stop_words.txt).
with open("stop_words.txt", 'r', encoding='utf-8') as file:
    # Strip surrounding whitespace and skip blank lines: an entry such as
    # "the " would never match a token, and the trailing newline of the file
    # would otherwise add an empty-string stop word.
    stop_words = [line.strip() for line in file if line.strip()]

## Step 3：Document-Term Matrix

In [None]:
# Texts that were empty after preprocessing are read back from the CSV as
# NaN. Converting NaN with astype('U') yields the literal string "nan", which
# would be counted as a real word, so map missing texts to empty documents.
docs = df['text'].fillna('').astype(str).tolist()

# Bag-of-words counts: drop the custom stop words, ignore terms found in more
# than 85% of the documents or in fewer than 10 documents, and keep at most
# the 10,000 most frequent remaining terms.
cv = CountVectorizer(max_df=0.85, stop_words=stop_words, max_features=10000, min_df=10)
word_count_vector = cv.fit_transform(docs)

In [None]:
# Sanity check: (number of documents, number of vocabulary terms kept)
word_count_vector.shape

(48962, 10000)

In [None]:
# Compute the TF-IDF vector of every product
transformer = TfidfTransformer()

# X and tfidf are two names for the same matrix returned by fit_transform
X = tfidf = transformer.fit_transform(word_count_vector)

# Dense copy of the matrix, used to build the per-term DataFrame
weight = X.toarray()

In [None]:
# Sanity check: shape of the TF-IDF matrix (rows x columns)
X.shape

(48962, 10000)

In [None]:
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() and fall back to the old name
# only on releases that predate it.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

# One row per document, one column per vocabulary term, values = TF-IDF weight
final_df = pd.DataFrame(data=weight, columns=feature_names)

In [None]:
# The 10 terms with the highest TF-IDF weight in document 0.
# Select the single row before transposing: transposing the whole frame would
# copy the entire dense matrix just to rank one column, and would display
# the weights of every other document alongside the result.
final_df.iloc[[0]].T.nlargest(10, 0)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48952,48953,48954,48955,48956,48957,48958,48959,48960,48961
cooky,0.608061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.380142,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
chocolate,0.334646,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.363367,0.0,0.0,0.0,0.0,0.0,0.085111,0.0,0.0,0.0
dunkable,0.316578,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sandwich,0.2905,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.025092,0.0,0.0,0.0,0.0,0.138531,0.084843,0.0,0.0
creme,0.248562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
wafer,0.182525,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.210201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
stuf,0.175201,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cookie,0.165294,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.02225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
snack,0.154512,0.037681,0.0,0.0,0.0,0.0,0.0,0.0,0.034732,0.0,...,0.120411,0.0,0.0,0.0,0.066668,0.0,0.093072,0.0,0.0,0.0
supremely,0.145594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Step 4：LSA Topic Modeling

In [None]:
from sklearn.decomposition import TruncatedSVD

# LSA: truncated SVD of the TF-IDF matrix represents documents and terms as
# vectors in a 100-component space. random_state is fixed for reproducibility.
svd_model = TruncatedSVD(
    n_components=100,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
).fit(X)

# Number of fitted components (one row of components_ per topic)
svd_model.components_.shape[0]

100

In [None]:
# Collect and print the 10 highest-loading terms of every LSA component.
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() (converted to a list of plain str) and fall back to
# the old method only on releases that predate it.
terms = (
    cv.get_feature_names_out().tolist()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

result = {}  # topic index (as str) -> list of its top-10 terms
for i, comp in enumerate(svd_model.components_):
    # Pair each term with its loading on this component and keep the 10 largest.
    sorted_terms = sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:10]
    topic_word = [term for term, _ in sorted_terms]

    print("Topic "+str(i)+": ")
    for term in topic_word:
        print(term, " ")
    print("\n")
    result[str(i)] = topic_word

Topic 0: 
organic  
flavor  
cheese  
snack  
free  
taste  
ingredient  
add  
great  
delicious  


Topic 1: 
chocolate  
bar  
candy  
ice  
cream  
milk  
cooky  
snack  
skin  
tea  


Topic 2: 
tea  
skin  
baby  
hair  
organic  
vitamin  
wash  
body  
help  
clean  


Topic 3: 
skin  
chocolate  
sauce  
cream  
hair  
pasta  
ice  
oil  
wash  
candy  


Topic 4: 
tea  
coffee  
ice  
iced  
sauce  
green  
drink  
cup  
blend  
brew  


Topic 5: 
cheese  
tea  
shred  
slice  
cheddar  
cream  
cracker  
ice  
great  
salad  


Topic 6: 
tea  
cheese  
grain  
cracker  
bar  
nature  
free  
cheddar  
oat  
gluten  


Topic 7: 
dog  
snack  
beef  
turkey  
chip  
chicken  
mayer  
oscar  
flavor  
grain  


Topic 8: 
yogurt  
dog  
greek  
cream  
protein  
free  
milk  
coffee  
vitamin  
dairy  


Topic 9: 
dog  
baby  
chocolate  
candy  
chicken  
cat  
turkey  
tea  
food  
mayer  


Topic 10: 
pasta  
sauce  
bar  
tomato  
nature  
vitamin  
energy  
italian  
yogurt

Topic 89: 
waffle  
pancake  
pain  
barilla  
hand  
soap  
tortilla  
honey  
protein  
toothpaste  


Topic 90: 
shave  
razor  
blade  
gel  
vegetable  
egg  
marinade  
venus  
gum  
glide  


Topic 91: 
knorr  
morningstar  
pancake  
pain  
fruit  
side  
honey  
mix  
shave  
chip  


Topic 92: 
potato  
popcorn  
pepper  
pudding  
marinade  
salsa  
tuna  
flour  
milk  
red  


Topic 93: 
ginger  
waffle  
beer  
fillet  
seed  
cola  
knorr  
pad  
pancake  
sunflower  


Topic 94: 
knorr  
shave  
earth  
marinade  
beer  
egg  
pudding  
gluten  
cola  
chang  


Topic 95: 
shave  
razor  
noodle  
toothpaste  
trash  
olive  
pudding  
jell  
macaroni  
greek  


Topic 96: 
body  
ginger  
beer  
marinade  
cola  
energy  
shred  
ale  
archer  
pad  


Topic 97: 
gelato  
energy  
beef  
talenti  
gatorade  
tortilla  
mustard  
milk  
slice  
cake  


Topic 98: 
smucker  
flour  
hand  
kid  
shave  
quaker  
cranberry  
morningstar  
gatorade  
spread  


Topic 99: 


In [None]:
# One column per topic, one row per rank (0 = highest-loading term)
pd.DataFrame(result)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,organic,chocolate,tea,skin,tea,cheese,tea,dog,yogurt,dog,...,shave,knorr,potato,ginger,knorr,shave,body,gelato,smucker,spray
1,flavor,bar,skin,chocolate,coffee,tea,cheese,snack,dog,baby,...,razor,morningstar,popcorn,waffle,shave,razor,ginger,energy,flour,cranberry
2,cheese,candy,baby,sauce,ice,shred,grain,beef,greek,chocolate,...,blade,pancake,pepper,beer,earth,noodle,beer,beef,hand,salt
3,snack,ice,hair,cream,iced,slice,cracker,turkey,cream,candy,...,gel,pain,pudding,fillet,marinade,toothpaste,marinade,talenti,kid,pudding
4,free,cream,organic,hair,sauce,cheddar,bar,chip,protein,chicken,...,vegetable,fruit,marinade,seed,beer,trash,cola,gatorade,shave,gum
5,taste,milk,vitamin,pasta,green,cream,nature,chicken,free,cat,...,egg,side,salsa,cola,egg,olive,energy,tortilla,quaker,food
6,ingredient,cooky,wash,ice,drink,cracker,free,mayer,milk,turkey,...,marinade,honey,tuna,knorr,pudding,pudding,shred,mustard,cranberry,cooler
7,add,snack,body,oil,cup,ice,cheddar,oscar,coffee,tea,...,venus,mix,flour,pad,gluten,jell,ale,milk,morningstar,jell
8,great,skin,help,wash,blend,great,oat,flavor,vitamin,food,...,gum,shave,milk,pancake,cola,macaroni,archer,slice,gatorade,ocean
9,delicious,tea,clean,candy,brew,salad,gluten,grain,dairy,mayer,...,glide,chip,red,sunflower,chang,greek,pad,cake,spread,sea
