In [8]:
# Read the product names from the CSV file. header = None tells pandas the file
# has no header row, so the columns get integer labels (a single column 0 here).
import pandas as pd
raw_text = pd.read_csv('Data/product_names.csv', header = None)
raw_text.head()

Unnamed: 0,0
0,Actiontec - MyWirelessTV2 Wireless Video Trans...
1,Turtle Beach - Ear Force XO SEVEN PRO Gaming H...
2,"Sony - Passive 3D Glasses - Black"""
3,Asus - WirelessAC1900 Dual-Band Gigabit Wirele...
4,"Sony - AM/FM Dual-Alarm Clock Radio - Black"""


Check the length of the text data.

In [9]:
# number of rows (one product name per row) that were read
len(raw_text)

2921

## Data preprocessing

In [35]:
# import necessary libraries
import os
import re 
import string
import csv 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/hankui/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [36]:
# define the stopword set and the function that removes numbers, punctuation
# and stopwords from product names
stopWords = set(stopwords.words('english'))

# Translation table that deletes every ASCII digit and every character in
# string.punctuation in a single pass.
_STRIP_TABLE = str.maketrans('', '', string.digits + string.punctuation)


def text_process(mess, stop_words=None):
    """
    Clean one piece of text and split it into lowercase tokens.

    Steps, in order:
    1. Delete all ASCII digits and all characters in ``string.punctuation``.
       The characters are deleted, not replaced by a space, so "AM/FM"
       becomes "amfm" and "Dual-Band" becomes "dualband".
    2. Split on whitespace and lowercase every token.
    3. Drop every token that is a stopword.

    Parameters
    ----------
    mess : str
        Raw text. ``None`` and float NaN (e.g. a missing value in a column
        read by pandas) are treated as empty text; any other non-string value
        is converted with ``str()`` before cleaning.
    stop_words : collection of str, optional
        Lowercase words to drop. When omitted, the module-level ``stopWords``
        set is used.

    Returns
    -------
    list of str
        The cleaned, lowercase tokens in their original order.
    """
    if stop_words is None:
        stop_words = stopWords

    if not isinstance(mess, str):
        # NaN is the only float that is not equal to itself.
        if mess is None or (isinstance(mess, float) and mess != mess):
            return []
        mess = str(mess)

    # remove digits and punctuation in one pass
    cleaned = mess.translate(_STRIP_TABLE)

    # lowercase once per token, then filter out the stopwords
    tokens = (word.lower() for word in cleaned.split())
    return [token for token in tokens if token not in stop_words]

In [39]:
# Tokenise every product name with text_process.
# The names live in the first (and only) column of raw_text.
names = raw_text.iloc[:, 0].to_numpy()

# the number of product names
N = len(names)

proc_text = [text_process(name) for name in names]

# preview the first five token lists
proc_text[:5]

[['actiontec',
  'mywirelesstv',
  'wireless',
  'video',
  'transmitter',
  'receiver',
  'black'],
 ['turtle',
  'beach',
  'ear',
  'force',
  'xo',
  'seven',
  'pro',
  'gaming',
  'headset',
  'xbox',
  'one',
  'blackgreen'],
 ['sony', 'passive', 'glasses', 'black'],
 ['asus', 'wirelessac', 'dualband', 'gigabit', 'wireless', 'router', 'black'],
 ['sony', 'amfm', 'dualalarm', 'clock', 'radio', 'black']]

## LDA

In [40]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/hankui/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [41]:
# build the id <-> token dictionary and print its first 11 entries
dictionary = gensim.corpora.Dictionary(proc_text)
count = 0
for token_id, token in dictionary.iteritems():
    # stop once 11 entries have been shown
    if count > 10:
        break
    print(token_id, token)
    count += 1

0 actiontec
1 black
2 mywirelesstv
3 receiver
4 transmitter
5 video
6 wireless
7 beach
8 blackgreen
9 ear
10 force


In [42]:
# bag-of-words corpus: one doc2bow() result per tokenised product name
bow_corpus = list(map(dictionary.doc2bow, proc_text))

# inspect one document as an example
bow_corpus[5]

[(31, 1), (32, 1), (33, 1), (34, 1)]

In [43]:
# run the LDA model
K = 5
# Pin the model's random_state (same value as the np.random.seed call above) so
# that re-running this cell starts the fit from the same random state instead
# of from wherever the global NumPy random stream currently is.
# NOTE: with workers > 1 the fit may still not be bit-for-bit identical between runs.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = K, id2word = dictionary, passes = 2, workers = 2, random_state = 2018)

In [45]:
# show the 3 highest-weighted words of each of the K topics
print(lda_model.print_topics(num_topics = K, num_words=3))

[(0, '0.041*"black" + 0.019*"wireless" + 0.012*"oz"'), (1, '0.023*"black" + 0.015*"oz" + 0.010*"wireless"'), (2, '0.023*"black" + 0.020*"amp" + 0.012*"drive"'), (3, '0.019*"black" + 0.013*"white" + 0.011*"oz"'), (4, '0.023*"wireless" + 0.023*"oz" + 0.019*"white"')]


In [65]:
# get the LDA labels: one 1-based topic label per product name
grps_lda = []

for bow in bow_corpus:

    # (topic_id, probability) pairs for this document. The cutoff is lowered
    # because the default lookup `lda_model[bow]` leaves out topics whose
    # probability is below the model's minimum_probability, which means a
    # position in that list is NOT a topic id.
    topic_probs = lda_model.get_document_topics(bow, minimum_probability = 0.0)

    # take the topic id stored in the most probable pair (not its list position)
    best_topic = max(topic_probs, key = lambda pair: pair[1])[0]

    # +1 so that the labels run from 1 to K
    grps_lda.append(best_topic + 1)


In [68]:
# preview the labels of the first five product names
grps_lda[:5]

[2, 4, 1, 1, 5]

In [71]:
# convert the labels to a dataframe first (for using to_csv function in the next step)
# NOTE: this rebinds grps_lda from a list to a single-column DataFrame
grps_lda = pd.DataFrame(grps_lda)

grps_lda.head()

Unnamed: 0,0
0,2
1,4
2,1
3,1
4,5


In [72]:
# output the numerical labels to a csv file, without the index column and without a header row
grps_lda.to_csv('grps_lda.csv', index = False, header = False)

## Combine the above steps to measure the running time

In [74]:
# Time the whole LDA pipeline: dictionary -> bag of words -> model fit -> labels.
# NOTE: this cell rebinds dictionary, bow_corpus, K, lda_model and grps_lda.
import datetime

begin_time = datetime.datetime.now()

# build the id <-> token dictionary from the tokenised product names
dictionary = gensim.corpora.Dictionary(proc_text)

# create the bag of words corpus
bow_corpus = [dictionary.doc2bow(doc) for doc in proc_text]

# run the LDA model
K = 5
# random_state is pinned (same value as the np.random.seed call above) so the
# fit does not depend on the current state of the global NumPy random stream.
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics = K, id2word = dictionary, passes = 2, workers = 2, random_state = 2018)

# get the LDA labels: one 1-based topic label per product name
grps_lda = []

for bow in bow_corpus:

    # (topic_id, probability) pairs for this document. The cutoff is lowered
    # because the default lookup `lda_model[bow]` leaves out topics whose
    # probability is below the model's minimum_probability, which means a
    # position in that list is NOT a topic id.
    topic_probs = lda_model.get_document_topics(bow, minimum_probability = 0.0)

    # take the topic id stored in the most probable pair (not its list position)
    best_topic = max(topic_probs, key = lambda pair: pair[1])[0]

    # +1 so that the labels run from 1 to K
    grps_lda.append(best_topic + 1)

# elapsed wall-clock time, printed as a timedelta
print(datetime.datetime.now() - begin_time)

0:00:02.154230
